diff --git a/tools/uvr5/lib/lib_v5/dataset.py b/tools/uvr5/lib/lib_v5/dataset.py
new file mode 100644
index 0000000..cfd01a1
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/dataset.py
@@ -0,0 +1,183 @@
+import os
+import random
+
+import numpy as np
+import torch
+import torch.utils.data
+from tqdm import tqdm
+
+from . import spec_utils
+
+
+class VocalRemoverValidationSet(torch.utils.data.Dataset):
+ def __init__(self, patch_list):
+ self.patch_list = patch_list
+
+ def __len__(self):
+ return len(self.patch_list)
+
+ def __getitem__(self, idx):
+ path = self.patch_list[idx]
+ data = np.load(path)
+
+ X, y = data["X"], data["y"]
+
+ X_mag = np.abs(X)
+ y_mag = np.abs(y)
+
+ return X_mag, y_mag
+
+
+def make_pair(mix_dir, inst_dir):
+ input_exts = [".wav", ".m4a", ".mp3", ".mp4", ".flac"]
+
+ X_list = sorted(
+ [
+ os.path.join(mix_dir, fname)
+ for fname in os.listdir(mix_dir)
+ if os.path.splitext(fname)[1] in input_exts
+ ]
+ )
+ y_list = sorted(
+ [
+ os.path.join(inst_dir, fname)
+ for fname in os.listdir(inst_dir)
+ if os.path.splitext(fname)[1] in input_exts
+ ]
+ )
+
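+    # NB: pairing is positional after sorting, so mixtures/ and instruments/
+    # must hold matching file names; zip() silently truncates any extras.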
+ filelist = list(zip(X_list, y_list))
+
+ return filelist
+
+
+def train_val_split(dataset_dir, split_mode, val_rate, val_filelist):
+ if split_mode == "random":
+ filelist = make_pair(
+ os.path.join(dataset_dir, "mixtures"),
+ os.path.join(dataset_dir, "instruments"),
+ )
+
+ random.shuffle(filelist)
+
+ if len(val_filelist) == 0:
+ val_size = int(len(filelist) * val_rate)
+ train_filelist = filelist[:-val_size]
+ val_filelist = filelist[-val_size:]
+ else:
+ train_filelist = [
+ pair for pair in filelist if list(pair) not in val_filelist
+ ]
+ elif split_mode == "subdirs":
+ if len(val_filelist) != 0:
+ raise ValueError(
+ "The `val_filelist` option is not available in `subdirs` mode"
+ )
+
+ train_filelist = make_pair(
+ os.path.join(dataset_dir, "training/mixtures"),
+ os.path.join(dataset_dir, "training/instruments"),
+ )
+
+ val_filelist = make_pair(
+ os.path.join(dataset_dir, "validation/mixtures"),
+ os.path.join(dataset_dir, "validation/instruments"),
+ )
+
+ return train_filelist, val_filelist
+
+
+def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha):
+ perm = np.random.permutation(len(X))
+ for i, idx in enumerate(tqdm(perm)):
+ if np.random.uniform() < reduction_rate:
+ y[idx] = spec_utils.reduce_vocal_aggressively(
+ X[idx], y[idx], reduction_mask
+ )
+
+        if np.random.uniform() < 0.5:
+            # randomly swap the left/right channels
+            X[idx] = X[idx, ::-1]
+            y[idx] = y[idx, ::-1]
+        if np.random.uniform() < 0.02:
+            # downmix both spectrograms to mono
+            X[idx] = X[idx].mean(axis=0, keepdims=True)
+            y[idx] = y[idx].mean(axis=0, keepdims=True)
+        if np.random.uniform() < 0.02:
+            # use the instrumental target as the mixture (a vocal-free example)
+            X[idx] = y[idx]
+
+ if np.random.uniform() < mixup_rate and i < len(perm) - 1:
+ lam = np.random.beta(mixup_alpha, mixup_alpha)
+ X[idx] = lam * X[idx] + (1 - lam) * X[perm[i + 1]]
+ y[idx] = lam * y[idx] + (1 - lam) * y[perm[i + 1]]
+
+ return X, y
+
+
+def make_padding(width, cropsize, offset):
+ left = offset
+ roi_size = cropsize - left * 2
+ if roi_size == 0:
+ roi_size = cropsize
+ right = roi_size - (width % roi_size) + left
+
+ return left, right, roi_size
+
+
+def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset):
+ len_dataset = patches * len(filelist)
+
+ X_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
+ y_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
+
+ for i, (X_path, y_path) in enumerate(tqdm(filelist)):
+ X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
+ coef = np.max([np.abs(X).max(), np.abs(y).max()])
+ X, y = X / coef, y / coef
+
+ l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
+ X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant")
+ y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant")
+
+ starts = np.random.randint(0, X_pad.shape[2] - cropsize, patches)
+ ends = starts + cropsize
+ for j in range(patches):
+ idx = i * patches + j
+ X_dataset[idx] = X_pad[:, :, starts[j] : ends[j]]
+ y_dataset[idx] = y_pad[:, :, starts[j] : ends[j]]
+
+ return X_dataset, y_dataset
+
+
+def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
+ patch_list = []
+ patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format(
+ cropsize, sr, hop_length, n_fft, offset
+ )
+ os.makedirs(patch_dir, exist_ok=True)
+
+ for i, (X_path, y_path) in enumerate(tqdm(filelist)):
+ basename = os.path.splitext(os.path.basename(X_path))[0]
+
+ X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
+ coef = np.max([np.abs(X).max(), np.abs(y).max()])
+ X, y = X / coef, y / coef
+
+ l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
+ X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant")
+ y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant")
+
+ len_dataset = int(np.ceil(X.shape[2] / roi_size))
+ for j in range(len_dataset):
+ outpath = os.path.join(patch_dir, "{}_p{}.npz".format(basename, j))
+ start = j * roi_size
+ if not os.path.exists(outpath):
+ np.savez(
+ outpath,
+ X=X_pad[:, :, start : start + cropsize],
+ y=y_pad[:, :, start : start + cropsize],
+ )
+ patch_list.append(outpath)
+
+ return VocalRemoverValidationSet(patch_list)
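
A note on the padding arithmetic above: `make_padding` guarantees that, once the `2 * offset` context frames are discounted, the padded spectrogram tiles evenly into `roi_size` strides, which is what `make_validation_set` relies on. A minimal self-contained sketch (duplicating the helper so it runs standalone, with hypothetical sizes):

```python
def make_padding(width, cropsize, offset):
    left = offset
    roi_size = cropsize - left * 2
    if roi_size == 0:
        roi_size = cropsize
    right = roi_size - (width % roi_size) + left
    return left, right, roi_size

# hypothetical sizes: 1000 STFT frames, 256-frame crops, 32-frame offset
left, right, roi_size = make_padding(1000, 256, 32)
assert roi_size == 256 - 2 * 32                        # 192
assert (1000 + left + right - 2 * 32) % roi_size == 0  # crops tile the width
```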
diff --git a/tools/uvr5/lib/lib_v5/layers.py b/tools/uvr5/lib/lib_v5/layers.py
new file mode 100644
index 0000000..4fc1b5c
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/layers.py
@@ -0,0 +1,118 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import spec_utils
+
+
+class Conv2DBNActiv(nn.Module):
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+ super(Conv2DBNActiv, self).__init__()
+ self.conv = nn.Sequential(
+ nn.Conv2d(
+ nin,
+ nout,
+ kernel_size=ksize,
+ stride=stride,
+ padding=pad,
+ dilation=dilation,
+ bias=False,
+ ),
+ nn.BatchNorm2d(nout),
+ activ(),
+ )
+
+ def __call__(self, x):
+ return self.conv(x)
+
+
+class SeperableConv2DBNActiv(nn.Module):
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+ super(SeperableConv2DBNActiv, self).__init__()
+ self.conv = nn.Sequential(
+ nn.Conv2d(
+ nin,
+ nin,
+ kernel_size=ksize,
+ stride=stride,
+ padding=pad,
+ dilation=dilation,
+ groups=nin,
+ bias=False,
+ ),
+ nn.Conv2d(nin, nout, kernel_size=1, bias=False),
+ nn.BatchNorm2d(nout),
+ activ(),
+ )
+
+ def __call__(self, x):
+ return self.conv(x)
+
+
+class Encoder(nn.Module):
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
+ super(Encoder, self).__init__()
+ self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+ self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
+
+ def __call__(self, x):
+ skip = self.conv1(x)
+ h = self.conv2(skip)
+
+ return h, skip
+
+
+class Decoder(nn.Module):
+ def __init__(
+ self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
+ ):
+ super(Decoder, self).__init__()
+ self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+ self.dropout = nn.Dropout2d(0.1) if dropout else None
+
+ def __call__(self, x, skip=None):
+ x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
+ if skip is not None:
+ skip = spec_utils.crop_center(skip, x)
+ x = torch.cat([x, skip], dim=1)
+ h = self.conv(x)
+
+ if self.dropout is not None:
+ h = self.dropout(h)
+
+ return h
+
+
+class ASPPModule(nn.Module):
+ def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
+ super(ASPPModule, self).__init__()
+ self.conv1 = nn.Sequential(
+ nn.AdaptiveAvgPool2d((1, None)),
+ Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
+ )
+ self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
+ self.conv3 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
+ )
+ self.conv4 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
+ )
+ self.conv5 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+ )
+ self.bottleneck = nn.Sequential(
+ Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
+ )
+
+ def forward(self, x):
+ _, _, h, w = x.size()
+ feat1 = F.interpolate(
+ self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
+ )
+ feat2 = self.conv2(x)
+ feat3 = self.conv3(x)
+ feat4 = self.conv4(x)
+ feat5 = self.conv5(x)
+ out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
+ bottle = self.bottleneck(out)
+ return bottle
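
A standalone shape check for the blocks above (hypothetical sizes; assumes the repo root is on `sys.path` and the package's dependencies are installed):

```python
import torch

from tools.uvr5.lib.lib_v5 import layers

x = torch.randn(1, 2, 512, 128)       # (N, channels, freq bins, frames)
enc = layers.Encoder(2, 16, 3, 2, 1)  # stride-2 second conv halves both dims
h, skip = enc(x)
assert h.shape == (1, 16, 256, 64) and skip.shape == (1, 16, 512, 128)

aspp = layers.ASPPModule(16, 32)      # five parallel branches -> 1x1 bottleneck
assert aspp(h).shape == (1, 32, 256, 64)

dec = layers.Decoder(16 + 16, 16)     # upsample x2, concat cropped skip, conv
assert dec(h, skip).shape == (1, 16, 512, 128)
```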
diff --git a/tools/uvr5/lib/lib_v5/layers_123812KB.py b/tools/uvr5/lib/lib_v5/layers_123812KB.py
new file mode 100644
index 0000000..4fc1b5c
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/layers_123812KB.py
@@ -0,0 +1,118 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import spec_utils
+
+
+class Conv2DBNActiv(nn.Module):
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+ super(Conv2DBNActiv, self).__init__()
+ self.conv = nn.Sequential(
+ nn.Conv2d(
+ nin,
+ nout,
+ kernel_size=ksize,
+ stride=stride,
+ padding=pad,
+ dilation=dilation,
+ bias=False,
+ ),
+ nn.BatchNorm2d(nout),
+ activ(),
+ )
+
+ def __call__(self, x):
+ return self.conv(x)
+
+
+class SeperableConv2DBNActiv(nn.Module):
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+ super(SeperableConv2DBNActiv, self).__init__()
+ self.conv = nn.Sequential(
+ nn.Conv2d(
+ nin,
+ nin,
+ kernel_size=ksize,
+ stride=stride,
+ padding=pad,
+ dilation=dilation,
+ groups=nin,
+ bias=False,
+ ),
+ nn.Conv2d(nin, nout, kernel_size=1, bias=False),
+ nn.BatchNorm2d(nout),
+ activ(),
+ )
+
+ def __call__(self, x):
+ return self.conv(x)
+
+
+class Encoder(nn.Module):
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
+ super(Encoder, self).__init__()
+ self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+ self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
+
+ def __call__(self, x):
+ skip = self.conv1(x)
+ h = self.conv2(skip)
+
+ return h, skip
+
+
+class Decoder(nn.Module):
+ def __init__(
+ self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
+ ):
+ super(Decoder, self).__init__()
+ self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+ self.dropout = nn.Dropout2d(0.1) if dropout else None
+
+ def __call__(self, x, skip=None):
+ x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
+ if skip is not None:
+ skip = spec_utils.crop_center(skip, x)
+ x = torch.cat([x, skip], dim=1)
+ h = self.conv(x)
+
+ if self.dropout is not None:
+ h = self.dropout(h)
+
+ return h
+
+
+class ASPPModule(nn.Module):
+ def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
+ super(ASPPModule, self).__init__()
+ self.conv1 = nn.Sequential(
+ nn.AdaptiveAvgPool2d((1, None)),
+ Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
+ )
+ self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
+ self.conv3 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
+ )
+ self.conv4 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
+ )
+ self.conv5 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+ )
+ self.bottleneck = nn.Sequential(
+ Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
+ )
+
+ def forward(self, x):
+ _, _, h, w = x.size()
+ feat1 = F.interpolate(
+ self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
+ )
+ feat2 = self.conv2(x)
+ feat3 = self.conv3(x)
+ feat4 = self.conv4(x)
+ feat5 = self.conv5(x)
+ out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
+ bottle = self.bottleneck(out)
+ return bottle
diff --git a/tools/uvr5/lib/lib_v5/layers_123821KB.py b/tools/uvr5/lib/lib_v5/layers_123821KB.py
new file mode 100644
index 0000000..4fc1b5c
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/layers_123821KB.py
@@ -0,0 +1,118 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import spec_utils
+
+
+class Conv2DBNActiv(nn.Module):
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+ super(Conv2DBNActiv, self).__init__()
+ self.conv = nn.Sequential(
+ nn.Conv2d(
+ nin,
+ nout,
+ kernel_size=ksize,
+ stride=stride,
+ padding=pad,
+ dilation=dilation,
+ bias=False,
+ ),
+ nn.BatchNorm2d(nout),
+ activ(),
+ )
+
+ def __call__(self, x):
+ return self.conv(x)
+
+
+class SeperableConv2DBNActiv(nn.Module):
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+ super(SeperableConv2DBNActiv, self).__init__()
+ self.conv = nn.Sequential(
+ nn.Conv2d(
+ nin,
+ nin,
+ kernel_size=ksize,
+ stride=stride,
+ padding=pad,
+ dilation=dilation,
+ groups=nin,
+ bias=False,
+ ),
+ nn.Conv2d(nin, nout, kernel_size=1, bias=False),
+ nn.BatchNorm2d(nout),
+ activ(),
+ )
+
+ def __call__(self, x):
+ return self.conv(x)
+
+
+class Encoder(nn.Module):
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
+ super(Encoder, self).__init__()
+ self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+ self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
+
+ def __call__(self, x):
+ skip = self.conv1(x)
+ h = self.conv2(skip)
+
+ return h, skip
+
+
+class Decoder(nn.Module):
+ def __init__(
+ self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
+ ):
+ super(Decoder, self).__init__()
+ self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+ self.dropout = nn.Dropout2d(0.1) if dropout else None
+
+ def __call__(self, x, skip=None):
+ x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
+ if skip is not None:
+ skip = spec_utils.crop_center(skip, x)
+ x = torch.cat([x, skip], dim=1)
+ h = self.conv(x)
+
+ if self.dropout is not None:
+ h = self.dropout(h)
+
+ return h
+
+
+class ASPPModule(nn.Module):
+ def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
+ super(ASPPModule, self).__init__()
+ self.conv1 = nn.Sequential(
+ nn.AdaptiveAvgPool2d((1, None)),
+ Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
+ )
+ self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
+ self.conv3 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
+ )
+ self.conv4 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
+ )
+ self.conv5 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+ )
+ self.bottleneck = nn.Sequential(
+ Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
+ )
+
+ def forward(self, x):
+ _, _, h, w = x.size()
+ feat1 = F.interpolate(
+ self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
+ )
+ feat2 = self.conv2(x)
+ feat3 = self.conv3(x)
+ feat4 = self.conv4(x)
+ feat5 = self.conv5(x)
+ out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
+ bottle = self.bottleneck(out)
+ return bottle
diff --git a/tools/uvr5/lib/lib_v5/layers_33966KB.py b/tools/uvr5/lib/lib_v5/layers_33966KB.py
new file mode 100644
index 0000000..9b127bc
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/layers_33966KB.py
@@ -0,0 +1,126 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import spec_utils
+
+
+class Conv2DBNActiv(nn.Module):
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+ super(Conv2DBNActiv, self).__init__()
+ self.conv = nn.Sequential(
+ nn.Conv2d(
+ nin,
+ nout,
+ kernel_size=ksize,
+ stride=stride,
+ padding=pad,
+ dilation=dilation,
+ bias=False,
+ ),
+ nn.BatchNorm2d(nout),
+ activ(),
+ )
+
+ def __call__(self, x):
+ return self.conv(x)
+
+
+class SeperableConv2DBNActiv(nn.Module):
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+ super(SeperableConv2DBNActiv, self).__init__()
+ self.conv = nn.Sequential(
+ nn.Conv2d(
+ nin,
+ nin,
+ kernel_size=ksize,
+ stride=stride,
+ padding=pad,
+ dilation=dilation,
+ groups=nin,
+ bias=False,
+ ),
+ nn.Conv2d(nin, nout, kernel_size=1, bias=False),
+ nn.BatchNorm2d(nout),
+ activ(),
+ )
+
+ def __call__(self, x):
+ return self.conv(x)
+
+
+class Encoder(nn.Module):
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
+ super(Encoder, self).__init__()
+ self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+ self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
+
+ def __call__(self, x):
+ skip = self.conv1(x)
+ h = self.conv2(skip)
+
+ return h, skip
+
+
+class Decoder(nn.Module):
+ def __init__(
+ self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
+ ):
+ super(Decoder, self).__init__()
+ self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+ self.dropout = nn.Dropout2d(0.1) if dropout else None
+
+ def __call__(self, x, skip=None):
+ x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
+ if skip is not None:
+ skip = spec_utils.crop_center(skip, x)
+ x = torch.cat([x, skip], dim=1)
+ h = self.conv(x)
+
+ if self.dropout is not None:
+ h = self.dropout(h)
+
+ return h
+
+
+class ASPPModule(nn.Module):
+ def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
+ super(ASPPModule, self).__init__()
+ self.conv1 = nn.Sequential(
+ nn.AdaptiveAvgPool2d((1, None)),
+ Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
+ )
+ self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
+ self.conv3 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
+ )
+ self.conv4 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
+ )
+ self.conv5 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+ )
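+        # Note: conv6 and conv7 reuse dilations[2]; the trailing entries of
+        # the default tuple (32, 64) are never read in this variant.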
+ self.conv6 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+ )
+ self.conv7 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+ )
+ self.bottleneck = nn.Sequential(
+ Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
+ )
+
+ def forward(self, x):
+ _, _, h, w = x.size()
+ feat1 = F.interpolate(
+ self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
+ )
+ feat2 = self.conv2(x)
+ feat3 = self.conv3(x)
+ feat4 = self.conv4(x)
+ feat5 = self.conv5(x)
+ feat6 = self.conv6(x)
+ feat7 = self.conv7(x)
+ out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
+ bottle = self.bottleneck(out)
+ return bottle
diff --git a/tools/uvr5/lib/lib_v5/layers_537227KB.py b/tools/uvr5/lib/lib_v5/layers_537227KB.py
new file mode 100644
index 0000000..9b127bc
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/layers_537227KB.py
@@ -0,0 +1,126 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import spec_utils
+
+
+class Conv2DBNActiv(nn.Module):
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+ super(Conv2DBNActiv, self).__init__()
+ self.conv = nn.Sequential(
+ nn.Conv2d(
+ nin,
+ nout,
+ kernel_size=ksize,
+ stride=stride,
+ padding=pad,
+ dilation=dilation,
+ bias=False,
+ ),
+ nn.BatchNorm2d(nout),
+ activ(),
+ )
+
+ def __call__(self, x):
+ return self.conv(x)
+
+
+class SeperableConv2DBNActiv(nn.Module):
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+ super(SeperableConv2DBNActiv, self).__init__()
+ self.conv = nn.Sequential(
+ nn.Conv2d(
+ nin,
+ nin,
+ kernel_size=ksize,
+ stride=stride,
+ padding=pad,
+ dilation=dilation,
+ groups=nin,
+ bias=False,
+ ),
+ nn.Conv2d(nin, nout, kernel_size=1, bias=False),
+ nn.BatchNorm2d(nout),
+ activ(),
+ )
+
+ def __call__(self, x):
+ return self.conv(x)
+
+
+class Encoder(nn.Module):
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
+ super(Encoder, self).__init__()
+ self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+ self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
+
+ def __call__(self, x):
+ skip = self.conv1(x)
+ h = self.conv2(skip)
+
+ return h, skip
+
+
+class Decoder(nn.Module):
+ def __init__(
+ self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
+ ):
+ super(Decoder, self).__init__()
+ self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+ self.dropout = nn.Dropout2d(0.1) if dropout else None
+
+ def __call__(self, x, skip=None):
+ x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
+ if skip is not None:
+ skip = spec_utils.crop_center(skip, x)
+ x = torch.cat([x, skip], dim=1)
+ h = self.conv(x)
+
+ if self.dropout is not None:
+ h = self.dropout(h)
+
+ return h
+
+
+class ASPPModule(nn.Module):
+ def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
+ super(ASPPModule, self).__init__()
+ self.conv1 = nn.Sequential(
+ nn.AdaptiveAvgPool2d((1, None)),
+ Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
+ )
+ self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
+ self.conv3 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
+ )
+ self.conv4 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
+ )
+ self.conv5 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+ )
+ self.conv6 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+ )
+ self.conv7 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+ )
+ self.bottleneck = nn.Sequential(
+ Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
+ )
+
+ def forward(self, x):
+ _, _, h, w = x.size()
+ feat1 = F.interpolate(
+ self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
+ )
+ feat2 = self.conv2(x)
+ feat3 = self.conv3(x)
+ feat4 = self.conv4(x)
+ feat5 = self.conv5(x)
+ feat6 = self.conv6(x)
+ feat7 = self.conv7(x)
+ out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
+ bottle = self.bottleneck(out)
+ return bottle
diff --git a/tools/uvr5/lib/lib_v5/layers_537238KB.py b/tools/uvr5/lib/lib_v5/layers_537238KB.py
new file mode 100644
index 0000000..9b127bc
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/layers_537238KB.py
@@ -0,0 +1,126 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import spec_utils
+
+
+class Conv2DBNActiv(nn.Module):
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+ super(Conv2DBNActiv, self).__init__()
+ self.conv = nn.Sequential(
+ nn.Conv2d(
+ nin,
+ nout,
+ kernel_size=ksize,
+ stride=stride,
+ padding=pad,
+ dilation=dilation,
+ bias=False,
+ ),
+ nn.BatchNorm2d(nout),
+ activ(),
+ )
+
+ def __call__(self, x):
+ return self.conv(x)
+
+
+class SeperableConv2DBNActiv(nn.Module):
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+ super(SeperableConv2DBNActiv, self).__init__()
+ self.conv = nn.Sequential(
+ nn.Conv2d(
+ nin,
+ nin,
+ kernel_size=ksize,
+ stride=stride,
+ padding=pad,
+ dilation=dilation,
+ groups=nin,
+ bias=False,
+ ),
+ nn.Conv2d(nin, nout, kernel_size=1, bias=False),
+ nn.BatchNorm2d(nout),
+ activ(),
+ )
+
+ def __call__(self, x):
+ return self.conv(x)
+
+
+class Encoder(nn.Module):
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
+ super(Encoder, self).__init__()
+ self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+ self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
+
+ def __call__(self, x):
+ skip = self.conv1(x)
+ h = self.conv2(skip)
+
+ return h, skip
+
+
+class Decoder(nn.Module):
+ def __init__(
+ self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
+ ):
+ super(Decoder, self).__init__()
+ self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+ self.dropout = nn.Dropout2d(0.1) if dropout else None
+
+ def __call__(self, x, skip=None):
+ x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
+ if skip is not None:
+ skip = spec_utils.crop_center(skip, x)
+ x = torch.cat([x, skip], dim=1)
+ h = self.conv(x)
+
+ if self.dropout is not None:
+ h = self.dropout(h)
+
+ return h
+
+
+class ASPPModule(nn.Module):
+ def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
+ super(ASPPModule, self).__init__()
+ self.conv1 = nn.Sequential(
+ nn.AdaptiveAvgPool2d((1, None)),
+ Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
+ )
+ self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
+ self.conv3 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
+ )
+ self.conv4 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
+ )
+ self.conv5 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+ )
+ self.conv6 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+ )
+ self.conv7 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+ )
+ self.bottleneck = nn.Sequential(
+ Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
+ )
+
+ def forward(self, x):
+ _, _, h, w = x.size()
+ feat1 = F.interpolate(
+ self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
+ )
+ feat2 = self.conv2(x)
+ feat3 = self.conv3(x)
+ feat4 = self.conv4(x)
+ feat5 = self.conv5(x)
+ feat6 = self.conv6(x)
+ feat7 = self.conv7(x)
+ out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
+ bottle = self.bottleneck(out)
+ return bottle
diff --git a/tools/uvr5/lib/lib_v5/layers_new.py b/tools/uvr5/lib/lib_v5/layers_new.py
new file mode 100644
index 0000000..44153b6
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/layers_new.py
@@ -0,0 +1,125 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import spec_utils
+
+
+class Conv2DBNActiv(nn.Module):
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+ super(Conv2DBNActiv, self).__init__()
+ self.conv = nn.Sequential(
+ nn.Conv2d(
+ nin,
+ nout,
+ kernel_size=ksize,
+ stride=stride,
+ padding=pad,
+ dilation=dilation,
+ bias=False,
+ ),
+ nn.BatchNorm2d(nout),
+ activ(),
+ )
+
+ def __call__(self, x):
+ return self.conv(x)
+
+
+class Encoder(nn.Module):
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
+ super(Encoder, self).__init__()
+ self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ)
+ self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
+
+ def __call__(self, x):
+ h = self.conv1(x)
+ h = self.conv2(h)
+
+ return h
+
+
+class Decoder(nn.Module):
+ def __init__(
+ self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
+ ):
+ super(Decoder, self).__init__()
+ self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+ # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
+ self.dropout = nn.Dropout2d(0.1) if dropout else None
+
+ def __call__(self, x, skip=None):
+ x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
+
+ if skip is not None:
+ skip = spec_utils.crop_center(skip, x)
+ x = torch.cat([x, skip], dim=1)
+
+ h = self.conv1(x)
+ # h = self.conv2(h)
+
+ if self.dropout is not None:
+ h = self.dropout(h)
+
+ return h
+
+
+class ASPPModule(nn.Module):
+ def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False):
+ super(ASPPModule, self).__init__()
+ self.conv1 = nn.Sequential(
+ nn.AdaptiveAvgPool2d((1, None)),
+ Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ),
+ )
+ self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ)
+ self.conv3 = Conv2DBNActiv(
+ nin, nout, 3, 1, dilations[0], dilations[0], activ=activ
+ )
+ self.conv4 = Conv2DBNActiv(
+ nin, nout, 3, 1, dilations[1], dilations[1], activ=activ
+ )
+ self.conv5 = Conv2DBNActiv(
+ nin, nout, 3, 1, dilations[2], dilations[2], activ=activ
+ )
+ self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ)
+ self.dropout = nn.Dropout2d(0.1) if dropout else None
+
+ def forward(self, x):
+ _, _, h, w = x.size()
+ feat1 = F.interpolate(
+ self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
+ )
+ feat2 = self.conv2(x)
+ feat3 = self.conv3(x)
+ feat4 = self.conv4(x)
+ feat5 = self.conv5(x)
+ out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
+ out = self.bottleneck(out)
+
+ if self.dropout is not None:
+ out = self.dropout(out)
+
+ return out
+
+
+class LSTMModule(nn.Module):
+ def __init__(self, nin_conv, nin_lstm, nout_lstm):
+ super(LSTMModule, self).__init__()
+ self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0)
+ self.lstm = nn.LSTM(
+ input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True
+ )
+ self.dense = nn.Sequential(
+ nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU()
+ )
+
+ def forward(self, x):
+ N, _, nbins, nframes = x.size()
+ h = self.conv(x)[:, 0] # N, nbins, nframes
+ h = h.permute(2, 0, 1) # nframes, N, nbins
+ h, _ = self.lstm(h)
+ h = self.dense(h.reshape(-1, h.size()[-1])) # nframes * N, nbins
+ h = h.reshape(nframes, N, 1, nbins)
+ h = h.permute(1, 2, 3, 0)
+
+ return h
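
A quick tensor-flow sketch for `LSTMModule` (hypothetical sizes; assumes the repo root is on `sys.path`):

```python
import torch

from tools.uvr5.lib.lib_v5 import layers_new

m = layers_new.LSTMModule(nin_conv=8, nin_lstm=64, nout_lstm=128)
m.eval()                              # freeze BN running stats for a pure shape check
x = torch.randn(2, 8, 64, 100)        # (N, C, nbins, nframes)
with torch.no_grad():
    out = m(x)
assert out.shape == (2, 1, 64, 100)   # one channel of per-frame band features
```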
diff --git a/tools/uvr5/lib/lib_v5/model_param_init.py b/tools/uvr5/lib/lib_v5/model_param_init.py
new file mode 100644
index 0000000..b995c0b
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/model_param_init.py
@@ -0,0 +1,69 @@
+import json
+import os
+import pathlib
+
+default_param = {}
+default_param["bins"] = 768
+default_param["unstable_bins"] = 9 # training only
+default_param["reduction_bins"] = 762 # training only
+default_param["sr"] = 44100
+default_param["pre_filter_start"] = 757
+default_param["pre_filter_stop"] = 768
+default_param["band"] = {}
+
+
+default_param["band"][1] = {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 960,
+ "crop_start": 0,
+ "crop_stop": 245,
+ "lpf_start": 61, # inference only
+ "res_type": "polyphase",
+}
+
+default_param["band"][2] = {
+ "sr": 44100,
+ "hl": 512,
+ "n_fft": 1536,
+ "crop_start": 24,
+ "crop_stop": 547,
+ "hpf_start": 81, # inference only
+ "res_type": "sinc_best",
+}
+
+
+def int_keys(d):
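+    # `d` arrives as a list of (key, value) pairs via json's object_pairs_hook;
+    # digit-only keys (the band indices) are converted back to ints.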
+ r = {}
+ for k, v in d:
+ if k.isdigit():
+ k = int(k)
+ r[k] = v
+ return r
+
+
+class ModelParameters(object):
+ def __init__(self, config_path=""):
+ if ".pth" == pathlib.Path(config_path).suffix:
+ import zipfile
+
+            with zipfile.ZipFile(config_path, "r") as zf:  # avoid shadowing builtin zip
+                self.param = json.loads(
+                    zf.read("param.json"), object_pairs_hook=int_keys
+                )
+ elif ".json" == pathlib.Path(config_path).suffix:
+ with open(config_path, "r") as f:
+ self.param = json.loads(f.read(), object_pairs_hook=int_keys)
+ else:
+ self.param = default_param
+
+ for k in [
+ "mid_side",
+ "mid_side_b",
+ "mid_side_b2",
+ "stereo_w",
+ "stereo_n",
+ "reverse",
+ ]:
+            if k not in self.param:
+ self.param[k] = False
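
A minimal usage sketch (path assumed relative to the repo root):

```python
from tools.uvr5.lib.lib_v5.model_param_init import ModelParameters

mp = ModelParameters("tools/uvr5/lib/lib_v5/modelparams/4band_v2.json")
print(mp.param["sr"], mp.param["bins"])     # 44100 672
for band, cfg in mp.param["band"].items():  # int keys courtesy of int_keys
    print(band, cfg["sr"], cfg["hl"], cfg["n_fft"])
assert mp.param["mid_side"] is False        # filled in by the constructor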
diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json b/tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json
new file mode 100644
index 0000000..72cb449
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json
@@ -0,0 +1,19 @@
+{
+ "bins": 1024,
+ "unstable_bins": 0,
+ "reduction_bins": 0,
+ "band": {
+ "1": {
+ "sr": 16000,
+ "hl": 512,
+ "n_fft": 2048,
+ "crop_start": 0,
+ "crop_stop": 1024,
+ "hpf_start": -1,
+ "res_type": "sinc_best"
+ }
+ },
+ "sr": 16000,
+ "pre_filter_start": 1023,
+ "pre_filter_stop": 1024
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json b/tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json
new file mode 100644
index 0000000..3c00ecf
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json
@@ -0,0 +1,19 @@
+{
+ "bins": 1024,
+ "unstable_bins": 0,
+ "reduction_bins": 0,
+ "band": {
+ "1": {
+ "sr": 32000,
+ "hl": 512,
+ "n_fft": 2048,
+ "crop_start": 0,
+ "crop_stop": 1024,
+ "hpf_start": -1,
+ "res_type": "kaiser_fast"
+ }
+ },
+ "sr": 32000,
+ "pre_filter_start": 1000,
+ "pre_filter_stop": 1021
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json b/tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json
new file mode 100644
index 0000000..55666ac
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json
@@ -0,0 +1,19 @@
+{
+ "bins": 1024,
+ "unstable_bins": 0,
+ "reduction_bins": 0,
+ "band": {
+ "1": {
+ "sr": 33075,
+ "hl": 384,
+ "n_fft": 2048,
+ "crop_start": 0,
+ "crop_stop": 1024,
+ "hpf_start": -1,
+ "res_type": "sinc_best"
+ }
+ },
+ "sr": 33075,
+ "pre_filter_start": 1000,
+ "pre_filter_stop": 1021
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json b/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json
new file mode 100644
index 0000000..665abe2
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json
@@ -0,0 +1,19 @@
+{
+ "bins": 1024,
+ "unstable_bins": 0,
+ "reduction_bins": 0,
+ "band": {
+ "1": {
+ "sr": 44100,
+ "hl": 1024,
+ "n_fft": 2048,
+ "crop_start": 0,
+ "crop_stop": 1024,
+ "hpf_start": -1,
+ "res_type": "sinc_best"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 1023,
+ "pre_filter_stop": 1024
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json b/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json
new file mode 100644
index 0000000..0e8b16f
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json
@@ -0,0 +1,19 @@
+{
+ "bins": 256,
+ "unstable_bins": 0,
+ "reduction_bins": 0,
+ "band": {
+ "1": {
+ "sr": 44100,
+ "hl": 256,
+ "n_fft": 512,
+ "crop_start": 0,
+ "crop_stop": 256,
+ "hpf_start": -1,
+ "res_type": "sinc_best"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 256,
+ "pre_filter_stop": 256
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json b/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json
new file mode 100644
index 0000000..3b38fca
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json
@@ -0,0 +1,19 @@
+{
+ "bins": 1024,
+ "unstable_bins": 0,
+ "reduction_bins": 0,
+ "band": {
+ "1": {
+ "sr": 44100,
+ "hl": 512,
+ "n_fft": 2048,
+ "crop_start": 0,
+ "crop_stop": 1024,
+ "hpf_start": -1,
+ "res_type": "sinc_best"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 1023,
+ "pre_filter_stop": 1024
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json b/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json
new file mode 100644
index 0000000..630df35
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json
@@ -0,0 +1,19 @@
+{
+ "bins": 1024,
+ "unstable_bins": 0,
+ "reduction_bins": 0,
+ "band": {
+ "1": {
+ "sr": 44100,
+ "hl": 512,
+ "n_fft": 2048,
+ "crop_start": 0,
+ "crop_stop": 700,
+ "hpf_start": -1,
+ "res_type": "sinc_best"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 1023,
+ "pre_filter_stop": 700
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/2band_32000.json b/tools/uvr5/lib/lib_v5/modelparams/2band_32000.json
new file mode 100644
index 0000000..ab9cf11
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/2band_32000.json
@@ -0,0 +1,30 @@
+{
+ "bins": 768,
+ "unstable_bins": 7,
+ "reduction_bins": 705,
+ "band": {
+ "1": {
+ "sr": 6000,
+ "hl": 66,
+ "n_fft": 512,
+ "crop_start": 0,
+ "crop_stop": 240,
+ "lpf_start": 60,
+ "lpf_stop": 118,
+ "res_type": "sinc_fastest"
+ },
+ "2": {
+ "sr": 32000,
+ "hl": 352,
+ "n_fft": 1024,
+ "crop_start": 22,
+ "crop_stop": 505,
+ "hpf_start": 44,
+ "hpf_stop": 23,
+ "res_type": "sinc_medium"
+ }
+ },
+ "sr": 32000,
+ "pre_filter_start": 710,
+ "pre_filter_stop": 731
+}
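
Reading these configs: per band, the audio is resampled to `sr`, analyzed with an STFT of size `n_fft` and hop `hl`, and bins `[crop_start, crop_stop)` are kept; the `lpf_*`/`hpf_*` fields mark the bins where adjacent bands are crossfaded. A hedged sketch of what the numbers above imply:

```python
# Values copied from 2band_32000.json; the Hz-per-bin reading is an
# interpretation of how spec_utils consumes these fields, not repo API.
bands = {
    1: {"sr": 6000, "n_fft": 512, "crop_start": 0, "crop_stop": 240},
    2: {"sr": 32000, "n_fft": 1024, "crop_start": 22, "crop_stop": 505},
}
for b, c in bands.items():
    hz_per_bin = c["sr"] / c["n_fft"]
    lo, hi = c["crop_start"] * hz_per_bin, c["crop_stop"] * hz_per_bin
    print(f"band {b}: {hz_per_bin:.2f} Hz/bin, keeps ~{lo:.0f}-{hi:.0f} Hz")
```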
diff --git a/tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json b/tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json
new file mode 100644
index 0000000..7faa216
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json
@@ -0,0 +1,30 @@
+{
+ "bins": 512,
+ "unstable_bins": 7,
+ "reduction_bins": 510,
+ "band": {
+ "1": {
+ "sr": 11025,
+ "hl": 160,
+ "n_fft": 768,
+ "crop_start": 0,
+ "crop_stop": 192,
+ "lpf_start": 41,
+ "lpf_stop": 139,
+ "res_type": "sinc_fastest"
+ },
+ "2": {
+ "sr": 44100,
+ "hl": 640,
+ "n_fft": 1024,
+ "crop_start": 10,
+ "crop_stop": 320,
+ "hpf_start": 47,
+ "hpf_stop": 15,
+ "res_type": "sinc_medium"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 510,
+ "pre_filter_stop": 512
+}
diff --git a/tools/uvr5/lib/lib_v5/modelparams/2band_48000.json b/tools/uvr5/lib/lib_v5/modelparams/2band_48000.json
new file mode 100644
index 0000000..7e78175
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/2band_48000.json
@@ -0,0 +1,30 @@
+{
+ "bins": 768,
+ "unstable_bins": 7,
+ "reduction_bins": 705,
+ "band": {
+ "1": {
+ "sr": 6000,
+ "hl": 66,
+ "n_fft": 512,
+ "crop_start": 0,
+ "crop_stop": 240,
+ "lpf_start": 60,
+ "lpf_stop": 240,
+ "res_type": "sinc_fastest"
+ },
+ "2": {
+ "sr": 48000,
+ "hl": 528,
+ "n_fft": 1536,
+ "crop_start": 22,
+ "crop_stop": 505,
+ "hpf_start": 82,
+ "hpf_stop": 22,
+ "res_type": "sinc_medium"
+ }
+ },
+ "sr": 48000,
+ "pre_filter_start": 710,
+ "pre_filter_stop": 731
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/3band_44100.json b/tools/uvr5/lib/lib_v5/modelparams/3band_44100.json
new file mode 100644
index 0000000..d881d76
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/3band_44100.json
@@ -0,0 +1,42 @@
+{
+ "bins": 768,
+ "unstable_bins": 5,
+ "reduction_bins": 733,
+ "band": {
+ "1": {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 768,
+ "crop_start": 0,
+ "crop_stop": 278,
+ "lpf_start": 28,
+ "lpf_stop": 140,
+ "res_type": "polyphase"
+ },
+ "2": {
+ "sr": 22050,
+ "hl": 256,
+ "n_fft": 768,
+ "crop_start": 14,
+ "crop_stop": 322,
+ "hpf_start": 70,
+ "hpf_stop": 14,
+ "lpf_start": 283,
+ "lpf_stop": 314,
+ "res_type": "polyphase"
+ },
+ "3": {
+ "sr": 44100,
+ "hl": 512,
+ "n_fft": 768,
+ "crop_start": 131,
+ "crop_stop": 313,
+ "hpf_start": 154,
+ "hpf_stop": 141,
+ "res_type": "sinc_medium"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 757,
+ "pre_filter_stop": 768
+}
diff --git a/tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json b/tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json
new file mode 100644
index 0000000..77ec198
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json
@@ -0,0 +1,43 @@
+{
+ "mid_side": true,
+ "bins": 768,
+ "unstable_bins": 5,
+ "reduction_bins": 733,
+ "band": {
+ "1": {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 768,
+ "crop_start": 0,
+ "crop_stop": 278,
+ "lpf_start": 28,
+ "lpf_stop": 140,
+ "res_type": "polyphase"
+ },
+ "2": {
+ "sr": 22050,
+ "hl": 256,
+ "n_fft": 768,
+ "crop_start": 14,
+ "crop_stop": 322,
+ "hpf_start": 70,
+ "hpf_stop": 14,
+ "lpf_start": 283,
+ "lpf_stop": 314,
+ "res_type": "polyphase"
+ },
+ "3": {
+ "sr": 44100,
+ "hl": 512,
+ "n_fft": 768,
+ "crop_start": 131,
+ "crop_stop": 313,
+ "hpf_start": 154,
+ "hpf_stop": 141,
+ "res_type": "sinc_medium"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 757,
+ "pre_filter_stop": 768
+}
diff --git a/tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json b/tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json
new file mode 100644
index 0000000..85ee8a7
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json
@@ -0,0 +1,43 @@
+{
+ "mid_side_b2": true,
+ "bins": 640,
+ "unstable_bins": 7,
+ "reduction_bins": 565,
+ "band": {
+ "1": {
+ "sr": 11025,
+ "hl": 108,
+ "n_fft": 1024,
+ "crop_start": 0,
+ "crop_stop": 187,
+ "lpf_start": 92,
+ "lpf_stop": 186,
+ "res_type": "polyphase"
+ },
+ "2": {
+ "sr": 22050,
+ "hl": 216,
+ "n_fft": 768,
+ "crop_start": 0,
+ "crop_stop": 212,
+ "hpf_start": 68,
+ "hpf_stop": 34,
+ "lpf_start": 174,
+ "lpf_stop": 209,
+ "res_type": "polyphase"
+ },
+ "3": {
+ "sr": 44100,
+ "hl": 432,
+ "n_fft": 640,
+ "crop_start": 66,
+ "crop_stop": 307,
+ "hpf_start": 86,
+ "hpf_stop": 72,
+ "res_type": "kaiser_fast"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 639,
+ "pre_filter_stop": 640
+}
diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100.json b/tools/uvr5/lib/lib_v5/modelparams/4band_44100.json
new file mode 100644
index 0000000..df12375
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/4band_44100.json
@@ -0,0 +1,54 @@
+{
+ "bins": 768,
+ "unstable_bins": 7,
+ "reduction_bins": 668,
+ "band": {
+ "1": {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 1024,
+ "crop_start": 0,
+ "crop_stop": 186,
+ "lpf_start": 37,
+ "lpf_stop": 73,
+ "res_type": "polyphase"
+ },
+ "2": {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 512,
+ "crop_start": 4,
+ "crop_stop": 185,
+ "hpf_start": 36,
+ "hpf_stop": 18,
+ "lpf_start": 93,
+ "lpf_stop": 185,
+ "res_type": "polyphase"
+ },
+ "3": {
+ "sr": 22050,
+ "hl": 256,
+ "n_fft": 512,
+ "crop_start": 46,
+ "crop_stop": 186,
+ "hpf_start": 93,
+ "hpf_stop": 46,
+ "lpf_start": 164,
+ "lpf_stop": 186,
+ "res_type": "polyphase"
+ },
+ "4": {
+ "sr": 44100,
+ "hl": 512,
+ "n_fft": 768,
+ "crop_start": 121,
+ "crop_stop": 382,
+ "hpf_start": 138,
+ "hpf_stop": 123,
+ "res_type": "sinc_medium"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 740,
+ "pre_filter_stop": 768
+}
diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json b/tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json
new file mode 100644
index 0000000..e91b699
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json
@@ -0,0 +1,55 @@
+{
+ "bins": 768,
+ "unstable_bins": 7,
+ "mid_side": true,
+ "reduction_bins": 668,
+ "band": {
+ "1": {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 1024,
+ "crop_start": 0,
+ "crop_stop": 186,
+ "lpf_start": 37,
+ "lpf_stop": 73,
+ "res_type": "polyphase"
+ },
+ "2": {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 512,
+ "crop_start": 4,
+ "crop_stop": 185,
+ "hpf_start": 36,
+ "hpf_stop": 18,
+ "lpf_start": 93,
+ "lpf_stop": 185,
+ "res_type": "polyphase"
+ },
+ "3": {
+ "sr": 22050,
+ "hl": 256,
+ "n_fft": 512,
+ "crop_start": 46,
+ "crop_stop": 186,
+ "hpf_start": 93,
+ "hpf_stop": 46,
+ "lpf_start": 164,
+ "lpf_stop": 186,
+ "res_type": "polyphase"
+ },
+ "4": {
+ "sr": 44100,
+ "hl": 512,
+ "n_fft": 768,
+ "crop_start": 121,
+ "crop_stop": 382,
+ "hpf_start": 138,
+ "hpf_stop": 123,
+ "res_type": "sinc_medium"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 740,
+ "pre_filter_stop": 768
+}
diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json b/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json
new file mode 100644
index 0000000..f852f28
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json
@@ -0,0 +1,55 @@
+{
+ "mid_side_b": true,
+ "bins": 768,
+ "unstable_bins": 7,
+ "reduction_bins": 668,
+ "band": {
+ "1": {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 1024,
+ "crop_start": 0,
+ "crop_stop": 186,
+ "lpf_start": 37,
+ "lpf_stop": 73,
+ "res_type": "polyphase"
+ },
+ "2": {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 512,
+ "crop_start": 4,
+ "crop_stop": 185,
+ "hpf_start": 36,
+ "hpf_stop": 18,
+ "lpf_start": 93,
+ "lpf_stop": 185,
+ "res_type": "polyphase"
+ },
+ "3": {
+ "sr": 22050,
+ "hl": 256,
+ "n_fft": 512,
+ "crop_start": 46,
+ "crop_stop": 186,
+ "hpf_start": 93,
+ "hpf_stop": 46,
+ "lpf_start": 164,
+ "lpf_stop": 186,
+ "res_type": "polyphase"
+ },
+ "4": {
+ "sr": 44100,
+ "hl": 512,
+ "n_fft": 768,
+ "crop_start": 121,
+ "crop_stop": 382,
+ "hpf_start": 138,
+ "hpf_stop": 123,
+ "res_type": "sinc_medium"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 740,
+ "pre_filter_stop": 768
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json b/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json
new file mode 100644
index 0000000..f852f28
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json
@@ -0,0 +1,55 @@
+{
+ "mid_side_b": true,
+ "bins": 768,
+ "unstable_bins": 7,
+ "reduction_bins": 668,
+ "band": {
+ "1": {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 1024,
+ "crop_start": 0,
+ "crop_stop": 186,
+ "lpf_start": 37,
+ "lpf_stop": 73,
+ "res_type": "polyphase"
+ },
+ "2": {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 512,
+ "crop_start": 4,
+ "crop_stop": 185,
+ "hpf_start": 36,
+ "hpf_stop": 18,
+ "lpf_start": 93,
+ "lpf_stop": 185,
+ "res_type": "polyphase"
+ },
+ "3": {
+ "sr": 22050,
+ "hl": 256,
+ "n_fft": 512,
+ "crop_start": 46,
+ "crop_stop": 186,
+ "hpf_start": 93,
+ "hpf_stop": 46,
+ "lpf_start": 164,
+ "lpf_stop": 186,
+ "res_type": "polyphase"
+ },
+ "4": {
+ "sr": 44100,
+ "hl": 512,
+ "n_fft": 768,
+ "crop_start": 121,
+ "crop_stop": 382,
+ "hpf_start": 138,
+ "hpf_stop": 123,
+ "res_type": "sinc_medium"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 740,
+ "pre_filter_stop": 768
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json b/tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json
new file mode 100644
index 0000000..7a07d55
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json
@@ -0,0 +1,55 @@
+{
+ "reverse": true,
+ "bins": 768,
+ "unstable_bins": 7,
+ "reduction_bins": 668,
+ "band": {
+ "1": {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 1024,
+ "crop_start": 0,
+ "crop_stop": 186,
+ "lpf_start": 37,
+ "lpf_stop": 73,
+ "res_type": "polyphase"
+ },
+ "2": {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 512,
+ "crop_start": 4,
+ "crop_stop": 185,
+ "hpf_start": 36,
+ "hpf_stop": 18,
+ "lpf_start": 93,
+ "lpf_stop": 185,
+ "res_type": "polyphase"
+ },
+ "3": {
+ "sr": 22050,
+ "hl": 256,
+ "n_fft": 512,
+ "crop_start": 46,
+ "crop_stop": 186,
+ "hpf_start": 93,
+ "hpf_stop": 46,
+ "lpf_start": 164,
+ "lpf_stop": 186,
+ "res_type": "polyphase"
+ },
+ "4": {
+ "sr": 44100,
+ "hl": 512,
+ "n_fft": 768,
+ "crop_start": 121,
+ "crop_stop": 382,
+ "hpf_start": 138,
+ "hpf_stop": 123,
+ "res_type": "sinc_medium"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 740,
+ "pre_filter_stop": 768
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json b/tools/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json
new file mode 100644
index 0000000..ba0cf34
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json
@@ -0,0 +1,55 @@
+{
+ "stereo_w": true,
+ "bins": 768,
+ "unstable_bins": 7,
+ "reduction_bins": 668,
+ "band": {
+ "1": {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 1024,
+ "crop_start": 0,
+ "crop_stop": 186,
+ "lpf_start": 37,
+ "lpf_stop": 73,
+ "res_type": "polyphase"
+ },
+ "2": {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 512,
+ "crop_start": 4,
+ "crop_stop": 185,
+ "hpf_start": 36,
+ "hpf_stop": 18,
+ "lpf_start": 93,
+ "lpf_stop": 185,
+ "res_type": "polyphase"
+ },
+ "3": {
+ "sr": 22050,
+ "hl": 256,
+ "n_fft": 512,
+ "crop_start": 46,
+ "crop_stop": 186,
+ "hpf_start": 93,
+ "hpf_stop": 46,
+ "lpf_start": 164,
+ "lpf_stop": 186,
+ "res_type": "polyphase"
+ },
+ "4": {
+ "sr": 44100,
+ "hl": 512,
+ "n_fft": 768,
+ "crop_start": 121,
+ "crop_stop": 382,
+ "hpf_start": 138,
+ "hpf_stop": 123,
+ "res_type": "sinc_medium"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 740,
+ "pre_filter_stop": 768
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_v2.json b/tools/uvr5/lib/lib_v5/modelparams/4band_v2.json
new file mode 100644
index 0000000..33281a0
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/4band_v2.json
@@ -0,0 +1,54 @@
+{
+ "bins": 672,
+ "unstable_bins": 8,
+ "reduction_bins": 637,
+ "band": {
+ "1": {
+ "sr": 7350,
+ "hl": 80,
+ "n_fft": 640,
+ "crop_start": 0,
+ "crop_stop": 85,
+ "lpf_start": 25,
+ "lpf_stop": 53,
+ "res_type": "polyphase"
+ },
+ "2": {
+ "sr": 7350,
+ "hl": 80,
+ "n_fft": 320,
+ "crop_start": 4,
+ "crop_stop": 87,
+ "hpf_start": 25,
+ "hpf_stop": 12,
+ "lpf_start": 31,
+ "lpf_stop": 62,
+ "res_type": "polyphase"
+ },
+ "3": {
+ "sr": 14700,
+ "hl": 160,
+ "n_fft": 512,
+ "crop_start": 17,
+ "crop_stop": 216,
+ "hpf_start": 48,
+ "hpf_stop": 24,
+ "lpf_start": 139,
+ "lpf_stop": 210,
+ "res_type": "polyphase"
+ },
+ "4": {
+ "sr": 44100,
+ "hl": 480,
+ "n_fft": 960,
+ "crop_start": 78,
+ "crop_stop": 383,
+ "hpf_start": 130,
+ "hpf_stop": 86,
+ "res_type": "kaiser_fast"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 668,
+ "pre_filter_stop": 672
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json b/tools/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json
new file mode 100644
index 0000000..2e5c770
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json
@@ -0,0 +1,55 @@
+{
+ "bins": 672,
+ "unstable_bins": 8,
+ "reduction_bins": 637,
+ "band": {
+ "1": {
+ "sr": 7350,
+ "hl": 80,
+ "n_fft": 640,
+ "crop_start": 0,
+ "crop_stop": 85,
+ "lpf_start": 25,
+ "lpf_stop": 53,
+ "res_type": "polyphase"
+ },
+ "2": {
+ "sr": 7350,
+ "hl": 80,
+ "n_fft": 320,
+ "crop_start": 4,
+ "crop_stop": 87,
+ "hpf_start": 25,
+ "hpf_stop": 12,
+ "lpf_start": 31,
+ "lpf_stop": 62,
+ "res_type": "polyphase"
+ },
+ "3": {
+ "sr": 14700,
+ "hl": 160,
+ "n_fft": 512,
+ "crop_start": 17,
+ "crop_stop": 216,
+ "hpf_start": 48,
+ "hpf_stop": 24,
+ "lpf_start": 139,
+ "lpf_stop": 210,
+ "res_type": "polyphase"
+ },
+ "4": {
+ "sr": 44100,
+ "hl": 480,
+ "n_fft": 960,
+ "crop_start": 78,
+ "crop_stop": 383,
+ "hpf_start": 130,
+ "hpf_stop": 86,
+ "convert_channels": "stereo_n",
+ "res_type": "kaiser_fast"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 668,
+ "pre_filter_stop": 672
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_v3.json b/tools/uvr5/lib/lib_v5/modelparams/4band_v3.json
new file mode 100644
index 0000000..2a73bc9
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/4band_v3.json
@@ -0,0 +1,54 @@
+{
+ "bins": 672,
+ "unstable_bins": 8,
+ "reduction_bins": 530,
+ "band": {
+ "1": {
+ "sr": 7350,
+ "hl": 80,
+ "n_fft": 640,
+ "crop_start": 0,
+ "crop_stop": 85,
+ "lpf_start": 25,
+ "lpf_stop": 53,
+ "res_type": "polyphase"
+ },
+ "2": {
+ "sr": 7350,
+ "hl": 80,
+ "n_fft": 320,
+ "crop_start": 4,
+ "crop_stop": 87,
+ "hpf_start": 25,
+ "hpf_stop": 12,
+ "lpf_start": 31,
+ "lpf_stop": 62,
+ "res_type": "polyphase"
+ },
+ "3": {
+ "sr": 14700,
+ "hl": 160,
+ "n_fft": 512,
+ "crop_start": 17,
+ "crop_stop": 216,
+ "hpf_start": 48,
+ "hpf_stop": 24,
+ "lpf_start": 139,
+ "lpf_stop": 210,
+ "res_type": "polyphase"
+ },
+ "4": {
+ "sr": 44100,
+ "hl": 480,
+ "n_fft": 960,
+ "crop_start": 78,
+ "crop_stop": 383,
+ "hpf_start": 130,
+ "hpf_stop": 86,
+ "res_type": "kaiser_fast"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 668,
+ "pre_filter_stop": 672
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/ensemble.json b/tools/uvr5/lib/lib_v5/modelparams/ensemble.json
new file mode 100644
index 0000000..ee69beb
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/ensemble.json
@@ -0,0 +1,43 @@
+{
+ "mid_side_b2": true,
+ "bins": 1280,
+ "unstable_bins": 7,
+ "reduction_bins": 565,
+ "band": {
+ "1": {
+ "sr": 11025,
+ "hl": 108,
+ "n_fft": 2048,
+ "crop_start": 0,
+ "crop_stop": 374,
+ "lpf_start": 92,
+ "lpf_stop": 186,
+ "res_type": "polyphase"
+ },
+ "2": {
+ "sr": 22050,
+ "hl": 216,
+ "n_fft": 1536,
+ "crop_start": 0,
+ "crop_stop": 424,
+ "hpf_start": 68,
+ "hpf_stop": 34,
+ "lpf_start": 348,
+ "lpf_stop": 418,
+ "res_type": "polyphase"
+ },
+ "3": {
+ "sr": 44100,
+ "hl": 432,
+ "n_fft": 1280,
+ "crop_start": 132,
+ "crop_stop": 614,
+ "hpf_start": 172,
+ "hpf_stop": 144,
+ "res_type": "polyphase"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 1280,
+ "pre_filter_stop": 1280
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/nets.py b/tools/uvr5/lib/lib_v5/nets.py
new file mode 100644
index 0000000..5da3948
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/nets.py
@@ -0,0 +1,123 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import layers, spec_utils
+
+
+class BaseASPPNet(nn.Module):
+ def __init__(self, nin, ch, dilations=(4, 8, 16)):
+ super(BaseASPPNet, self).__init__()
+ self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
+ self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
+ self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
+ self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
+
+ self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
+
+ self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
+ self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
+ self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
+ self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
+
+ def __call__(self, x):
+ h, e1 = self.enc1(x)
+ h, e2 = self.enc2(h)
+ h, e3 = self.enc3(h)
+ h, e4 = self.enc4(h)
+
+ h = self.aspp(h)
+
+ h = self.dec4(h, e4)
+ h = self.dec3(h, e3)
+ h = self.dec2(h, e2)
+ h = self.dec1(h, e1)
+
+ return h
+
+
+class CascadedASPPNet(nn.Module):
+ def __init__(self, n_fft):
+ super(CascadedASPPNet, self).__init__()
+ self.stg1_low_band_net = BaseASPPNet(2, 16)
+ self.stg1_high_band_net = BaseASPPNet(2, 16)
+
+ self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0)
+ self.stg2_full_band_net = BaseASPPNet(8, 16)
+
+ self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
+ self.stg3_full_band_net = BaseASPPNet(16, 32)
+
+ self.out = nn.Conv2d(32, 2, 1, bias=False)
+ self.aux1_out = nn.Conv2d(16, 2, 1, bias=False)
+ self.aux2_out = nn.Conv2d(16, 2, 1, bias=False)
+
+ self.max_bin = n_fft // 2
+ self.output_bin = n_fft // 2 + 1
+
+ self.offset = 128
+
+ def forward(self, x, aggressiveness=None):
+ mix = x.detach()
+ x = x.clone()
+
+ x = x[:, :, : self.max_bin]
+
+ bandw = x.size()[2] // 2
+ aux1 = torch.cat(
+ [
+ self.stg1_low_band_net(x[:, :, :bandw]),
+ self.stg1_high_band_net(x[:, :, bandw:]),
+ ],
+ dim=2,
+ )
+
+ h = torch.cat([x, aux1], dim=1)
+ aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
+
+ h = torch.cat([x, aux1, aux2], dim=1)
+ h = self.stg3_full_band_net(self.stg3_bridge(h))
+
+ mask = torch.sigmoid(self.out(h))
+ mask = F.pad(
+ input=mask,
+ pad=(0, 0, 0, self.output_bin - mask.size()[2]),
+ mode="replicate",
+ )
+
+ if self.training:
+ aux1 = torch.sigmoid(self.aux1_out(aux1))
+ aux1 = F.pad(
+ input=aux1,
+ pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
+ mode="replicate",
+ )
+ aux2 = torch.sigmoid(self.aux2_out(aux2))
+ aux2 = F.pad(
+ input=aux2,
+ pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
+ mode="replicate",
+ )
+ return mask * mix, aux1 * mix, aux2 * mix
+ else:
+ if aggressiveness:
+ mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
+ mask[:, :, : aggressiveness["split_bin"]],
+ 1 + aggressiveness["value"] / 3,
+ )
+ mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
+ mask[:, :, aggressiveness["split_bin"] :],
+ 1 + aggressiveness["value"],
+ )
+
+ return mask * mix
+
+ def predict(self, x_mag, aggressiveness=None):
+ h = self.forward(x_mag, aggressiveness)
+
+ if self.offset > 0:
+ h = h[:, :, :, self.offset : -self.offset]
+ assert h.size()[3] > 0
+
+ return h
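
A standalone contract check for `CascadedASPPNet` (hypothetical sizes; the `aggressiveness` dict carries the two keys `forward` reads):

```python
import torch

from tools.uvr5.lib.lib_v5 import nets

model = nets.CascadedASPPNet(n_fft=2048)
model.eval()
mag = torch.rand(1, 2, 1025, 512)  # (N, 2 channels, n_fft // 2 + 1 bins, frames)
with torch.no_grad():
    out = model.predict(mag, aggressiveness={"value": 0.3, "split_bin": 85})
# predict() trims `offset` frames from each side of the time axis
assert out.shape == (1, 2, 1025, 512 - 2 * model.offset)
```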
diff --git a/tools/uvr5/lib/lib_v5/nets_123812KB.py b/tools/uvr5/lib/lib_v5/nets_123812KB.py
new file mode 100644
index 0000000..167d4cb
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/nets_123812KB.py
@@ -0,0 +1,122 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import layers_123812KB as layers
+
+
+class BaseASPPNet(nn.Module):
+ def __init__(self, nin, ch, dilations=(4, 8, 16)):
+ super(BaseASPPNet, self).__init__()
+ self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
+ self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
+ self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
+ self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
+
+ self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
+
+ self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
+ self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
+ self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
+ self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
+
+ def __call__(self, x):
+ h, e1 = self.enc1(x)
+ h, e2 = self.enc2(h)
+ h, e3 = self.enc3(h)
+ h, e4 = self.enc4(h)
+
+ h = self.aspp(h)
+
+ h = self.dec4(h, e4)
+ h = self.dec3(h, e3)
+ h = self.dec2(h, e2)
+ h = self.dec1(h, e1)
+
+ return h
+
+
+class CascadedASPPNet(nn.Module):
+ def __init__(self, n_fft):
+ super(CascadedASPPNet, self).__init__()
+ self.stg1_low_band_net = BaseASPPNet(2, 32)
+ self.stg1_high_band_net = BaseASPPNet(2, 32)
+
+ self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
+ self.stg2_full_band_net = BaseASPPNet(16, 32)
+
+ self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
+ self.stg3_full_band_net = BaseASPPNet(32, 64)
+
+ self.out = nn.Conv2d(64, 2, 1, bias=False)
+ self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
+ self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)
+
+ self.max_bin = n_fft // 2
+ self.output_bin = n_fft // 2 + 1
+
+ self.offset = 128
+
+ def forward(self, x, aggressiveness=None):
+ mix = x.detach()
+ x = x.clone()
+
+ x = x[:, :, : self.max_bin]
+
+ bandw = x.size()[2] // 2
+ aux1 = torch.cat(
+ [
+ self.stg1_low_band_net(x[:, :, :bandw]),
+ self.stg1_high_band_net(x[:, :, bandw:]),
+ ],
+ dim=2,
+ )
+
+ h = torch.cat([x, aux1], dim=1)
+ aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
+
+ h = torch.cat([x, aux1, aux2], dim=1)
+ h = self.stg3_full_band_net(self.stg3_bridge(h))
+
+ mask = torch.sigmoid(self.out(h))
+ mask = F.pad(
+ input=mask,
+ pad=(0, 0, 0, self.output_bin - mask.size()[2]),
+ mode="replicate",
+ )
+
+ if self.training:
+ aux1 = torch.sigmoid(self.aux1_out(aux1))
+ aux1 = F.pad(
+ input=aux1,
+ pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
+ mode="replicate",
+ )
+ aux2 = torch.sigmoid(self.aux2_out(aux2))
+ aux2 = F.pad(
+ input=aux2,
+ pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
+ mode="replicate",
+ )
+ return mask * mix, aux1 * mix, aux2 * mix
+ else:
+ if aggressiveness:
+ mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
+ mask[:, :, : aggressiveness["split_bin"]],
+ 1 + aggressiveness["value"] / 3,
+ )
+ mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
+ mask[:, :, aggressiveness["split_bin"] :],
+ 1 + aggressiveness["value"],
+ )
+
+ return mask * mix
+
+ def predict(self, x_mag, aggressiveness=None):
+ h = self.forward(x_mag, aggressiveness)
+
+ if self.offset > 0:
+ h = h[:, :, :, self.offset : -self.offset]
+ assert h.size()[3] > 0
+
+ return h
diff --git a/tools/uvr5/lib/lib_v5/nets_123821KB.py b/tools/uvr5/lib/lib_v5/nets_123821KB.py
new file mode 100644
index 0000000..167d4cb
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/nets_123821KB.py
@@ -0,0 +1,122 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import layers_123821KB as layers
+
+
+class BaseASPPNet(nn.Module):
+ def __init__(self, nin, ch, dilations=(4, 8, 16)):
+ super(BaseASPPNet, self).__init__()
+ self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
+ self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
+ self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
+ self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
+
+ self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
+
+ self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
+ self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
+ self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
+ self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
+
+ def __call__(self, x):
+ h, e1 = self.enc1(x)
+ h, e2 = self.enc2(h)
+ h, e3 = self.enc3(h)
+ h, e4 = self.enc4(h)
+
+ h = self.aspp(h)
+
+ h = self.dec4(h, e4)
+ h = self.dec3(h, e3)
+ h = self.dec2(h, e2)
+ h = self.dec1(h, e1)
+
+ return h
+
+
+class CascadedASPPNet(nn.Module):
+ def __init__(self, n_fft):
+ super(CascadedASPPNet, self).__init__()
+ self.stg1_low_band_net = BaseASPPNet(2, 32)
+ self.stg1_high_band_net = BaseASPPNet(2, 32)
+
+ self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
+ self.stg2_full_band_net = BaseASPPNet(16, 32)
+
+ self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
+ self.stg3_full_band_net = BaseASPPNet(32, 64)
+
+ self.out = nn.Conv2d(64, 2, 1, bias=False)
+ self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
+ self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)
+
+ self.max_bin = n_fft // 2
+ self.output_bin = n_fft // 2 + 1
+
+ self.offset = 128
+
+ def forward(self, x, aggressiveness=None):
+ mix = x.detach()
+ x = x.clone()
+
+ x = x[:, :, : self.max_bin]
+
+ bandw = x.size()[2] // 2
+ aux1 = torch.cat(
+ [
+ self.stg1_low_band_net(x[:, :, :bandw]),
+ self.stg1_high_band_net(x[:, :, bandw:]),
+ ],
+ dim=2,
+ )
+
+ h = torch.cat([x, aux1], dim=1)
+ aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
+
+ h = torch.cat([x, aux1, aux2], dim=1)
+ h = self.stg3_full_band_net(self.stg3_bridge(h))
+
+ mask = torch.sigmoid(self.out(h))
+ mask = F.pad(
+ input=mask,
+ pad=(0, 0, 0, self.output_bin - mask.size()[2]),
+ mode="replicate",
+ )
+
+ if self.training:
+ aux1 = torch.sigmoid(self.aux1_out(aux1))
+ aux1 = F.pad(
+ input=aux1,
+ pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
+ mode="replicate",
+ )
+ aux2 = torch.sigmoid(self.aux2_out(aux2))
+ aux2 = F.pad(
+ input=aux2,
+ pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
+ mode="replicate",
+ )
+ return mask * mix, aux1 * mix, aux2 * mix
+ else:
+ if aggressiveness:
+ mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
+ mask[:, :, : aggressiveness["split_bin"]],
+ 1 + aggressiveness["value"] / 3,
+ )
+ mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
+ mask[:, :, aggressiveness["split_bin"] :],
+ 1 + aggressiveness["value"],
+ )
+
+ return mask * mix
+
+ def predict(self, x_mag, aggressiveness=None):
+ h = self.forward(x_mag, aggressiveness)
+
+ if self.offset > 0:
+ h = h[:, :, :, self.offset : -self.offset]
+ assert h.size()[3] > 0
+
+ return h
diff --git a/tools/uvr5/lib/lib_v5/nets_33966KB.py b/tools/uvr5/lib/lib_v5/nets_33966KB.py
new file mode 100644
index 0000000..73a5b83
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/nets_33966KB.py
@@ -0,0 +1,122 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import layers_33966KB as layers
+
+
+class BaseASPPNet(nn.Module):
+ def __init__(self, nin, ch, dilations=(4, 8, 16, 32)):
+ super(BaseASPPNet, self).__init__()
+ self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
+ self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
+ self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
+ self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
+
+ self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
+
+ self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
+ self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
+ self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
+ self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
+
+ def __call__(self, x):
+ h, e1 = self.enc1(x)
+ h, e2 = self.enc2(h)
+ h, e3 = self.enc3(h)
+ h, e4 = self.enc4(h)
+
+ h = self.aspp(h)
+
+ h = self.dec4(h, e4)
+ h = self.dec3(h, e3)
+ h = self.dec2(h, e2)
+ h = self.dec1(h, e1)
+
+ return h
+
+
+class CascadedASPPNet(nn.Module):
+ def __init__(self, n_fft):
+ super(CascadedASPPNet, self).__init__()
+ self.stg1_low_band_net = BaseASPPNet(2, 16)
+ self.stg1_high_band_net = BaseASPPNet(2, 16)
+
+ self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0)
+ self.stg2_full_band_net = BaseASPPNet(8, 16)
+
+ self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
+ self.stg3_full_band_net = BaseASPPNet(16, 32)
+
+ self.out = nn.Conv2d(32, 2, 1, bias=False)
+ self.aux1_out = nn.Conv2d(16, 2, 1, bias=False)
+ self.aux2_out = nn.Conv2d(16, 2, 1, bias=False)
+
+ self.max_bin = n_fft // 2
+ self.output_bin = n_fft // 2 + 1
+
+ self.offset = 128
+
+ def forward(self, x, aggressiveness=None):
+ mix = x.detach()
+ x = x.clone()
+
+ x = x[:, :, : self.max_bin]
+
+ bandw = x.size()[2] // 2
+ aux1 = torch.cat(
+ [
+ self.stg1_low_band_net(x[:, :, :bandw]),
+ self.stg1_high_band_net(x[:, :, bandw:]),
+ ],
+ dim=2,
+ )
+
+ h = torch.cat([x, aux1], dim=1)
+ aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
+
+ h = torch.cat([x, aux1, aux2], dim=1)
+ h = self.stg3_full_band_net(self.stg3_bridge(h))
+
+ mask = torch.sigmoid(self.out(h))
+ mask = F.pad(
+ input=mask,
+ pad=(0, 0, 0, self.output_bin - mask.size()[2]),
+ mode="replicate",
+ )
+
+ if self.training:
+ aux1 = torch.sigmoid(self.aux1_out(aux1))
+ aux1 = F.pad(
+ input=aux1,
+ pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
+ mode="replicate",
+ )
+ aux2 = torch.sigmoid(self.aux2_out(aux2))
+ aux2 = F.pad(
+ input=aux2,
+ pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
+ mode="replicate",
+ )
+ return mask * mix, aux1 * mix, aux2 * mix
+ else:
+ if aggressiveness:
+ mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
+ mask[:, :, : aggressiveness["split_bin"]],
+ 1 + aggressiveness["value"] / 3,
+ )
+ mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
+ mask[:, :, aggressiveness["split_bin"] :],
+ 1 + aggressiveness["value"],
+ )
+
+ return mask * mix
+
+ def predict(self, x_mag, aggressiveness=None):
+ h = self.forward(x_mag, aggressiveness)
+
+ if self.offset > 0:
+ h = h[:, :, :, self.offset : -self.offset]
+ assert h.size()[3] > 0
+
+ return h
diff --git a/tools/uvr5/lib/lib_v5/nets_537227KB.py b/tools/uvr5/lib/lib_v5/nets_537227KB.py
new file mode 100644
index 0000000..823b44f
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/nets_537227KB.py
@@ -0,0 +1,123 @@
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import layers_537238KB as layers
+
+
+class BaseASPPNet(nn.Module):
+ def __init__(self, nin, ch, dilations=(4, 8, 16)):
+ super(BaseASPPNet, self).__init__()
+ self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
+ self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
+ self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
+ self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
+
+ self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
+
+ self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
+ self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
+ self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
+ self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
+
+ def __call__(self, x):
+ h, e1 = self.enc1(x)
+ h, e2 = self.enc2(h)
+ h, e3 = self.enc3(h)
+ h, e4 = self.enc4(h)
+
+ h = self.aspp(h)
+
+ h = self.dec4(h, e4)
+ h = self.dec3(h, e3)
+ h = self.dec2(h, e2)
+ h = self.dec1(h, e1)
+
+ return h
+
+
+class CascadedASPPNet(nn.Module):
+ def __init__(self, n_fft):
+ super(CascadedASPPNet, self).__init__()
+ self.stg1_low_band_net = BaseASPPNet(2, 64)
+ self.stg1_high_band_net = BaseASPPNet(2, 64)
+
+ self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
+ self.stg2_full_band_net = BaseASPPNet(32, 64)
+
+ self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0)
+ self.stg3_full_band_net = BaseASPPNet(64, 128)
+
+ self.out = nn.Conv2d(128, 2, 1, bias=False)
+ self.aux1_out = nn.Conv2d(64, 2, 1, bias=False)
+ self.aux2_out = nn.Conv2d(64, 2, 1, bias=False)
+
+ self.max_bin = n_fft // 2
+ self.output_bin = n_fft // 2 + 1
+
+ self.offset = 128
+
+ def forward(self, x, aggressiveness=None):
+ mix = x.detach()
+ x = x.clone()
+
+ x = x[:, :, : self.max_bin]
+
+ bandw = x.size()[2] // 2
+ aux1 = torch.cat(
+ [
+ self.stg1_low_band_net(x[:, :, :bandw]),
+ self.stg1_high_band_net(x[:, :, bandw:]),
+ ],
+ dim=2,
+ )
+
+ h = torch.cat([x, aux1], dim=1)
+ aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
+
+ h = torch.cat([x, aux1, aux2], dim=1)
+ h = self.stg3_full_band_net(self.stg3_bridge(h))
+
+ mask = torch.sigmoid(self.out(h))
+ mask = F.pad(
+ input=mask,
+ pad=(0, 0, 0, self.output_bin - mask.size()[2]),
+ mode="replicate",
+ )
+
+ if self.training:
+ aux1 = torch.sigmoid(self.aux1_out(aux1))
+ aux1 = F.pad(
+ input=aux1,
+ pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
+ mode="replicate",
+ )
+ aux2 = torch.sigmoid(self.aux2_out(aux2))
+ aux2 = F.pad(
+ input=aux2,
+ pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
+ mode="replicate",
+ )
+ return mask * mix, aux1 * mix, aux2 * mix
+ else:
+ if aggressiveness:
+ mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
+ mask[:, :, : aggressiveness["split_bin"]],
+ 1 + aggressiveness["value"] / 3,
+ )
+ mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
+ mask[:, :, aggressiveness["split_bin"] :],
+ 1 + aggressiveness["value"],
+ )
+
+ return mask * mix
+
+ def predict(self, x_mag, aggressiveness=None):
+ h = self.forward(x_mag, aggressiveness)
+
+ if self.offset > 0:
+ h = h[:, :, :, self.offset : -self.offset]
+ assert h.size()[3] > 0
+
+ return h
diff --git a/tools/uvr5/lib/lib_v5/nets_537238KB.py b/tools/uvr5/lib/lib_v5/nets_537238KB.py
new file mode 100644
index 0000000..823b44f
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/nets_537238KB.py
@@ -0,0 +1,123 @@
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import layers_537238KB as layers
+
+
+class BaseASPPNet(nn.Module):
+ def __init__(self, nin, ch, dilations=(4, 8, 16)):
+ super(BaseASPPNet, self).__init__()
+ self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
+ self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
+ self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
+ self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
+
+ self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
+
+ self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
+ self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
+ self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
+ self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
+
+ def __call__(self, x):
+ h, e1 = self.enc1(x)
+ h, e2 = self.enc2(h)
+ h, e3 = self.enc3(h)
+ h, e4 = self.enc4(h)
+
+ h = self.aspp(h)
+
+ h = self.dec4(h, e4)
+ h = self.dec3(h, e3)
+ h = self.dec2(h, e2)
+ h = self.dec1(h, e1)
+
+ return h
+
+
+class CascadedASPPNet(nn.Module):
+ def __init__(self, n_fft):
+ super(CascadedASPPNet, self).__init__()
+ self.stg1_low_band_net = BaseASPPNet(2, 64)
+ self.stg1_high_band_net = BaseASPPNet(2, 64)
+
+ self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
+ self.stg2_full_band_net = BaseASPPNet(32, 64)
+
+ self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0)
+ self.stg3_full_band_net = BaseASPPNet(64, 128)
+
+ self.out = nn.Conv2d(128, 2, 1, bias=False)
+ self.aux1_out = nn.Conv2d(64, 2, 1, bias=False)
+ self.aux2_out = nn.Conv2d(64, 2, 1, bias=False)
+
+ self.max_bin = n_fft // 2
+ self.output_bin = n_fft // 2 + 1
+
+ self.offset = 128
+
+ def forward(self, x, aggressiveness=None):
+ mix = x.detach()
+ x = x.clone()
+
+ x = x[:, :, : self.max_bin]
+
+ bandw = x.size()[2] // 2
+ aux1 = torch.cat(
+ [
+ self.stg1_low_band_net(x[:, :, :bandw]),
+ self.stg1_high_band_net(x[:, :, bandw:]),
+ ],
+ dim=2,
+ )
+
+ h = torch.cat([x, aux1], dim=1)
+ aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
+
+ h = torch.cat([x, aux1, aux2], dim=1)
+ h = self.stg3_full_band_net(self.stg3_bridge(h))
+
+ mask = torch.sigmoid(self.out(h))
+ mask = F.pad(
+ input=mask,
+ pad=(0, 0, 0, self.output_bin - mask.size()[2]),
+ mode="replicate",
+ )
+
+ if self.training:
+ aux1 = torch.sigmoid(self.aux1_out(aux1))
+ aux1 = F.pad(
+ input=aux1,
+ pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
+ mode="replicate",
+ )
+ aux2 = torch.sigmoid(self.aux2_out(aux2))
+ aux2 = F.pad(
+ input=aux2,
+ pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
+ mode="replicate",
+ )
+ return mask * mix, aux1 * mix, aux2 * mix
+ else:
+ if aggressiveness:
+ mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
+ mask[:, :, : aggressiveness["split_bin"]],
+ 1 + aggressiveness["value"] / 3,
+ )
+ mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
+ mask[:, :, aggressiveness["split_bin"] :],
+ 1 + aggressiveness["value"],
+ )
+
+ return mask * mix
+
+ def predict(self, x_mag, aggressiveness=None):
+ h = self.forward(x_mag, aggressiveness)
+
+ if self.offset > 0:
+ h = h[:, :, :, self.offset : -self.offset]
+ assert h.size()[3] > 0
+
+ return h
diff --git a/tools/uvr5/lib/lib_v5/nets_61968KB.py b/tools/uvr5/lib/lib_v5/nets_61968KB.py
new file mode 100644
index 0000000..167d4cb
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/nets_61968KB.py
@@ -0,0 +1,122 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import layers_123821KB as layers
+
+
+class BaseASPPNet(nn.Module):
+ def __init__(self, nin, ch, dilations=(4, 8, 16)):
+ super(BaseASPPNet, self).__init__()
+ self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
+ self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
+ self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
+ self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
+
+ self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
+
+ self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
+ self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
+ self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
+ self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
+
+ def __call__(self, x):
+ h, e1 = self.enc1(x)
+ h, e2 = self.enc2(h)
+ h, e3 = self.enc3(h)
+ h, e4 = self.enc4(h)
+
+ h = self.aspp(h)
+
+ h = self.dec4(h, e4)
+ h = self.dec3(h, e3)
+ h = self.dec2(h, e2)
+ h = self.dec1(h, e1)
+
+ return h
+
+
+class CascadedASPPNet(nn.Module):
+ def __init__(self, n_fft):
+ super(CascadedASPPNet, self).__init__()
+ self.stg1_low_band_net = BaseASPPNet(2, 32)
+ self.stg1_high_band_net = BaseASPPNet(2, 32)
+
+ self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
+ self.stg2_full_band_net = BaseASPPNet(16, 32)
+
+ self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
+ self.stg3_full_band_net = BaseASPPNet(32, 64)
+
+ self.out = nn.Conv2d(64, 2, 1, bias=False)
+ self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
+ self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)
+
+ self.max_bin = n_fft // 2
+ self.output_bin = n_fft // 2 + 1
+
+ self.offset = 128
+
+ def forward(self, x, aggressiveness=None):
+ mix = x.detach()
+ x = x.clone()
+
+ x = x[:, :, : self.max_bin]
+
+ bandw = x.size()[2] // 2
+ aux1 = torch.cat(
+ [
+ self.stg1_low_band_net(x[:, :, :bandw]),
+ self.stg1_high_band_net(x[:, :, bandw:]),
+ ],
+ dim=2,
+ )
+
+ h = torch.cat([x, aux1], dim=1)
+ aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
+
+ h = torch.cat([x, aux1, aux2], dim=1)
+ h = self.stg3_full_band_net(self.stg3_bridge(h))
+
+ mask = torch.sigmoid(self.out(h))
+ mask = F.pad(
+ input=mask,
+ pad=(0, 0, 0, self.output_bin - mask.size()[2]),
+ mode="replicate",
+ )
+
+ if self.training:
+ aux1 = torch.sigmoid(self.aux1_out(aux1))
+ aux1 = F.pad(
+ input=aux1,
+ pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
+ mode="replicate",
+ )
+ aux2 = torch.sigmoid(self.aux2_out(aux2))
+ aux2 = F.pad(
+ input=aux2,
+ pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
+ mode="replicate",
+ )
+ return mask * mix, aux1 * mix, aux2 * mix
+ else:
+ if aggressiveness:
+ mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
+ mask[:, :, : aggressiveness["split_bin"]],
+ 1 + aggressiveness["value"] / 3,
+ )
+ mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
+ mask[:, :, aggressiveness["split_bin"] :],
+ 1 + aggressiveness["value"],
+ )
+
+ return mask * mix
+
+ def predict(self, x_mag, aggressiveness=None):
+ h = self.forward(x_mag, aggressiveness)
+
+ if self.offset > 0:
+ h = h[:, :, :, self.offset : -self.offset]
+ assert h.size()[3] > 0
+
+ return h
diff --git a/tools/uvr5/lib/lib_v5/nets_new.py b/tools/uvr5/lib/lib_v5/nets_new.py
new file mode 100644
index 0000000..1c0f4fa
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/nets_new.py
@@ -0,0 +1,133 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import layers_new
+
+
+class BaseNet(nn.Module):
+ def __init__(
+ self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6))
+ ):
+ super(BaseNet, self).__init__()
+ self.enc1 = layers_new.Conv2DBNActiv(nin, nout, 3, 1, 1)
+ self.enc2 = layers_new.Encoder(nout, nout * 2, 3, 2, 1)
+ self.enc3 = layers_new.Encoder(nout * 2, nout * 4, 3, 2, 1)
+ self.enc4 = layers_new.Encoder(nout * 4, nout * 6, 3, 2, 1)
+ self.enc5 = layers_new.Encoder(nout * 6, nout * 8, 3, 2, 1)
+
+ self.aspp = layers_new.ASPPModule(nout * 8, nout * 8, dilations, dropout=True)
+
+ self.dec4 = layers_new.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1)
+ self.dec3 = layers_new.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1)
+ self.dec2 = layers_new.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1)
+ self.lstm_dec2 = layers_new.LSTMModule(nout * 2, nin_lstm, nout_lstm)
+ self.dec1 = layers_new.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1)
+
+ def __call__(self, x):
+ e1 = self.enc1(x)
+ e2 = self.enc2(e1)
+ e3 = self.enc3(e2)
+ e4 = self.enc4(e3)
+ e5 = self.enc5(e4)
+
+ h = self.aspp(e5)
+
+ h = self.dec4(h, e4)
+ h = self.dec3(h, e3)
+ h = self.dec2(h, e2)
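+        # The LSTM branch contributes one extra channel, hence the "+ 1" on
+        # dec1's input width in __init__.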
+ h = torch.cat([h, self.lstm_dec2(h)], dim=1)
+ h = self.dec1(h, e1)
+
+ return h
+
+
+class CascadedNet(nn.Module):
+ def __init__(self, n_fft, nout=32, nout_lstm=128):
+ super(CascadedNet, self).__init__()
+
+ self.max_bin = n_fft // 2
+ self.output_bin = n_fft // 2 + 1
+ self.nin_lstm = self.max_bin // 2
+ self.offset = 64
+
+ self.stg1_low_band_net = nn.Sequential(
+ BaseNet(2, nout // 2, self.nin_lstm // 2, nout_lstm),
+ layers_new.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0),
+ )
+
+ self.stg1_high_band_net = BaseNet(
+ 2, nout // 4, self.nin_lstm // 2, nout_lstm // 2
+ )
+
+ self.stg2_low_band_net = nn.Sequential(
+ BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm),
+ layers_new.Conv2DBNActiv(nout, nout // 2, 1, 1, 0),
+ )
+ self.stg2_high_band_net = BaseNet(
+ nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2
+ )
+
+ self.stg3_full_band_net = BaseNet(
+ 3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm
+ )
+
+ self.out = nn.Conv2d(nout, 2, 1, bias=False)
+ self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False)
+
+ def forward(self, x):
+ x = x[:, :, : self.max_bin]
+
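+        # Three-stage cascade: stages 1 and 2 handle the low and high bands
+        # separately (stage 2 sees the band input plus the stage-1 estimate),
+        # and stage 3 fuses everything at full bandwidth.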
+ bandw = x.size()[2] // 2
+ l1_in = x[:, :, :bandw]
+ h1_in = x[:, :, bandw:]
+ l1 = self.stg1_low_band_net(l1_in)
+ h1 = self.stg1_high_band_net(h1_in)
+ aux1 = torch.cat([l1, h1], dim=2)
+
+ l2_in = torch.cat([l1_in, l1], dim=1)
+ h2_in = torch.cat([h1_in, h1], dim=1)
+ l2 = self.stg2_low_band_net(l2_in)
+ h2 = self.stg2_high_band_net(h2_in)
+ aux2 = torch.cat([l2, h2], dim=2)
+
+ f3_in = torch.cat([x, aux1, aux2], dim=1)
+ f3 = self.stg3_full_band_net(f3_in)
+
+ mask = torch.sigmoid(self.out(f3))
+ mask = F.pad(
+ input=mask,
+ pad=(0, 0, 0, self.output_bin - mask.size()[2]),
+ mode="replicate",
+ )
+
+ if self.training:
+ aux = torch.cat([aux1, aux2], dim=1)
+ aux = torch.sigmoid(self.aux_out(aux))
+ aux = F.pad(
+ input=aux,
+ pad=(0, 0, 0, self.output_bin - aux.size()[2]),
+ mode="replicate",
+ )
+ return mask, aux
+ else:
+ return mask
+
+ def predict_mask(self, x):
+ mask = self.forward(x)
+
+ if self.offset > 0:
+ mask = mask[:, :, :, self.offset : -self.offset]
+ assert mask.size()[3] > 0
+
+ return mask
+
+ def predict(self, x, aggressiveness=None):
+ mask = self.forward(x)
+ pred_mag = x * mask
+
+ if self.offset > 0:
+ pred_mag = pred_mag[:, :, :, self.offset : -self.offset]
+ assert pred_mag.size()[3] > 0
+
+ return pred_mag
diff --git a/tools/uvr5/lib/lib_v5/spec_utils.py b/tools/uvr5/lib/lib_v5/spec_utils.py
new file mode 100644
index 0000000..a9634fd
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/spec_utils.py
@@ -0,0 +1,672 @@
+import hashlib
+import json
+import math
+import os
+
+import librosa
+import numpy as np
+import soundfile as sf
+from tqdm import tqdm
+
+
+def crop_center(h1, h2):
+ h1_shape = h1.size()
+ h2_shape = h2.size()
+
+ if h1_shape[3] == h2_shape[3]:
+ return h1
+ elif h1_shape[3] < h2_shape[3]:
+ raise ValueError("h1_shape[3] must be greater than h2_shape[3]")
+
+ # s_freq = (h2_shape[2] - h1_shape[2]) // 2
+ # e_freq = s_freq + h1_shape[2]
+ s_time = (h1_shape[3] - h2_shape[3]) // 2
+ e_time = s_time + h2_shape[3]
+ h1 = h1[:, :, :, s_time:e_time]
+
+ return h1
+
+
+def wave_to_spectrogram(
+ wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False
+):
+ if reverse:
+ wave_left = np.flip(np.asfortranarray(wave[0]))
+ wave_right = np.flip(np.asfortranarray(wave[1]))
+ elif mid_side:
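+        # Mid/side encoding: the "left" slot carries the mid (L + R) / 2 and
+        # the "right" slot the side L - R.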
+ wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
+ wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
+ elif mid_side_b2:
+ wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5))
+ wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5))
+ else:
+ wave_left = np.asfortranarray(wave[0])
+ wave_right = np.asfortranarray(wave[1])
+
+ spec_left = librosa.stft(wave_left, n_fft, hop_length=hop_length)
+ spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length)
+
+ spec = np.asfortranarray([spec_left, spec_right])
+
+ return spec
+
+
+def wave_to_spectrogram_mt(
+ wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False
+):
+ import threading
+
+ if reverse:
+ wave_left = np.flip(np.asfortranarray(wave[0]))
+ wave_right = np.flip(np.asfortranarray(wave[1]))
+ elif mid_side:
+ wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
+ wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
+ elif mid_side_b2:
+ wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5))
+ wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5))
+ else:
+ wave_left = np.asfortranarray(wave[0])
+ wave_right = np.asfortranarray(wave[1])
+
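+    # The left-channel STFT runs on a worker thread (result handed back via a
+    # module-level global) while the main thread transforms the right channel.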
+ def run_thread(**kwargs):
+ global spec_left
+ spec_left = librosa.stft(**kwargs)
+
+ thread = threading.Thread(
+ target=run_thread,
+ kwargs={"y": wave_left, "n_fft": n_fft, "hop_length": hop_length},
+ )
+ thread.start()
+ spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length)
+ thread.join()
+
+ spec = np.asfortranarray([spec_left, spec_right])
+
+ return spec
+
+
+def combine_spectrograms(specs, mp):
+ l = min([specs[i].shape[2] for i in specs])
+ spec_c = np.zeros(shape=(2, mp.param["bins"] + 1, l), dtype=np.complex64)
+ offset = 0
+ bands_n = len(mp.param["band"])
+
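+    # Stack each band's cropped bin range on top of the previous bands to
+    # rebuild one full-range spectrogram.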
+ for d in range(1, bands_n + 1):
+ h = mp.param["band"][d]["crop_stop"] - mp.param["band"][d]["crop_start"]
+ spec_c[:, offset : offset + h, :l] = specs[d][
+ :, mp.param["band"][d]["crop_start"] : mp.param["band"][d]["crop_stop"], :l
+ ]
+ offset += h
+
+ if offset > mp.param["bins"]:
+ raise ValueError("Too much bins")
+
+ # lowpass fiter
+ if (
+ mp.param["pre_filter_start"] > 0
+ ): # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']:
+ if bands_n == 1:
+ spec_c = fft_lp_filter(
+ spec_c, mp.param["pre_filter_start"], mp.param["pre_filter_stop"]
+ )
+ else:
+ gp = 1
+ for b in range(
+ mp.param["pre_filter_start"] + 1, mp.param["pre_filter_stop"]
+ ):
+ g = math.pow(
+ 10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0
+ )
+ gp = g
+ spec_c[:, b, :] *= g
+
+ return np.asfortranarray(spec_c)
+
+
+def spectrogram_to_image(spec, mode="magnitude"):
+ if mode == "magnitude":
+ if np.iscomplexobj(spec):
+ y = np.abs(spec)
+ else:
+ y = spec
+ y = np.log10(y**2 + 1e-8)
+ elif mode == "phase":
+ if np.iscomplexobj(spec):
+ y = np.angle(spec)
+ else:
+ y = spec
+
+ y -= y.min()
+ y *= 255 / y.max()
+ img = np.uint8(y)
+
+ if y.ndim == 3:
+ img = img.transpose(1, 2, 0)
+ img = np.concatenate([np.max(img, axis=2, keepdims=True), img], axis=2)
+
+ return img
+
+
+def reduce_vocal_aggressively(X, y, softmask):
+ v = X - y
+ y_mag_tmp = np.abs(y)
+ v_mag_tmp = np.abs(v)
+
+ v_mask = v_mag_tmp > y_mag_tmp
+ y_mag = np.clip(y_mag_tmp - v_mag_tmp * v_mask * softmask, 0, np.inf)
+
+ return y_mag * np.exp(1.0j * np.angle(y))
+
+
+def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32):
+ if min_range < fade_size * 2:
+ raise ValueError("min_range must be >= fade_area * 2")
+
+ mag = mag.copy()
+
+ idx = np.where(ref.mean(axis=(0, 1)) < thres)[0]
+ starts = np.insert(idx[np.where(np.diff(idx) != 1)[0] + 1], 0, idx[0])
+ ends = np.append(idx[np.where(np.diff(idx) != 1)[0]], idx[-1])
+ uninformative = np.where(ends - starts > min_range)[0]
+ if len(uninformative) > 0:
+ starts = starts[uninformative]
+ ends = ends[uninformative]
+ old_e = None
+ for s, e in zip(starts, ends):
+ if old_e is not None and s - old_e < fade_size:
+ s = old_e - fade_size * 2
+
+ if s != 0:
+ weight = np.linspace(0, 1, fade_size)
+ mag[:, :, s : s + fade_size] += weight * ref[:, :, s : s + fade_size]
+ else:
+ s -= fade_size
+
+ if e != mag.shape[2]:
+ weight = np.linspace(1, 0, fade_size)
+ mag[:, :, e - fade_size : e] += weight * ref[:, :, e - fade_size : e]
+ else:
+ e += fade_size
+
+ mag[:, :, s + fade_size : e - fade_size] += ref[
+ :, :, s + fade_size : e - fade_size
+ ]
+ old_e = e
+
+ return mag
+
+
+def align_wave_head_and_tail(a, b):
+ l = min([a[0].size, b[0].size])
+
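+    # The leading :l is effectively a no-op for (2, n) stereo arrays; the
+    # trailing :l trims both signals to the shorter sample count.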
+ return a[:l, :l], b[:l, :l]
+
+
+def cache_or_load(mix_path, inst_path, mp):
+ mix_basename = os.path.splitext(os.path.basename(mix_path))[0]
+ inst_basename = os.path.splitext(os.path.basename(inst_path))[0]
+
+ cache_dir = "mph{}".format(
+ hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode("utf-8")).hexdigest()
+ )
+ mix_cache_dir = os.path.join("cache", cache_dir)
+ inst_cache_dir = os.path.join("cache", cache_dir)
+
+ os.makedirs(mix_cache_dir, exist_ok=True)
+ os.makedirs(inst_cache_dir, exist_ok=True)
+
+ mix_cache_path = os.path.join(mix_cache_dir, mix_basename + ".npy")
+ inst_cache_path = os.path.join(inst_cache_dir, inst_basename + ".npy")
+
+ if os.path.exists(mix_cache_path) and os.path.exists(inst_cache_path):
+ X_spec_m = np.load(mix_cache_path)
+ y_spec_m = np.load(inst_cache_path)
+ else:
+ X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
+
+ for d in range(len(mp.param["band"]), 0, -1):
+ bp = mp.param["band"][d]
+
+ if d == len(mp.param["band"]): # high-end band
+ X_wave[d], _ = librosa.load(
+ mix_path, bp["sr"], False, dtype=np.float32, res_type=bp["res_type"]
+ )
+ y_wave[d], _ = librosa.load(
+ inst_path,
+ bp["sr"],
+ False,
+ dtype=np.float32,
+ res_type=bp["res_type"],
+ )
+ else: # lower bands
+ X_wave[d] = librosa.resample(
+ X_wave[d + 1],
+ mp.param["band"][d + 1]["sr"],
+ bp["sr"],
+ res_type=bp["res_type"],
+ )
+ y_wave[d] = librosa.resample(
+ y_wave[d + 1],
+ mp.param["band"][d + 1]["sr"],
+ bp["sr"],
+ res_type=bp["res_type"],
+ )
+
+ X_wave[d], y_wave[d] = align_wave_head_and_tail(X_wave[d], y_wave[d])
+
+ X_spec_s[d] = wave_to_spectrogram(
+ X_wave[d],
+ bp["hl"],
+ bp["n_fft"],
+ mp.param["mid_side"],
+ mp.param["mid_side_b2"],
+ mp.param["reverse"],
+ )
+ y_spec_s[d] = wave_to_spectrogram(
+ y_wave[d],
+ bp["hl"],
+ bp["n_fft"],
+ mp.param["mid_side"],
+ mp.param["mid_side_b2"],
+ mp.param["reverse"],
+ )
+
+ del X_wave, y_wave
+
+ X_spec_m = combine_spectrograms(X_spec_s, mp)
+ y_spec_m = combine_spectrograms(y_spec_s, mp)
+
+ if X_spec_m.shape != y_spec_m.shape:
+ raise ValueError("The combined spectrograms are different: " + mix_path)
+
+ _, ext = os.path.splitext(mix_path)
+
+ np.save(mix_cache_path, X_spec_m)
+ np.save(inst_cache_path, y_spec_m)
+
+ return X_spec_m, y_spec_m
+
+
+def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse):
+ spec_left = np.asfortranarray(spec[0])
+ spec_right = np.asfortranarray(spec[1])
+
+ wave_left = librosa.istft(spec_left, hop_length=hop_length)
+ wave_right = librosa.istft(spec_right, hop_length=hop_length)
+
+ if reverse:
+ return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
+ elif mid_side:
+ return np.asfortranarray(
+ [np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]
+ )
+ elif mid_side_b2:
+ return np.asfortranarray(
+ [
+ np.add(wave_right / 1.25, 0.4 * wave_left),
+ np.subtract(wave_left / 1.25, 0.4 * wave_right),
+ ]
+ )
+ else:
+ return np.asfortranarray([wave_left, wave_right])
+
+
+def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2):
+ import threading
+
+ spec_left = np.asfortranarray(spec[0])
+ spec_right = np.asfortranarray(spec[1])
+
+ def run_thread(**kwargs):
+ global wave_left
+ wave_left = librosa.istft(**kwargs)
+
+ thread = threading.Thread(
+ target=run_thread, kwargs={"stft_matrix": spec_left, "hop_length": hop_length}
+ )
+ thread.start()
+ wave_right = librosa.istft(spec_right, hop_length=hop_length)
+ thread.join()
+
+ if reverse:
+ return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
+ elif mid_side:
+ return np.asfortranarray(
+ [np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]
+ )
+ elif mid_side_b2:
+ return np.asfortranarray(
+ [
+ np.add(wave_right / 1.25, 0.4 * wave_left),
+ np.subtract(wave_left / 1.25, 0.4 * wave_right),
+ ]
+ )
+ else:
+ return np.asfortranarray([wave_left, wave_right])
+
+
+def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None):
+ wave_band = {}
+ bands_n = len(mp.param["band"])
+ offset = 0
+
+ for d in range(1, bands_n + 1):
+ bp = mp.param["band"][d]
+ spec_s = np.ndarray(
+ shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex
+ )
+ h = bp["crop_stop"] - bp["crop_start"]
+ spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[
+ :, offset : offset + h, :
+ ]
+
+ offset += h
+ if d == bands_n: # higher
+ if extra_bins_h: # if --high_end_process bypass
+ max_bin = bp["n_fft"] // 2
+ spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[
+ :, :extra_bins_h, :
+ ]
+ if bp["hpf_start"] > 0:
+ spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)
+ if bands_n == 1:
+ wave = spectrogram_to_wave(
+ spec_s,
+ bp["hl"],
+ mp.param["mid_side"],
+ mp.param["mid_side_b2"],
+ mp.param["reverse"],
+ )
+ else:
+ wave = np.add(
+ wave,
+ spectrogram_to_wave(
+ spec_s,
+ bp["hl"],
+ mp.param["mid_side"],
+ mp.param["mid_side_b2"],
+ mp.param["reverse"],
+ ),
+ )
+ else:
+ sr = mp.param["band"][d + 1]["sr"]
+ if d == 1: # lower
+ spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"])
+ wave = librosa.resample(
+ spectrogram_to_wave(
+ spec_s,
+ bp["hl"],
+ mp.param["mid_side"],
+ mp.param["mid_side_b2"],
+ mp.param["reverse"],
+ ),
+ bp["sr"],
+ sr,
+ res_type="sinc_fastest",
+ )
+ else: # mid
+ spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)
+ spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"])
+ wave2 = np.add(
+ wave,
+ spectrogram_to_wave(
+ spec_s,
+ bp["hl"],
+ mp.param["mid_side"],
+ mp.param["mid_side_b2"],
+ mp.param["reverse"],
+ ),
+ )
+ # wave = librosa.core.resample(wave2, bp['sr'], sr, res_type="sinc_fastest")
+ wave = librosa.core.resample(wave2, bp["sr"], sr, res_type="scipy")
+
+ return wave.T
+
+
+def fft_lp_filter(spec, bin_start, bin_stop):
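+    # Linear fade from gain 1 at bin_start down to 0 approaching bin_stop,
+    # then hard zeroing of everything at or above bin_stop.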
+ g = 1.0
+ for b in range(bin_start, bin_stop):
+ g -= 1 / (bin_stop - bin_start)
+ spec[:, b, :] = g * spec[:, b, :]
+
+ spec[:, bin_stop:, :] *= 0
+
+ return spec
+
+
+def fft_hp_filter(spec, bin_start, bin_stop):
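+    # Complement of fft_lp_filter: the gain ramps down as the bin index falls
+    # from bin_start toward bin_stop, and bins at or below bin_stop are zeroed.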
+ g = 1.0
+ for b in range(bin_start, bin_stop, -1):
+ g -= 1 / (bin_start - bin_stop)
+ spec[:, b, :] = g * spec[:, b, :]
+
+ spec[:, 0 : bin_stop + 1, :] *= 0
+
+ return spec
+
+
+def mirroring(a, spec_m, input_high_end, mp):
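+    # Reconstruct the missing high end by frequency-flipping the magnitudes
+    # just below the pre-filter cutoff and reusing the input high end's phase;
+    # the quieter of the mirrored and original bins wins.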
+ if "mirroring" == a:
+ mirror = np.flip(
+ np.abs(
+ spec_m[
+ :,
+ mp.param["pre_filter_start"]
+ - 10
+ - input_high_end.shape[1] : mp.param["pre_filter_start"]
+ - 10,
+ :,
+ ]
+ ),
+ 1,
+ )
+ mirror = mirror * np.exp(1.0j * np.angle(input_high_end))
+
+ return np.where(
+ np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror
+ )
+
+ if "mirroring2" == a:
+ mirror = np.flip(
+ np.abs(
+ spec_m[
+ :,
+ mp.param["pre_filter_start"]
+ - 10
+ - input_high_end.shape[1] : mp.param["pre_filter_start"]
+ - 10,
+ :,
+ ]
+ ),
+ 1,
+ )
+ mi = np.multiply(mirror, input_high_end * 1.7)
+
+ return np.where(np.abs(input_high_end) <= np.abs(mi), input_high_end, mi)
+
+
+def ensembling(a, specs):
+ for i in range(1, len(specs)):
+ if i == 1:
+ spec = specs[0]
+
+ ln = min([spec.shape[2], specs[i].shape[2]])
+ spec = spec[:, :, :ln]
+ specs[i] = specs[i][:, :, :ln]
+
+ if "min_mag" == a:
+ spec = np.where(np.abs(specs[i]) <= np.abs(spec), specs[i], spec)
+ if "max_mag" == a:
+ spec = np.where(np.abs(specs[i]) >= np.abs(spec), specs[i], spec)
+
+ return spec
+
+
+def stft(wave, nfft, hl):
+ wave_left = np.asfortranarray(wave[0])
+ wave_right = np.asfortranarray(wave[1])
+ spec_left = librosa.stft(wave_left, nfft, hop_length=hl)
+ spec_right = librosa.stft(wave_right, nfft, hop_length=hl)
+ spec = np.asfortranarray([spec_left, spec_right])
+
+ return spec
+
+
+def istft(spec, hl):
+ spec_left = np.asfortranarray(spec[0])
+ spec_right = np.asfortranarray(spec[1])
+
+ wave_left = librosa.istft(spec_left, hop_length=hl)
+ wave_right = librosa.istft(spec_right, hop_length=hl)
+    wave = np.asfortranarray([wave_left, wave_right])
+
+    return wave
+
+
+if __name__ == "__main__":
+ import argparse
+ import sys
+ import time
+
+ import cv2
+ from model_param_init import ModelParameters
+
+ p = argparse.ArgumentParser()
+ p.add_argument(
+ "--algorithm",
+ "-a",
+ type=str,
+ choices=["invert", "invert_p", "min_mag", "max_mag", "deep", "align"],
+ default="min_mag",
+ )
+ p.add_argument(
+ "--model_params",
+ "-m",
+ type=str,
+ default=os.path.join("modelparams", "1band_sr44100_hl512.json"),
+ )
+ p.add_argument("--output_name", "-o", type=str, default="output")
+ p.add_argument("--vocals_only", "-v", action="store_true")
+ p.add_argument("input", nargs="+")
+ args = p.parse_args()
+
+ start_time = time.time()
+
+ if args.algorithm.startswith("invert") and len(args.input) != 2:
+ raise ValueError("There should be two input files.")
+
+ if not args.algorithm.startswith("invert") and len(args.input) < 2:
+ raise ValueError("There must be at least two input files.")
+
+ wave, specs = {}, {}
+ mp = ModelParameters(args.model_params)
+
+ for i in range(len(args.input)):
+ spec = {}
+
+ for d in range(len(mp.param["band"]), 0, -1):
+ bp = mp.param["band"][d]
+
+ if d == len(mp.param["band"]): # high-end band
+ wave[d], _ = librosa.load(
+ args.input[i],
+ bp["sr"],
+ False,
+ dtype=np.float32,
+ res_type=bp["res_type"],
+ )
+
+ if len(wave[d].shape) == 1: # mono to stereo
+ wave[d] = np.array([wave[d], wave[d]])
+ else: # lower bands
+ wave[d] = librosa.resample(
+ wave[d + 1],
+ mp.param["band"][d + 1]["sr"],
+ bp["sr"],
+ res_type=bp["res_type"],
+ )
+
+ spec[d] = wave_to_spectrogram(
+ wave[d],
+ bp["hl"],
+ bp["n_fft"],
+ mp.param["mid_side"],
+ mp.param["mid_side_b2"],
+ mp.param["reverse"],
+ )
+
+ specs[i] = combine_spectrograms(spec, mp)
+
+ del wave
+
+ if args.algorithm == "deep":
+        d_spec = np.where(np.abs(specs[0]) <= np.abs(specs[1]), specs[0], specs[1])
+ v_spec = d_spec - specs[1]
+ sf.write(
+ os.path.join("{}.wav".format(args.output_name)),
+ cmb_spectrogram_to_wave(v_spec, mp),
+ mp.param["sr"],
+ )
+
+ if args.algorithm.startswith("invert"):
+ ln = min([specs[0].shape[2], specs[1].shape[2]])
+ specs[0] = specs[0][:, :, :ln]
+ specs[1] = specs[1][:, :, :ln]
+
+ if "invert_p" == args.algorithm:
+ X_mag = np.abs(specs[0])
+ y_mag = np.abs(specs[1])
+ max_mag = np.where(X_mag >= y_mag, X_mag, y_mag)
+ v_spec = specs[1] - max_mag * np.exp(1.0j * np.angle(specs[0]))
+ else:
+ specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2)
+ v_spec = specs[0] - specs[1]
+
+ if not args.vocals_only:
+ X_mag = np.abs(specs[0])
+ y_mag = np.abs(specs[1])
+ v_mag = np.abs(v_spec)
+
+ X_image = spectrogram_to_image(X_mag)
+ y_image = spectrogram_to_image(y_mag)
+ v_image = spectrogram_to_image(v_mag)
+
+ cv2.imwrite("{}_X.png".format(args.output_name), X_image)
+ cv2.imwrite("{}_y.png".format(args.output_name), y_image)
+ cv2.imwrite("{}_v.png".format(args.output_name), v_image)
+
+ sf.write(
+ "{}_X.wav".format(args.output_name),
+ cmb_spectrogram_to_wave(specs[0], mp),
+ mp.param["sr"],
+ )
+ sf.write(
+ "{}_y.wav".format(args.output_name),
+ cmb_spectrogram_to_wave(specs[1], mp),
+ mp.param["sr"],
+ )
+
+ sf.write(
+ "{}_v.wav".format(args.output_name),
+ cmb_spectrogram_to_wave(v_spec, mp),
+ mp.param["sr"],
+ )
+ else:
+ if not args.algorithm == "deep":
+ sf.write(
+ os.path.join("ensembled", "{}.wav".format(args.output_name)),
+ cmb_spectrogram_to_wave(ensembling(args.algorithm, specs), mp),
+ mp.param["sr"],
+ )
+
+ if args.algorithm == "align":
+ trackalignment = [
+ {
+ "file1": '"{}"'.format(args.input[0]),
+ "file2": '"{}"'.format(args.input[1]),
+ }
+ ]
+
+ for i, e in tqdm(enumerate(trackalignment), desc="Performing Alignment..."):
+ os.system(f"python lib/align_tracks.py {e['file1']} {e['file2']}")
+
+ # print('Total time: {0:.{1}f}s'.format(time.time() - start_time, 1))
diff --git a/tools/uvr5/lib/name_params.json b/tools/uvr5/lib/name_params.json
new file mode 100644
index 0000000..4e5ee7b
--- /dev/null
+++ b/tools/uvr5/lib/name_params.json
@@ -0,0 +1,263 @@
+{
+ "equivalent" : [
+ {
+ "model_hash_name" : [
+ {
+ "hash_name": "47939caf0cfe52a0e81442b85b971dfd",
+ "model_params": "lib/lib_v5/modelparams/4band_44100.json",
+ "param_name": "4band_44100"
+ },
+ {
+ "hash_name": "4e4ecb9764c50a8c414fee6e10395bbe",
+ "model_params": "lib/lib_v5/modelparams/4band_v2.json",
+ "param_name": "4band_v2"
+ },
+ {
+ "hash_name": "ca106edd563e034bde0bdec4bb7a4b36",
+ "model_params": "lib/lib_v5/modelparams/4band_v2.json",
+ "param_name": "4band_v2"
+ },
+ {
+ "hash_name": "e60a1e84803ce4efc0a6551206cc4b71",
+ "model_params": "lib/lib_v5/modelparams/4band_44100.json",
+ "param_name": "4band_44100"
+ },
+ {
+ "hash_name": "a82f14e75892e55e994376edbf0c8435",
+ "model_params": "lib/lib_v5/modelparams/4band_44100.json",
+ "param_name": "4band_44100"
+ },
+ {
+ "hash_name": "6dd9eaa6f0420af9f1d403aaafa4cc06",
+ "model_params": "lib/lib_v5/modelparams/4band_v2_sn.json",
+ "param_name": "4band_v2_sn"
+ },
+ {
+ "hash_name": "08611fb99bd59eaa79ad27c58d137727",
+ "model_params": "lib/lib_v5/modelparams/4band_v2_sn.json",
+ "param_name": "4band_v2_sn"
+ },
+ {
+ "hash_name": "5c7bbca45a187e81abbbd351606164e5",
+ "model_params": "lib/lib_v5/modelparams/3band_44100_msb2.json",
+ "param_name": "3band_44100_msb2"
+ },
+ {
+ "hash_name": "d6b2cb685a058a091e5e7098192d3233",
+ "model_params": "lib/lib_v5/modelparams/3band_44100_msb2.json",
+ "param_name": "3band_44100_msb2"
+ },
+ {
+ "hash_name": "c1b9f38170a7c90e96f027992eb7c62b",
+ "model_params": "lib/lib_v5/modelparams/4band_44100.json",
+ "param_name": "4band_44100"
+ },
+ {
+ "hash_name": "c3448ec923fa0edf3d03a19e633faa53",
+ "model_params": "lib/lib_v5/modelparams/4band_44100.json",
+ "param_name": "4band_44100"
+ },
+ {
+ "hash_name": "68aa2c8093d0080704b200d140f59e54",
+ "model_params": "lib/lib_v5/modelparams/3band_44100.json",
+ "param_name": "3band_44100"
+ },
+ {
+ "hash_name": "fdc83be5b798e4bd29fe00fe6600e147",
+ "model_params": "lib/lib_v5/modelparams/3band_44100_mid.json",
+ "param_name": "3band_44100_mid.json"
+ },
+ {
+ "hash_name": "2ce34bc92fd57f55db16b7a4def3d745",
+ "model_params": "lib/lib_v5/modelparams/3band_44100_mid.json",
+ "param_name": "3band_44100_mid.json"
+ },
+ {
+ "hash_name": "52fdca89576f06cf4340b74a4730ee5f",
+ "model_params": "lib/lib_v5/modelparams/4band_44100.json",
+ "param_name": "4band_44100.json"
+ },
+ {
+ "hash_name": "41191165b05d38fc77f072fa9e8e8a30",
+ "model_params": "lib/lib_v5/modelparams/4band_44100.json",
+ "param_name": "4band_44100.json"
+ },
+ {
+ "hash_name": "89e83b511ad474592689e562d5b1f80e",
+ "model_params": "lib/lib_v5/modelparams/2band_32000.json",
+ "param_name": "2band_32000.json"
+ },
+ {
+ "hash_name": "0b954da81d453b716b114d6d7c95177f",
+ "model_params": "lib/lib_v5/modelparams/2band_32000.json",
+ "param_name": "2band_32000.json"
+ }
+
+ ],
+ "v4 Models": [
+ {
+ "hash_name": "6a00461c51c2920fd68937d4609ed6c8",
+ "model_params": "lib/lib_v5/modelparams/1band_sr16000_hl512.json",
+ "param_name": "1band_sr16000_hl512"
+ },
+ {
+ "hash_name": "0ab504864d20f1bd378fe9c81ef37140",
+ "model_params": "lib/lib_v5/modelparams/1band_sr32000_hl512.json",
+ "param_name": "1band_sr32000_hl512"
+ },
+ {
+ "hash_name": "7dd21065bf91c10f7fccb57d7d83b07f",
+ "model_params": "lib/lib_v5/modelparams/1band_sr32000_hl512.json",
+ "param_name": "1band_sr32000_hl512"
+ },
+ {
+ "hash_name": "80ab74d65e515caa3622728d2de07d23",
+ "model_params": "lib/lib_v5/modelparams/1band_sr32000_hl512.json",
+ "param_name": "1band_sr32000_hl512"
+ },
+ {
+ "hash_name": "edc115e7fc523245062200c00caa847f",
+ "model_params": "lib/lib_v5/modelparams/1band_sr33075_hl384.json",
+ "param_name": "1band_sr33075_hl384"
+ },
+ {
+ "hash_name": "28063e9f6ab5b341c5f6d3c67f2045b7",
+ "model_params": "lib/lib_v5/modelparams/1band_sr33075_hl384.json",
+ "param_name": "1band_sr33075_hl384"
+ },
+ {
+ "hash_name": "b58090534c52cbc3e9b5104bad666ef2",
+ "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl512.json",
+ "param_name": "1band_sr44100_hl512"
+ },
+ {
+ "hash_name": "0cdab9947f1b0928705f518f3c78ea8f",
+ "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl512.json",
+ "param_name": "1band_sr44100_hl512"
+ },
+ {
+ "hash_name": "ae702fed0238afb5346db8356fe25f13",
+ "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl1024.json",
+ "param_name": "1band_sr44100_hl1024"
+ }
+ ]
+ }
+ ],
+ "User Models" : [
+ {
+ "1 Band": [
+ {
+ "hash_name": "1band_sr16000_hl512",
+ "model_params": "lib/lib_v5/modelparams/1band_sr16000_hl512.json",
+ "param_name": "1band_sr16000_hl512"
+ },
+ {
+ "hash_name": "1band_sr32000_hl512",
+ "model_params": "lib/lib_v5/modelparams/1band_sr32000_hl512.json",
+ "param_name": "1band_sr16000_hl512"
+ },
+ {
+ "hash_name": "1band_sr33075_hl384",
+ "model_params": "lib/lib_v5/modelparams/1band_sr33075_hl384.json",
+ "param_name": "1band_sr33075_hl384"
+ },
+ {
+ "hash_name": "1band_sr44100_hl256",
+ "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl256.json",
+ "param_name": "1band_sr44100_hl256"
+ },
+ {
+ "hash_name": "1band_sr44100_hl512",
+ "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl512.json",
+ "param_name": "1band_sr44100_hl512"
+ },
+ {
+ "hash_name": "1band_sr44100_hl1024",
+ "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl1024.json",
+ "param_name": "1band_sr44100_hl1024"
+ }
+ ],
+ "2 Band": [
+ {
+ "hash_name": "2band_44100_lofi",
+ "model_params": "lib/lib_v5/modelparams/2band_44100_lofi.json",
+ "param_name": "2band_44100_lofi"
+ },
+ {
+ "hash_name": "2band_32000",
+ "model_params": "lib/lib_v5/modelparams/2band_32000.json",
+ "param_name": "2band_32000"
+ },
+ {
+ "hash_name": "2band_48000",
+ "model_params": "lib/lib_v5/modelparams/2band_48000.json",
+ "param_name": "2band_48000"
+ }
+ ],
+ "3 Band": [
+ {
+ "hash_name": "3band_44100",
+ "model_params": "lib/lib_v5/modelparams/3band_44100.json",
+ "param_name": "3band_44100"
+ },
+ {
+ "hash_name": "3band_44100_mid",
+ "model_params": "lib/lib_v5/modelparams/3band_44100_mid.json",
+ "param_name": "3band_44100_mid"
+ },
+ {
+ "hash_name": "3band_44100_msb2",
+ "model_params": "lib/lib_v5/modelparams/3band_44100_msb2.json",
+ "param_name": "3band_44100_msb2"
+ }
+ ],
+ "4 Band": [
+ {
+ "hash_name": "4band_44100",
+ "model_params": "lib/lib_v5/modelparams/4band_44100.json",
+ "param_name": "4band_44100"
+ },
+ {
+ "hash_name": "4band_44100_mid",
+ "model_params": "lib/lib_v5/modelparams/4band_44100_mid.json",
+ "param_name": "4band_44100_mid"
+ },
+ {
+ "hash_name": "4band_44100_msb",
+ "model_params": "lib/lib_v5/modelparams/4band_44100_msb.json",
+ "param_name": "4band_44100_msb"
+ },
+ {
+ "hash_name": "4band_44100_msb2",
+ "model_params": "lib/lib_v5/modelparams/4band_44100_msb2.json",
+ "param_name": "4band_44100_msb2"
+ },
+ {
+ "hash_name": "4band_44100_reverse",
+ "model_params": "lib/lib_v5/modelparams/4band_44100_reverse.json",
+ "param_name": "4band_44100_reverse"
+ },
+ {
+ "hash_name": "4band_44100_sw",
+ "model_params": "lib/lib_v5/modelparams/4band_44100_sw.json",
+ "param_name": "4band_44100_sw"
+ },
+ {
+ "hash_name": "4band_v2",
+ "model_params": "lib/lib_v5/modelparams/4band_v2.json",
+ "param_name": "4band_v2"
+ },
+ {
+ "hash_name": "4band_v2_sn",
+ "model_params": "lib/lib_v5/modelparams/4band_v2_sn.json",
+ "param_name": "4band_v2_sn"
+ },
+ {
+ "hash_name": "tmodelparam",
+ "model_params": "lib/lib_v5/modelparams/tmodelparam.json",
+ "param_name": "User Model Param Set"
+ }
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/utils.py b/tools/uvr5/lib/utils.py
new file mode 100644
index 0000000..946eb0c
--- /dev/null
+++ b/tools/uvr5/lib/utils.py
@@ -0,0 +1,121 @@
+import json
+
+import numpy as np
+import torch
+from tqdm import tqdm
+
+
+def load_data(file_name: str = "./lib/name_params.json") -> dict:
+ with open(file_name, "r") as f:
+ data = json.load(f)
+
+ return data
+
+
+def make_padding(width, cropsize, offset):
+ left = offset
+ roi_size = cropsize - left * 2
+ if roi_size == 0:
+ roi_size = cropsize
+ right = roi_size - (width % roi_size) + left
+
+ return left, right, roi_size
+
+
+def inference(X_spec, device, model, aggressiveness, data):
+ """
+    data : dict of inference config options (e.g. "window_size", "tta")
+ """
+
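+    # Sliding-window inference: the padded magnitude spectrogram is split into
+    # fixed-size windows which the model processes one at a time; the window
+    # predictions are concatenated along the time axis.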
+ def _execute(
+ X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True
+ ):
+ model.eval()
+ with torch.no_grad():
+ preds = []
+
+ iterations = [n_window]
+
+ total_iterations = sum(iterations)
+ for i in tqdm(range(n_window)):
+ start = i * roi_size
+ X_mag_window = X_mag_pad[
+ None, :, :, start : start + data["window_size"]
+ ]
+ X_mag_window = torch.from_numpy(X_mag_window)
+ if is_half:
+ X_mag_window = X_mag_window.half()
+ X_mag_window = X_mag_window.to(device)
+
+ pred = model.predict(X_mag_window, aggressiveness)
+
+ pred = pred.detach().cpu().numpy()
+ preds.append(pred[0])
+
+ pred = np.concatenate(preds, axis=2)
+ return pred
+
+ def preprocess(X_spec):
+ X_mag = np.abs(X_spec)
+ X_phase = np.angle(X_spec)
+
+ return X_mag, X_phase
+
+ X_mag, X_phase = preprocess(X_spec)
+
+ coef = X_mag.max()
+ X_mag_pre = X_mag / coef
+
+ n_frame = X_mag_pre.shape[2]
+ pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset)
+ n_window = int(np.ceil(n_frame / roi_size))
+
+ X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")
+
+ if list(model.state_dict().values())[0].dtype == torch.float16:
+ is_half = True
+ else:
+ is_half = False
+ pred = _execute(
+ X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
+ )
+ pred = pred[:, :, :n_frame]
+
+ if data["tta"]:
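+        # Test-time augmentation: run a second pass shifted by half a window
+        # and average the two predictions.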
+ pad_l += roi_size // 2
+ pad_r += roi_size // 2
+ n_window += 1
+
+ X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")
+
+ pred_tta = _execute(
+ X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
+ )
+ pred_tta = pred_tta[:, :, roi_size // 2 :]
+ pred_tta = pred_tta[:, :, :n_frame]
+
+ return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase)
+ else:
+ return pred * coef, X_mag, np.exp(1.0j * X_phase)
+
+
+def _get_name_params(model_path, model_hash):
+    data = load_data()
+    flag = False
+    ModelName = model_path
+    for model_type in list(data):
+        for model in list(data[model_type][0]):
+            for i in range(len(data[model_type][0][model])):
+                if str(data[model_type][0][model][i]["hash_name"]) == model_hash:
+                    flag = True
+                elif str(data[model_type][0][model][i]["hash_name"]) in ModelName:
+                    flag = True
+
+                if flag:
+                    model_params_auto = data[model_type][0][model][i]["model_params"]
+                    param_name_auto = data[model_type][0][model][i]["param_name"]
+                    if model_type == "equivalent":
+                        return param_name_auto, model_params_auto
+                    else:
+                        flag = False
+    return param_name_auto, model_params_auto
diff --git a/tools/uvr5/mdxnet.py b/tools/uvr5/mdxnet.py
new file mode 100644
index 0000000..73a335d
--- /dev/null
+++ b/tools/uvr5/mdxnet.py
@@ -0,0 +1,256 @@
+import os
+import logging
+
+logger = logging.getLogger(__name__)
+
+import librosa
+import numpy as np
+import soundfile as sf
+import torch
+from tqdm import tqdm
+
+cpu = torch.device("cpu")
+
+
+class ConvTDFNetTrim:
+ def __init__(
+ self, device, model_name, target_name, L, dim_f, dim_t, n_fft, hop=1024
+ ):
+ super(ConvTDFNetTrim, self).__init__()
+
+ self.dim_f = dim_f
+ self.dim_t = 2**dim_t
+ self.n_fft = n_fft
+ self.hop = hop
+ self.n_bins = self.n_fft // 2 + 1
+ self.chunk_size = hop * (self.dim_t - 1)
+ self.window = torch.hann_window(window_length=self.n_fft, periodic=True).to(
+ device
+ )
+ self.target_name = target_name
+ self.blender = "blender" in model_name
+
+ self.dim_c = 4
+ out_c = self.dim_c * 4 if target_name == "*" else self.dim_c
+ self.freq_pad = torch.zeros(
+ [1, out_c, self.n_bins - self.dim_f, self.dim_t]
+ ).to(device)
+
+ self.n = L // 2
+
+ def stft(self, x):
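+        # Pack the real/imag parts of both stereo channels into dim_c (= 4)
+        # feature channels and keep only the lowest dim_f frequency bins.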
+ x = x.reshape([-1, self.chunk_size])
+ x = torch.stft(
+ x,
+ n_fft=self.n_fft,
+ hop_length=self.hop,
+ window=self.window,
+ center=True,
+ return_complex=True,
+ )
+ x = torch.view_as_real(x)
+ x = x.permute([0, 3, 1, 2])
+ x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape(
+ [-1, self.dim_c, self.n_bins, self.dim_t]
+ )
+ return x[:, :, : self.dim_f]
+
+ def istft(self, x, freq_pad=None):
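+        # Restore the cropped high-frequency bins with zero padding before
+        # inverting back to the time domain.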
+ freq_pad = (
+ self.freq_pad.repeat([x.shape[0], 1, 1, 1])
+ if freq_pad is None
+ else freq_pad
+ )
+ x = torch.cat([x, freq_pad], -2)
+ c = 4 * 2 if self.target_name == "*" else 2
+ x = x.reshape([-1, c, 2, self.n_bins, self.dim_t]).reshape(
+ [-1, 2, self.n_bins, self.dim_t]
+ )
+ x = x.permute([0, 2, 3, 1])
+ x = x.contiguous()
+ x = torch.view_as_complex(x)
+ x = torch.istft(
+ x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True
+ )
+ return x.reshape([-1, c, self.chunk_size])
+
+
+def get_models(device, dim_f, dim_t, n_fft):
+ return ConvTDFNetTrim(
+ device=device,
+ model_name="Conv-TDF",
+ target_name="vocals",
+ L=11,
+ dim_f=dim_f,
+ dim_t=dim_t,
+ n_fft=n_fft,
+ )
+
+
+class Predictor:
+ def __init__(self, args):
+ import onnxruntime as ort
+
+ logger.info(ort.get_available_providers())
+ self.args = args
+ self.model_ = get_models(
+ device=cpu, dim_f=args.dim_f, dim_t=args.dim_t, n_fft=args.n_fft
+ )
+ self.model = ort.InferenceSession(
+ os.path.join(args.onnx, self.model_.target_name + ".onnx"),
+ providers=[
+ "CUDAExecutionProvider",
+ "DmlExecutionProvider",
+ "CPUExecutionProvider",
+ ],
+ )
+ logger.info("ONNX load done")
+
+ def demix(self, mix):
+ samples = mix.shape[-1]
+ margin = self.args.margin
+ chunk_size = self.args.chunks * 44100
+        assert margin != 0, "margin cannot be zero!"
+ if margin > chunk_size:
+ margin = chunk_size
+
+ segmented_mix = {}
+
+ if self.args.chunks == 0 or samples < chunk_size:
+ chunk_size = samples
+
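+        # Split the mix into chunks with `margin` samples of context on each
+        # side; the context is trimmed again after separation to hide seams.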
+ counter = -1
+ for skip in range(0, samples, chunk_size):
+ counter += 1
+
+ s_margin = 0 if counter == 0 else margin
+ end = min(skip + chunk_size + margin, samples)
+
+ start = skip - s_margin
+
+ segmented_mix[skip] = mix[:, start:end].copy()
+ if end == samples:
+ break
+
+ sources = self.demix_base(segmented_mix, margin_size=margin)
+ """
+ mix:(2,big_sample)
+ segmented_mix:offset->(2,small_sample)
+ sources:(1,2,big_sample)
+ """
+ return sources
+
+ def demix_base(self, mixes, margin_size):
+ chunked_sources = []
+ progress_bar = tqdm(total=len(mixes))
+ progress_bar.set_description("Processing")
+ for mix in mixes:
+ cmix = mixes[mix]
+ sources = []
+ n_sample = cmix.shape[1]
+ model = self.model_
+ trim = model.n_fft // 2
+ gen_size = model.chunk_size - 2 * trim
+ pad = gen_size - n_sample % gen_size
+ mix_p = np.concatenate(
+ (np.zeros((2, trim)), cmix, np.zeros((2, pad)), np.zeros((2, trim))), 1
+ )
+ mix_waves = []
+ i = 0
+ while i < n_sample + pad:
+ waves = np.array(mix_p[:, i : i + model.chunk_size])
+ mix_waves.append(waves)
+ i += gen_size
+ mix_waves = torch.tensor(mix_waves, dtype=torch.float32).to(cpu)
+ with torch.no_grad():
+ _ort = self.model
+ spek = model.stft(mix_waves)
+ if self.args.denoise:
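+                    # Denoise trick: average the model output on the input with
+                    # the negated output on the negated input, cancelling
+                    # artifacts that do not flip sign with the input.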
+ spec_pred = (
+ -_ort.run(None, {"input": -spek.cpu().numpy()})[0] * 0.5
+ + _ort.run(None, {"input": spek.cpu().numpy()})[0] * 0.5
+ )
+ tar_waves = model.istft(torch.tensor(spec_pred))
+ else:
+ tar_waves = model.istft(
+ torch.tensor(_ort.run(None, {"input": spek.cpu().numpy()})[0])
+ )
+ tar_signal = (
+ tar_waves[:, :, trim:-trim]
+ .transpose(0, 1)
+ .reshape(2, -1)
+ .numpy()[:, :-pad]
+ )
+
+ start = 0 if mix == 0 else margin_size
+ end = None if mix == list(mixes.keys())[::-1][0] else -margin_size
+ if margin_size == 0:
+ end = None
+ sources.append(tar_signal[:, start:end])
+
+ progress_bar.update(1)
+
+ chunked_sources.append(sources)
+ _sources = np.concatenate(chunked_sources, axis=-1)
+ # del self.model
+ progress_bar.close()
+ return _sources
+
+ def prediction(self, m, vocal_root, others_root, format):
+ os.makedirs(vocal_root, exist_ok=True)
+ os.makedirs(others_root, exist_ok=True)
+ basename = os.path.basename(m)
+ mix, rate = librosa.load(m, mono=False, sr=44100)
+ if mix.ndim == 1:
+ mix = np.asfortranarray([mix, mix])
+ mix = mix.T
+ sources = self.demix(mix.T)
+ opt = sources[0].T
+ if format in ["wav", "flac"]:
+ sf.write(
+ "%s/%s_main_vocal.%s" % (vocal_root, basename, format), mix - opt, rate
+ )
+ sf.write("%s/%s_others.%s" % (others_root, basename, format), opt, rate)
+ else:
+ path_vocal = "%s/%s_main_vocal.wav" % (vocal_root, basename)
+ path_other = "%s/%s_others.wav" % (others_root, basename)
+ sf.write(path_vocal, mix - opt, rate)
+ sf.write(path_other, opt, rate)
+ opt_path_vocal = path_vocal[:-4] + ".%s" % format
+ opt_path_other = path_other[:-4] + ".%s" % format
+ if os.path.exists(path_vocal):
+ os.system(
+ "ffmpeg -i %s -vn %s -q:a 2 -y" % (path_vocal, opt_path_vocal)
+ )
+ if os.path.exists(opt_path_vocal):
+ try:
+ os.remove(path_vocal)
+                    except Exception:
+ pass
+ if os.path.exists(path_other):
+ os.system(
+ "ffmpeg -i %s -vn %s -q:a 2 -y" % (path_other, opt_path_other)
+ )
+ if os.path.exists(opt_path_other):
+ try:
+ os.remove(path_other)
+                    except Exception:
+ pass
+
+
+class MDXNetDereverb:
+ def __init__(self, chunks, device):
+        self.onnx = "%s/uvr5_weights/onnx_dereverb_By_FoxJoy" % os.path.dirname(os.path.abspath(__file__))
+ self.shifts = 10 # 'Predict with randomised equivariant stabilisation'
+ self.mixing = "min_mag" # ['default','min_mag','max_mag']
+ self.chunks = chunks
+ self.margin = 44100
+ self.dim_t = 9
+ self.dim_f = 3072
+ self.n_fft = 6144
+ self.denoise = True
+ self.pred = Predictor(self)
+ self.device = device
+
+ def _path_audio_(self, input, vocal_root, others_root, format, is_hp3=False):
+ self.pred.prediction(input, vocal_root, others_root, format)
diff --git a/tools/uvr5/vr.py b/tools/uvr5/vr.py
new file mode 100644
index 0000000..448e57f
--- /dev/null
+++ b/tools/uvr5/vr.py
@@ -0,0 +1,367 @@
+import logging
+import os
+
+parent_directory = os.path.dirname(os.path.abspath(__file__))
+logger = logging.getLogger(__name__)
+
+import librosa
+import numpy as np
+import soundfile as sf
+import torch
+from lib.lib_v5 import nets_61968KB as Nets
+from lib.lib_v5 import spec_utils
+from lib.lib_v5.model_param_init import ModelParameters
+from lib.lib_v5.nets_new import CascadedNet
+from lib.utils import inference
+
+
+class AudioPre:
+ def __init__(self, agg, model_path, device, is_half, tta=False):
+ self.model_path = model_path
+ self.device = device
+ self.data = {
+ # Processing Options
+ "postprocess": False,
+ "tta": tta,
+ # Constants
+ "window_size": 512,
+ "agg": agg,
+ "high_end_process": "mirroring",
+ }
+        mp = ModelParameters("%s/lib/lib_v5/modelparams/4band_v2.json" % parent_directory)
+ model = Nets.CascadedASPPNet(mp.param["bins"] * 2)
+ cpk = torch.load(model_path, map_location="cpu")
+ model.load_state_dict(cpk)
+ model.eval()
+ if is_half:
+ model = model.half().to(device)
+ else:
+ model = model.to(device)
+
+ self.mp = mp
+ self.model = model
+
+ def _path_audio_(
+ self, music_file, ins_root=None, vocal_root=None, format="flac", is_hp3=False
+ ):
+ if ins_root is None and vocal_root is None:
+ return "No save root."
+ name = os.path.basename(music_file)
+ if ins_root is not None:
+ os.makedirs(ins_root, exist_ok=True)
+ if vocal_root is not None:
+ os.makedirs(vocal_root, exist_ok=True)
+ X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
+ bands_n = len(self.mp.param["band"])
+ for d in range(bands_n, 0, -1):
+ bp = self.mp.param["band"][d]
+ if d == bands_n: # high-end band
+                (
+                    X_wave[d],
+                    _,
+                ) = librosa.core.load(  # in theory librosa may misread some audio; reading via ffmpeg would be more robust, but it is too much hassle, so this stays
+                    music_file,
+                    sr=bp["sr"],
+                    mono=False,
+                    dtype=np.float32,
+                    res_type=bp["res_type"],
+                )
+ if X_wave[d].ndim == 1:
+ X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
+ else: # lower bands
+                X_wave[d] = librosa.core.resample(
+                    X_wave[d + 1],
+                    orig_sr=self.mp.param["band"][d + 1]["sr"],
+                    target_sr=bp["sr"],
+                    res_type=bp["res_type"],
+                )
+            # STFT of the wave source
+ X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
+ X_wave[d],
+ bp["hl"],
+ bp["n_fft"],
+ self.mp.param["mid_side"],
+ self.mp.param["mid_side_b2"],
+ self.mp.param["reverse"],
+ )
+ if d == bands_n and self.data["high_end_process"] != "none":
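+                # Keep the raw high-frequency bins that the model never sees; they
+                # are mirrored back onto the output spectrogram further down.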
+ input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
+ self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
+ )
+ input_high_end = X_spec_s[d][
+ :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
+ ]
+
+ X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
+        aggressive_set = float(self.data["agg"] / 100)
+        aggressiveness = {
+            "value": aggressive_set,
+ "split_bin": self.mp.param["band"][1]["crop_stop"],
+ }
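+        # "agg" arrives as a 0-20 UI value; scaled to 0.00-0.20 it sets how strongly
+        # inference() sharpens the predicted mask, with split_bin marking the
+        # frequency bin where the stronger treatment starts.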
+ with torch.no_grad():
+ pred, X_mag, X_phase = inference(
+ X_spec_m, self.device, self.model, aggressiveness, self.data
+ )
+ # Postprocess
+ if self.data["postprocess"]:
+ pred_inv = np.clip(X_mag - pred, 0, np.inf)
+ pred = spec_utils.mask_silence(pred, pred_inv)
+ y_spec_m = pred * X_phase
+ v_spec_m = X_spec_m - y_spec_m
+
+ if ins_root is not None:
+ if self.data["high_end_process"].startswith("mirroring"):
+ input_high_end_ = spec_utils.mirroring(
+ self.data["high_end_process"], y_spec_m, input_high_end, self.mp
+ )
+ wav_instrument = spec_utils.cmb_spectrogram_to_wave(
+ y_spec_m, self.mp, input_high_end_h, input_high_end_
+ )
+ else:
+ wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
+ logger.info("%s instruments done" % name)
+            if is_hp3:
+ head = "vocal_"
+ else:
+ head = "instrument_"
+ if format in ["wav", "flac"]:
+ sf.write(
+ os.path.join(
+ ins_root,
+ head + "{}_{}.{}".format(name, self.data["agg"], format),
+ ),
+ (np.array(wav_instrument) * 32768).astype("int16"),
+ self.mp.param["sr"],
+                )
+ else:
+ path = os.path.join(
+ ins_root, head + "{}_{}.wav".format(name, self.data["agg"])
+ )
+ sf.write(
+ path,
+ (np.array(wav_instrument) * 32768).astype("int16"),
+ self.mp.param["sr"],
+ )
+                if os.path.exists(path):
+                    opt_format_path = path[:-4] + ".%s" % format
+                    os.system('ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path, opt_format_path))
+                    if os.path.exists(opt_format_path):
+                        try:
+                            os.remove(path)
+                        except Exception:
+                            pass
+ if vocal_root is not None:
+            if is_hp3:
+ head = "instrument_"
+ else:
+ head = "vocal_"
+ if self.data["high_end_process"].startswith("mirroring"):
+ input_high_end_ = spec_utils.mirroring(
+ self.data["high_end_process"], v_spec_m, input_high_end, self.mp
+ )
+ wav_vocals = spec_utils.cmb_spectrogram_to_wave(
+ v_spec_m, self.mp, input_high_end_h, input_high_end_
+ )
+ else:
+ wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
+ logger.info("%s vocals done" % name)
+ if format in ["wav", "flac"]:
+ sf.write(
+ os.path.join(
+ vocal_root,
+ head + "{}_{}.{}".format(name, self.data["agg"], format),
+ ),
+ (np.array(wav_vocals) * 32768).astype("int16"),
+ self.mp.param["sr"],
+ )
+ else:
+ path = os.path.join(
+ vocal_root, head + "{}_{}.wav".format(name, self.data["agg"])
+ )
+ sf.write(
+ path,
+ (np.array(wav_vocals) * 32768).astype("int16"),
+ self.mp.param["sr"],
+ )
+                if os.path.exists(path):
+                    opt_format_path = path[:-4] + ".%s" % format
+                    os.system('ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path, opt_format_path))
+                    if os.path.exists(opt_format_path):
+                        try:
+                            os.remove(path)
+                        except Exception:
+                            pass
+
+
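+# Minimal usage sketch (an illustration; the checkpoint name is an assumption,
+# any 4band_v2 model under uvr5_weights is used the same way):
+#
+#   pre = AudioPre(agg=10, model_path="uvr5_weights/HP2.pth",
+#                  device="cuda:0", is_half=True)
+#   pre._path_audio_("mix.wav", ins_root="opt/ins", vocal_root="opt/voc", format="wav")
+
+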
+class AudioPreDeEcho:
+ def __init__(self, agg, model_path, device, is_half, tta=False):
+ self.model_path = model_path
+ self.device = device
+ self.data = {
+ # Processing Options
+ "postprocess": False,
+ "tta": tta,
+ # Constants
+ "window_size": 512,
+ "agg": agg,
+ "high_end_process": "mirroring",
+ }
+        mp = ModelParameters("%s/lib/lib_v5/modelparams/4band_v3.json" % parent_directory)
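+        # Pick the base channel count from the checkpoint filename: the DeReverb
+        # weights use 64 channels, the DeEcho-only ones 48.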
+ nout = 64 if "DeReverb" in model_path else 48
+ model = CascadedNet(mp.param["bins"] * 2, nout)
+ cpk = torch.load(model_path, map_location="cpu")
+ model.load_state_dict(cpk)
+ model.eval()
+ if is_half:
+ model = model.half().to(device)
+ else:
+ model = model.to(device)
+
+ self.mp = mp
+ self.model = model
+
+ def _path_audio_(
+ self, music_file, vocal_root=None, ins_root=None, format="flac", is_hp3=False
+    ):  # for the three VR de-echo models, the vocal and instrument outputs are swapped
+ if ins_root is None and vocal_root is None:
+ return "No save root."
+ name = os.path.basename(music_file)
+ if ins_root is not None:
+ os.makedirs(ins_root, exist_ok=True)
+ if vocal_root is not None:
+ os.makedirs(vocal_root, exist_ok=True)
+ X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
+ bands_n = len(self.mp.param["band"])
+ for d in range(bands_n, 0, -1):
+ bp = self.mp.param["band"][d]
+ if d == bands_n: # high-end band
+                (
+                    X_wave[d],
+                    _,
+                ) = librosa.core.load(  # in theory librosa may misread some audio; reading via ffmpeg would be more robust, but it is too much hassle, so this stays
+                    music_file,
+                    sr=bp["sr"],
+                    mono=False,
+                    dtype=np.float32,
+                    res_type=bp["res_type"],
+                )
+ if X_wave[d].ndim == 1:
+ X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
+ else: # lower bands
+                X_wave[d] = librosa.core.resample(
+                    X_wave[d + 1],
+                    orig_sr=self.mp.param["band"][d + 1]["sr"],
+                    target_sr=bp["sr"],
+                    res_type=bp["res_type"],
+                )
+            # STFT of the wave source
+ X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
+ X_wave[d],
+ bp["hl"],
+ bp["n_fft"],
+ self.mp.param["mid_side"],
+ self.mp.param["mid_side_b2"],
+ self.mp.param["reverse"],
+ )
+ if d == bands_n and self.data["high_end_process"] != "none":
+ input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
+ self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
+ )
+ input_high_end = X_spec_s[d][
+ :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
+ ]
+
+ X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
+        aggressive_set = float(self.data["agg"] / 100)
+        aggressiveness = {
+            "value": aggressive_set,
+ "split_bin": self.mp.param["band"][1]["crop_stop"],
+ }
+ with torch.no_grad():
+ pred, X_mag, X_phase = inference(
+ X_spec_m, self.device, self.model, aggressiveness, self.data
+ )
+ # Postprocess
+ if self.data["postprocess"]:
+ pred_inv = np.clip(X_mag - pred, 0, np.inf)
+ pred = spec_utils.mask_silence(pred, pred_inv)
+ y_spec_m = pred * X_phase
+ v_spec_m = X_spec_m - y_spec_m
+
+ if ins_root is not None:
+ if self.data["high_end_process"].startswith("mirroring"):
+ input_high_end_ = spec_utils.mirroring(
+ self.data["high_end_process"], y_spec_m, input_high_end, self.mp
+ )
+ wav_instrument = spec_utils.cmb_spectrogram_to_wave(
+ y_spec_m, self.mp, input_high_end_h, input_high_end_
+ )
+ else:
+ wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
+ logger.info("%s instruments done" % name)
+ if format in ["wav", "flac"]:
+ sf.write(
+ os.path.join(
+ ins_root,
+ "vocal_{}_{}.{}".format(name, self.data["agg"], format),
+ ),
+ (np.array(wav_instrument) * 32768).astype("int16"),
+ self.mp.param["sr"],
+                )
+ else:
+ path = os.path.join(
+ ins_root, "vocal_{}_{}.wav".format(name, self.data["agg"])
+ )
+ sf.write(
+ path,
+ (np.array(wav_instrument) * 32768).astype("int16"),
+ self.mp.param["sr"],
+ )
+                if os.path.exists(path):
+                    opt_format_path = path[:-4] + ".%s" % format
+                    os.system('ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path, opt_format_path))
+                    if os.path.exists(opt_format_path):
+                        try:
+                            os.remove(path)
+                        except Exception:
+                            pass
+ if vocal_root is not None:
+ if self.data["high_end_process"].startswith("mirroring"):
+ input_high_end_ = spec_utils.mirroring(
+ self.data["high_end_process"], v_spec_m, input_high_end, self.mp
+ )
+ wav_vocals = spec_utils.cmb_spectrogram_to_wave(
+ v_spec_m, self.mp, input_high_end_h, input_high_end_
+ )
+ else:
+ wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
+ logger.info("%s vocals done" % name)
+ if format in ["wav", "flac"]:
+ sf.write(
+ os.path.join(
+ vocal_root,
+ "instrument_{}_{}.{}".format(name, self.data["agg"], format),
+ ),
+ (np.array(wav_vocals) * 32768).astype("int16"),
+ self.mp.param["sr"],
+ )
+ else:
+ path = os.path.join(
+ vocal_root, "instrument_{}_{}.wav".format(name, self.data["agg"])
+ )
+ sf.write(
+ path,
+ (np.array(wav_vocals) * 32768).astype("int16"),
+ self.mp.param["sr"],
+ )
+                if os.path.exists(path):
+                    opt_format_path = path[:-4] + ".%s" % format
+                    os.system('ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path, opt_format_path))
+                    if os.path.exists(opt_format_path):
+                        try:
+                            os.remove(path)
+                        except Exception:
+                            pass
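+
+
+# Minimal usage sketch for the de-echo variant (an illustration; the checkpoint
+# name is an assumption). Note that, unlike AudioPre, vocal_root comes before
+# ins_root here because these models swap the two outputs:
+#
+#   deecho = AudioPreDeEcho(agg=10, model_path="uvr5_weights/VR-DeEchoNormal.pth",
+#                           device="cuda:0", is_half=False)
+#   deecho._path_audio_("mix.wav", vocal_root="opt/voc", ins_root="opt/ins", format="wav")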
diff --git a/tools/uvr5/webui.py b/tools/uvr5/webui.py
new file mode 100644
index 0000000..051ece5
--- /dev/null
+++ b/tools/uvr5/webui.py
@@ -0,0 +1,184 @@
+import logging
+import os
+import sys
+import tempfile
+import traceback
+
+import ffmpeg
+import gradio as gr
+import torch
+
+from i18n.i18n import I18nAuto
+from mdxnet import MDXNetDereverb
+from vr import AudioPre, AudioPreDeEcho
+
+i18n = I18nAuto()
+logger = logging.getLogger(__name__)
+
+weight_uvr5_root = "tools/uvr5/uvr5_weights"
+uvr5_names = []
+for name in os.listdir(weight_uvr5_root):
+ if name.endswith(".pth") or "onnx" in name:
+ uvr5_names.append(name.replace(".pth", ""))
+
+device = sys.argv[1]
+# argv values are strings; compare explicitly so that "False" does not come through truthy
+is_half = sys.argv[2].lower() == "true"
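+# Expected invocation (inferred from the argv layout above):
+#   python tools/uvr5/webui.py cuda:0 True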
+
+
+def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0):
+ infos = []
+ try:
+        inp_root = inp_root.strip(' "\n')
+        save_root_vocal = save_root_vocal.strip(' "\n')
+        save_root_ins = save_root_ins.strip(' "\n')
+ if model_name == "onnx_dereverb_By_FoxJoy":
+ pre_fun = MDXNetDereverb(15, device)
+ else:
+ func = AudioPre if "DeEcho" not in model_name else AudioPreDeEcho
+ pre_fun = func(
+ agg=int(agg),
+ model_path=os.path.join(
+ weight_uvr5_root, model_name + ".pth"
+ ),
+ device=device,
+ is_half=is_half,
+ )
+ is_hp3 = "HP3" in model_name
+ if inp_root != "":
+ paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)]
+ else:
+ paths = [path.name for path in paths]
+ for path in paths:
+ inp_path = os.path.join(inp_root, path)
+ need_reformat = 1
+ done = 0
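+            # Probe the input first: anything that is not already 44.1 kHz stereo is
+            # rewritten to a temporary PCM wav before separation.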
+ try:
+ info = ffmpeg.probe(inp_path, cmd="ffprobe")
+ if (
+ info["streams"][0]["channels"] == 2
+ and info["streams"][0]["sample_rate"] == "44100"
+ ):
+ need_reformat = 0
+ pre_fun._path_audio_(
+ inp_path, save_root_ins, save_root_vocal, format0, is_hp3=is_hp3
+ )
+ done = 1
+            except Exception:
+ need_reformat = 1
+ traceback.print_exc()
+ if need_reformat == 1:
+ tmp_path = "%s/%s.reformatted.wav" % (
+                    tempfile.gettempdir(),  # os.environ["TEMP"] exists only on Windows
+ os.path.basename(inp_path),
+ )
+                os.system(
+                    'ffmpeg -i "%s" -vn -acodec pcm_s16le -ac 2 -ar 44100 "%s" -y'
+                    % (inp_path, tmp_path)
+ )
+ inp_path = tmp_path
+ try:
+ if done == 0:
+ pre_fun._path_audio_(
+ inp_path, save_root_ins, save_root_vocal, format0
+ )
+ infos.append("%s->Success" % (os.path.basename(inp_path)))
+ yield "\n".join(infos)
+            except Exception:
+                # retry once before reporting the failure
+                try:
+                    if done == 0:
+                        pre_fun._path_audio_(
+                            inp_path, save_root_ins, save_root_vocal, format0
+                        )
+                    infos.append("%s->Success" % (os.path.basename(inp_path)))
+                    yield "\n".join(infos)
+                except Exception:
+ infos.append(
+ "%s->%s" % (os.path.basename(inp_path), traceback.format_exc())
+ )
+ yield "\n".join(infos)
+    except Exception:
+ infos.append(traceback.format_exc())
+ yield "\n".join(infos)
+ finally:
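+        # Drop the model weights before the next run: the ONNX wrapper holds two
+        # handles (the inference session and the STFT helper), the VR models one.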
+ try:
+ if model_name == "onnx_dereverb_By_FoxJoy":
+ del pre_fun.pred.model
+ del pre_fun.pred.model_
+ else:
+ del pre_fun.model
+ del pre_fun
+        except Exception:
+ traceback.print_exc()
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+ logger.info("Executed torch.cuda.empty_cache()")
+ yield "\n".join(infos)
+
+
+with gr.Blocks(title="RVC WebUI") as app:
+    gr.Markdown(
+        value="This software is open source under the MIT license. The author has no control over it; users of the software and those who distribute the audio it exports bear full responsibility.\n"
+        "If you do not accept these terms, you may not use or reference any code or files inside the package. See LICENSE in the root directory for details."
+    )
+ with gr.Tabs():
+        with gr.TabItem(i18n("Vocals/Accompaniment Separation & De-reverb & De-echo")):
+ with gr.Group():
+                gr.Markdown(
+                    value=i18n(
+                        "Batch processing for vocal/accompaniment separation, using UVR5 models.\n"
+                        "Example of a valid folder path: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (just copy it from the file manager's address bar).\n"
+                        "The models fall into three categories:\n"
+                        "1. Preserve vocals: choose this for audio without harmonies; it preserves the lead vocal better than HP5. It ships two models, HP2 and HP3; HP3 may leak a little accompaniment but preserves the lead vocal slightly better than HP2.\n"
+                        "2. Preserve only the lead vocal: choose this for audio with harmonies; it may weaken the lead vocal. It ships one model, HP5.\n"
+                        "3. De-reverb and de-delay models (by FoxJoy):\n"
+                        "(1) MDX-Net (onnx_dereverb): the best choice for stereo reverb, but it cannot remove mono reverb;\n"
+                        "(234) DeEcho: removes delay effects. Aggressive removes more thoroughly than Normal; DeReverb additionally removes reverb and can remove mono reverb, but it cannot fully clean heavy high-frequency plate reverb.\n"
+                        "Notes on the de-reverb/de-delay models:\n"
+                        "1. The DeEcho-DeReverb model takes nearly twice as long as the other two DeEcho models;\n"
+                        "2. The MDX-Net-Dereverb model is quite slow;\n"
+                        "3. The cleanest configuration recommended here is MDX-Net first, then DeEcho-Aggressive."
+                    )
+                )
+ with gr.Row():
+ with gr.Column():
+ dir_wav_input = gr.Textbox(
+                        label=i18n("Enter the path of the audio folder to be processed"),
+ placeholder="C:\\Users\\Desktop\\todo-songs",
+ )
+ wav_inputs = gr.File(
+                        file_count="multiple", label=i18n("Audio files can also be input in batch; choose one of the two; the folder takes precedence")
+ )
+ with gr.Column():
+                    model_choose = gr.Dropdown(label=i18n("Model"), choices=uvr5_names)
+ agg = gr.Slider(
+ minimum=0,
+ maximum=20,
+ step=1,
+ label="人声提取激进程度",
+ value=10,
+ interactive=True,
+ visible=False, # 先不开放调整
+ )
+ opt_vocal_root = gr.Textbox(
+                        label=i18n("Specify the output folder for the lead vocals"), value="output/uvr5_opt"
+ )
+ opt_ins_root = gr.Textbox(
+                        label=i18n("Specify the output folder for the non-lead vocals"), value="output/uvr5_opt"
+ )
+ format0 = gr.Radio(
+                        label=i18n("Export file format"),
+ choices=["wav", "flac", "mp3", "m4a"],
+ value="flac",
+ interactive=True,
+ )
+                    but2 = gr.Button(i18n("Convert"), variant="primary")
+                    vc_output4 = gr.Textbox(label=i18n("Output information"))
+ but2.click(
+ uvr,
+ [
+ model_choose,
+ dir_wav_input,
+ opt_vocal_root,
+ wav_inputs,
+ opt_ins_root,
+ agg,
+ format0,
+ ],
+ [vc_output4],
+ api_name="uvr_convert",
+ )
+app.queue(concurrency_count=511, max_size=1022).launch(
+ server_name="0.0.0.0",
+ inbrowser=True,
+ server_port=9873,
+ quiet=True,
+)
\ No newline at end of file