rewrite vr-architecture
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
[tool.poetry]
|
||||
name = "rvc"
|
||||
version = "0.3.4"
|
||||
version = "0.3.5"
|
||||
description = "An easy-to-use Voice Conversion framework based on VITS."
|
||||
authors = ["Ftps <ftpsflandre@gmail.com>"]
|
||||
readme = "README.md"
|
||||
|
||||
@@ -243,7 +243,7 @@ class MDXNetDereverb:
|
||||
self.onnx = "assets/uvr5_weights/onnx_dereverb_By_FoxJoy"
|
||||
self.shifts = 10 # 'Predict with randomised equivariant stabilisation'
|
||||
self.mixing = "min_mag" # ['default','min_mag','max_mag']
|
||||
self.chunks = chunks
|
||||
self.chunks = chunks # 15
|
||||
self.margin = 44100
|
||||
self.dim_t = 9
|
||||
self.dim_f = 3072
|
||||
|
||||
@@ -10,7 +10,7 @@ from pydub import AudioSegment
|
||||
|
||||
from rvc.configs.config import Config
|
||||
from rvc.modules.uvr5.mdxnet import MDXNetDereverb
|
||||
from rvc.modules.uvr5.vr import AudioPre, AudioPreDeEcho
|
||||
from rvc.modules.uvr5.vr import AudioPreprocess
|
||||
|
||||
logger: logging.Logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -23,37 +23,18 @@ class UVR:
|
||||
def uvr_wrapper(
|
||||
self,
|
||||
audio_path: Path,
|
||||
save_vocal_path: Path | None = None,
|
||||
save_ins_path: Path | None = None,
|
||||
agg: int = 10,
|
||||
export_format: str = "flac",
|
||||
model_name: str | None = None,
|
||||
temp_path: Path | None = None,
|
||||
temp_dir: Path | None = None,
|
||||
):
|
||||
infos = []
|
||||
save_vocal_path = (
|
||||
os.getenv("save_uvr_path") if not save_vocal_path else save_vocal_path
|
||||
)
|
||||
save_ins_path = (
|
||||
os.getenv("save_uvr_path") if not save_ins_path else save_ins_path
|
||||
)
|
||||
|
||||
infos = list()
|
||||
if model_name is None:
|
||||
model_name = os.path.basename(glob(f"{os.getenv('weight_uvr5_root')}/*")[0])
|
||||
is_hp3 = "HP3" in model_name
|
||||
|
||||
if model_name == "onnx_dereverb_By_FoxJoy":
|
||||
pre_fun = MDXNetDereverb(15, self.config.device)
|
||||
else:
|
||||
func = AudioPre if "DeEcho" not in model_name else AudioPreDeEcho
|
||||
pre_fun = func(
|
||||
agg=int(agg),
|
||||
model_path=os.path.join(
|
||||
os.getenv("weight_uvr5_root"), model_name # + ".pth"
|
||||
),
|
||||
device=self.config.device,
|
||||
is_half=self.config.is_half,
|
||||
)
|
||||
pre_fun = AudioPreprocess(
|
||||
os.path.join(os.getenv("weight_uvr5_root"), model_name), # + ".pth"
|
||||
int(agg),
|
||||
)
|
||||
|
||||
process_paths = (
|
||||
[
|
||||
@@ -65,12 +46,14 @@ class UVR:
|
||||
else audio_path
|
||||
)
|
||||
|
||||
results = []
|
||||
|
||||
for process_path in [process_paths]:
|
||||
print(f"path: {process_path}")
|
||||
info = sf.info(process_path)
|
||||
if not (info.channels == 2 and info.samplerate == "44100"):
|
||||
tmp_path = os.path.join(
|
||||
temp_path or os.environ.get("TEMP"), os.path.basename(process_path)
|
||||
temp_dir or os.environ.get("TEMP"), os.path.basename(process_path)
|
||||
)
|
||||
AudioSegment.from_file(process_path).export(
|
||||
tmp_path,
|
||||
@@ -80,14 +63,16 @@ class UVR:
|
||||
parameters=["-ar", "44100"],
|
||||
)
|
||||
|
||||
pre_fun._path_audio_(
|
||||
process_path,
|
||||
save_vocal_path,
|
||||
save_ins_path,
|
||||
export_format,
|
||||
is_hp3=is_hp3,
|
||||
results.append(
|
||||
|
||||
pre_fun.process(
|
||||
tmp_path or process_path,
|
||||
)
|
||||
|
||||
)
|
||||
infos.append(f"{os.path.basename(process_path)}->Success")
|
||||
yield "\n".join(infos)
|
||||
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
return results
|
||||
|
||||
@@ -6,6 +6,7 @@ import numpy as np
|
||||
import soundfile as sf
|
||||
import torch
|
||||
|
||||
from rvc.configs.config import Config
|
||||
from rvc.lib.uvr5_pack.lib_v5 import nets_61968KB as Nets
|
||||
from rvc.lib.uvr5_pack.lib_v5 import spec_utils
|
||||
from rvc.lib.uvr5_pack.lib_v5.model_param_init import ModelParameters
|
||||
@@ -15,10 +16,9 @@ from rvc.lib.uvr5_pack.utils import inference
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AudioPre:
|
||||
def __init__(self, agg, model_path, device, is_half, tta=False):
|
||||
class AudioPreprocess:
|
||||
def __init__(self, model_path, agg, tta=False):
|
||||
self.model_path = model_path
|
||||
self.device = device
|
||||
self.data = {
|
||||
# Processing Options
|
||||
"postprocess": False,
|
||||
@@ -28,336 +28,110 @@ class AudioPre:
|
||||
"agg": agg,
|
||||
"high_end_process": "mirroring",
|
||||
}
|
||||
mp = ModelParameters("rvc/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json")
|
||||
model = Nets.CascadedASPPNet(mp.param["bins"] * 2)
|
||||
cpk = torch.load(model_path, map_location="cpu")
|
||||
model.load_state_dict(cpk)
|
||||
model.eval()
|
||||
if is_half:
|
||||
model = model.half().to(device)
|
||||
else:
|
||||
model = model.to(device)
|
||||
self.config: Config = Config()
|
||||
self.version = 3 if "DeEcho" not in self.model_path else 2
|
||||
self.mp: ModelParameters = ModelParameters(
|
||||
f"rvc/lib/uvr5_pack/lib_v5/modelparams/4band_v{self.version}.json"
|
||||
)
|
||||
self.model = (
|
||||
Nets.CascadedASPPNet(self.mp.param["bins"] * 2)
|
||||
if self.version == 3
|
||||
else CascadedNet(
|
||||
self.mp.param["bins"] * 2, 64 if "DeReverb" in model_path else 48
|
||||
)
|
||||
.load_state_dict(torch.load(model_path, map_location="cpu"))
|
||||
.eval()
|
||||
)
|
||||
if self.config.is_half:
|
||||
self.model = self.model.half()
|
||||
self.model.to(self.config.device)
|
||||
|
||||
self.mp = mp
|
||||
self.model = model
|
||||
|
||||
def _path_audio_(
|
||||
self, music_file, ins_root=None, vocal_root=None, format="flac", is_hp3=False
|
||||
def process(
|
||||
self,
|
||||
music_file,
|
||||
):
|
||||
name = os.path.basename(music_file)
|
||||
if (ins_root and vocal_root) is None:
|
||||
return "No save root."
|
||||
else:
|
||||
os.makedirs(ins_root, exist_ok=True)
|
||||
os.makedirs(vocal_root, exist_ok=True)
|
||||
|
||||
X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
|
||||
x_wave, y_wave, x_spec_s, y_spec_s = {}, {}, {}, {}
|
||||
bands_n = len(self.mp.param["band"])
|
||||
# print(bands_n)
|
||||
|
||||
for d in range(bands_n, 0, -1):
|
||||
bp = self.mp.param["band"][d]
|
||||
if d == bands_n: # high-end band
|
||||
# librosa loading may be buggy for some audio. ffmpeg will solve this, but it's a pain
|
||||
(
|
||||
X_wave[d],
|
||||
_,
|
||||
) = librosa.core.load(
|
||||
x_wave[d] = librosa.core.load(
|
||||
music_file,
|
||||
sr=bp["sr"],
|
||||
mono=False,
|
||||
dtype=np.float32,
|
||||
res_type=bp["res_type"],
|
||||
)
|
||||
if X_wave[d].ndim == 1:
|
||||
X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
|
||||
)[0]
|
||||
if x_wave[d].ndim == 1:
|
||||
x_wave[d] = np.asfortranarray([x_wave[d], x_wave[d]])
|
||||
else: # lower bands
|
||||
X_wave[d] = librosa.core.resample(
|
||||
X_wave[d + 1],
|
||||
x_wave[d] = librosa.core.resample(
|
||||
x_wave[d + 1],
|
||||
orig_sr=self.mp.param["band"][d + 1]["sr"],
|
||||
target_sr=bp["sr"],
|
||||
res_type=bp["res_type"],
|
||||
)
|
||||
# Stft of wave source
|
||||
X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
|
||||
X_wave[d],
|
||||
x_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
|
||||
x_wave[d],
|
||||
bp["hl"],
|
||||
bp["n_fft"],
|
||||
self.mp.param["mid_side"],
|
||||
self.mp.param["mid_side_b2"],
|
||||
self.mp.param["reverse"],
|
||||
)
|
||||
# pdb.set_trace()
|
||||
if d == bands_n and self.data["high_end_process"] != "none":
|
||||
input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
|
||||
self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
|
||||
)
|
||||
input_high_end = X_spec_s[d][
|
||||
:, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
|
||||
]
|
||||
|
||||
X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
|
||||
# pdb.set_trace()
|
||||
|
||||
input_high_end_h = (
|
||||
self.mp.param["band"][1]["n_fft"] // 2
|
||||
- self.mp.param["band"][1]["crop_stop"]
|
||||
) + (self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"])
|
||||
input_high_end = x_spec_s[1][
|
||||
:,
|
||||
self.mp.param["band"][1]["n_fft"] // 2
|
||||
- input_high_end_h : self.mp.param["band"][1]["n_fft"] // 2,
|
||||
:,
|
||||
]
|
||||
x_spec_m = spec_utils.combine_spectrograms(x_spec_s, self.mp)
|
||||
aggresive_set = float(self.data["agg"] / 100)
|
||||
aggressiveness = {
|
||||
"value": aggresive_set,
|
||||
"split_bin": self.mp.param["band"][1]["crop_stop"],
|
||||
}
|
||||
with torch.no_grad():
|
||||
pred, X_mag, X_phase = inference(
|
||||
X_spec_m, self.device, self.model, aggressiveness, self.data
|
||||
pred, x_mag, x_phase = inference(
|
||||
x_spec_m, self.config.device, self.model, aggressiveness, self.data
|
||||
)
|
||||
# Postprocess
|
||||
if self.data["postprocess"]:
|
||||
pred_inv = np.clip(X_mag - pred, 0, np.inf)
|
||||
pred_inv = np.clip(x_mag - pred, 0, np.inf)
|
||||
pred = spec_utils.mask_silence(pred, pred_inv)
|
||||
y_spec_m = pred * X_phase
|
||||
v_spec_m = X_spec_m - y_spec_m
|
||||
y_spec_m = pred * x_phase
|
||||
v_spec_m = x_spec_m - y_spec_m
|
||||
|
||||
if ins_root is not None:
|
||||
if self.data["high_end_process"].startswith("mirroring"):
|
||||
input_high_end_ = spec_utils.mirroring(
|
||||
self.data["high_end_process"], y_spec_m, input_high_end, self.mp
|
||||
)
|
||||
wav_instrument = spec_utils.cmb_spectrogram_to_wave(
|
||||
y_spec_m, self.mp, input_high_end_h, input_high_end_
|
||||
)
|
||||
else:
|
||||
wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
|
||||
logger.info("%s instruments done" % name)
|
||||
if is_hp3:
|
||||
head = "vocal_"
|
||||
else:
|
||||
head = "instrument_"
|
||||
if format in ["wav", "flac"]:
|
||||
sf.write(
|
||||
os.path.join(
|
||||
ins_root,
|
||||
head + f"{name}_{self.data['agg']}.{format}",
|
||||
),
|
||||
(np.array(wav_instrument) * 32768).astype("int16"),
|
||||
self.mp.param["sr"],
|
||||
) #
|
||||
else:
|
||||
path = os.path.join(ins_root, head + f"{name}_{self.data['agg']}.wav")
|
||||
sf.write(
|
||||
path,
|
||||
(np.array(wav_instrument) * 32768).astype("int16"),
|
||||
self.mp.param["sr"],
|
||||
)
|
||||
if os.path.exists(path):
|
||||
opt_format_path = path[:-4] + ".%s" % format
|
||||
os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path))
|
||||
if os.path.exists(opt_format_path):
|
||||
try:
|
||||
os.remove(path)
|
||||
except Exception:
|
||||
pass
|
||||
if vocal_root is not None:
|
||||
head = "instrument_" if is_hp3 else "vocal_"
|
||||
if self.data["high_end_process"].startswith("mirroring"):
|
||||
input_high_end_ = spec_utils.mirroring(
|
||||
self.data["high_end_process"], v_spec_m, input_high_end, self.mp
|
||||
)
|
||||
wav_vocals = spec_utils.cmb_spectrogram_to_wave(
|
||||
v_spec_m, self.mp, input_high_end_h, input_high_end_
|
||||
)
|
||||
else:
|
||||
wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
|
||||
logger.info(f"{name} vocals done")
|
||||
if format in ["wav", "flac"]:
|
||||
sf.write(
|
||||
os.path.join(
|
||||
vocal_root,
|
||||
head + f"{name}_{self.data['agg']}.{format}",
|
||||
),
|
||||
(np.array(wav_vocals) * 32768).astype("int16"),
|
||||
self.mp.param["sr"],
|
||||
)
|
||||
else:
|
||||
path = os.path.join(vocal_root, head + f"{name}_{self.data['agg']}.wav")
|
||||
sf.write(
|
||||
path,
|
||||
(np.array(wav_vocals) * 32768).astype("int16"),
|
||||
self.mp.param["sr"],
|
||||
)
|
||||
if os.path.exists(path):
|
||||
opt_format_path = path[:-4] + f".{format}"
|
||||
os.system(f"ffmpeg -i {path} -vn {opt_format_path} -q:a 2 -y")
|
||||
if os.path.exists(opt_format_path):
|
||||
try:
|
||||
os.remove(path)
|
||||
except:
|
||||
pass
|
||||
|
||||
|
||||
class AudioPreDeEcho:
|
||||
def __init__(self, agg, model_path, device, is_half, tta=False):
|
||||
self.model_path = model_path
|
||||
self.device = device
|
||||
self.data = {
|
||||
# Processing Options
|
||||
"postprocess": False,
|
||||
"tta": tta,
|
||||
# Constants
|
||||
"window_size": 512,
|
||||
"agg": agg,
|
||||
"high_end_process": "mirroring",
|
||||
}
|
||||
mp = ModelParameters("rvc/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json")
|
||||
nout = 64 if "DeReverb" in model_path else 48
|
||||
model = CascadedNet(mp.param["bins"] * 2, nout)
|
||||
cpk = torch.load(model_path, map_location="cpu")
|
||||
model.load_state_dict(cpk)
|
||||
model.eval()
|
||||
if is_half:
|
||||
model = model.half().to(device)
|
||||
else:
|
||||
model = model.to(device)
|
||||
|
||||
self.mp = mp
|
||||
self.model = model
|
||||
|
||||
def _path_audio_(
|
||||
self, music_file, vocal_root=None, ins_root=None, format="flac", is_hp3=False
|
||||
): # 3个VR模型vocal和ins是反的
|
||||
name = os.path.basename(music_file)
|
||||
if (ins_root and vocal_root) is None:
|
||||
return "No save root."
|
||||
else:
|
||||
os.makedirs(ins_root, exist_ok=True)
|
||||
os.makedirs(vocal_root, exist_ok=True)
|
||||
|
||||
X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
|
||||
bands_n = len(self.mp.param["band"])
|
||||
# print(bands_n)
|
||||
for d in range(bands_n, 0, -1):
|
||||
bp = self.mp.param["band"][d]
|
||||
if d == bands_n: # high-end band
|
||||
# librosa loading may be buggy for some audio. ffmpeg will solve this, but it's a pain
|
||||
(
|
||||
X_wave[d],
|
||||
_,
|
||||
) = librosa.core.load(
|
||||
music_file,
|
||||
bp["sr"],
|
||||
False,
|
||||
dtype=np.float32,
|
||||
res_type=bp["res_type"],
|
||||
)
|
||||
if X_wave[d].ndim == 1:
|
||||
X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
|
||||
else: # lower bands
|
||||
X_wave[d] = librosa.core.resample(
|
||||
X_wave[d + 1],
|
||||
self.mp.param["band"][d + 1]["sr"],
|
||||
bp["sr"],
|
||||
res_type=bp["res_type"],
|
||||
)
|
||||
# Stft of wave source
|
||||
X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
|
||||
X_wave[d],
|
||||
bp["hl"],
|
||||
bp["n_fft"],
|
||||
self.mp.param["mid_side"],
|
||||
self.mp.param["mid_side_b2"],
|
||||
self.mp.param["reverse"],
|
||||
if self.data["high_end_process"].startswith("mirroring"):
|
||||
input_high_end_ = spec_utils.mirroring(
|
||||
self.data["high_end_process"], y_spec_m, input_high_end, self.mp
|
||||
)
|
||||
# pdb.set_trace()
|
||||
if d == bands_n and self.data["high_end_process"] != "none":
|
||||
input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
|
||||
self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
|
||||
)
|
||||
input_high_end = X_spec_s[d][
|
||||
:, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
|
||||
]
|
||||
|
||||
X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
|
||||
aggresive_set = float(self.data["agg"] / 100)
|
||||
aggressiveness = {
|
||||
"value": aggresive_set,
|
||||
"split_bin": self.mp.param["band"][1]["crop_stop"],
|
||||
}
|
||||
with torch.no_grad():
|
||||
pred, X_mag, X_phase = inference(
|
||||
X_spec_m, self.device, self.model, aggressiveness, self.data
|
||||
wav_instrument = spec_utils.cmb_spectrogram_to_wave(
|
||||
y_spec_m, self.mp, input_high_end_h, input_high_end_
|
||||
)
|
||||
# Postprocess
|
||||
if self.data["postprocess"]:
|
||||
pred_inv = np.clip(X_mag - pred, 0, np.inf)
|
||||
pred = spec_utils.mask_silence(pred, pred_inv)
|
||||
y_spec_m = pred * X_phase
|
||||
v_spec_m = X_spec_m - y_spec_m
|
||||
input_high_end_ = spec_utils.mirroring(
|
||||
self.data["high_end_process"], v_spec_m, input_high_end, self.mp
|
||||
)
|
||||
wav_vocals = spec_utils.cmb_spectrogram_to_wave(
|
||||
v_spec_m, self.mp, input_high_end_h, input_high_end_
|
||||
)
|
||||
else:
|
||||
wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
|
||||
wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
|
||||
|
||||
if ins_root is not None:
|
||||
if self.data["high_end_process"].startswith("mirroring"):
|
||||
input_high_end_ = spec_utils.mirroring(
|
||||
self.data["high_end_process"], y_spec_m, input_high_end, self.mp
|
||||
)
|
||||
wav_instrument = spec_utils.cmb_spectrogram_to_wave(
|
||||
y_spec_m, self.mp, input_high_end_h, input_high_end_
|
||||
)
|
||||
else:
|
||||
wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
|
||||
logger.info("%s instruments done" % name)
|
||||
if format in ["wav", "flac"]:
|
||||
sf.write(
|
||||
os.path.join(
|
||||
ins_root,
|
||||
"instrument_{}_{}.{}".format(name, self.data["agg"], format),
|
||||
),
|
||||
(np.array(wav_instrument) * 32768).astype("int16"),
|
||||
self.mp.param["sr"],
|
||||
) #
|
||||
else:
|
||||
path = os.path.join(
|
||||
ins_root, "instrument_{}_{}.wav".format(name, self.data["agg"])
|
||||
)
|
||||
sf.write(
|
||||
path,
|
||||
(np.array(wav_instrument) * 32768).astype("int16"),
|
||||
self.mp.param["sr"],
|
||||
)
|
||||
if os.path.exists(path):
|
||||
opt_format_path = path[:-4] + ".%s" % format
|
||||
os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path))
|
||||
if os.path.exists(opt_format_path):
|
||||
try:
|
||||
os.remove(path)
|
||||
except:
|
||||
pass
|
||||
if vocal_root is not None:
|
||||
if self.data["high_end_process"].startswith("mirroring"):
|
||||
input_high_end_ = spec_utils.mirroring(
|
||||
self.data["high_end_process"], v_spec_m, input_high_end, self.mp
|
||||
)
|
||||
wav_vocals = spec_utils.cmb_spectrogram_to_wave(
|
||||
v_spec_m, self.mp, input_high_end_h, input_high_end_
|
||||
)
|
||||
else:
|
||||
wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
|
||||
logger.info("%s vocals done" % name)
|
||||
if format in ["wav", "flac"]:
|
||||
sf.write(
|
||||
os.path.join(
|
||||
vocal_root,
|
||||
"vocal_{}_{}.{}".format(name, self.data["agg"], format),
|
||||
),
|
||||
(np.array(wav_vocals) * 32768).astype("int16"),
|
||||
self.mp.param["sr"],
|
||||
)
|
||||
else:
|
||||
path = os.path.join(
|
||||
vocal_root, "vocal_{}_{}.wav".format(name, self.data["agg"])
|
||||
)
|
||||
sf.write(
|
||||
path,
|
||||
(np.array(wav_vocals) * 32768).astype("int16"),
|
||||
self.mp.param["sr"],
|
||||
)
|
||||
if os.path.exists(path):
|
||||
opt_format_path = path[:-4] + ".%s" % format
|
||||
os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path))
|
||||
if os.path.exists(opt_format_path):
|
||||
try:
|
||||
os.remove(path)
|
||||
except:
|
||||
pass
|
||||
return (
|
||||
(np.array(wav_instrument) * 32768).astype("int16"),
|
||||
(np.array(wav_vocals) * 32768).astype("int16"),
|
||||
self.mp.param["sr"],
|
||||
self.data["agg"],
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user