From 0c3c512396825ac418618f5235e4863dbba41e92 Mon Sep 17 00:00:00 2001 From: Ftps Date: Thu, 7 Mar 2024 21:42:07 +0900 Subject: [PATCH] rewrite vr-architecture --- pyproject.toml | 2 +- rvc/modules/uvr5/mdxnet.py | 2 +- rvc/modules/uvr5/modules.py | 53 ++---- rvc/modules/uvr5/vr.py | 368 +++++++----------------------------- 4 files changed, 92 insertions(+), 333 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5f5a450..c9bd07a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "rvc" -version = "0.3.4" +version = "0.3.5" description = "An easy-to-use Voice Conversion framework based on VITS." authors = ["Ftps "] readme = "README.md" diff --git a/rvc/modules/uvr5/mdxnet.py b/rvc/modules/uvr5/mdxnet.py index 443bf36..f339cd3 100644 --- a/rvc/modules/uvr5/mdxnet.py +++ b/rvc/modules/uvr5/mdxnet.py @@ -243,7 +243,7 @@ class MDXNetDereverb: self.onnx = "assets/uvr5_weights/onnx_dereverb_By_FoxJoy" self.shifts = 10 # 'Predict with randomised equivariant stabilisation' self.mixing = "min_mag" # ['default','min_mag','max_mag'] - self.chunks = chunks + self.chunks = chunks # 15 self.margin = 44100 self.dim_t = 9 self.dim_f = 3072 diff --git a/rvc/modules/uvr5/modules.py b/rvc/modules/uvr5/modules.py index cb61d54..224580c 100644 --- a/rvc/modules/uvr5/modules.py +++ b/rvc/modules/uvr5/modules.py @@ -10,7 +10,7 @@ from pydub import AudioSegment from rvc.configs.config import Config from rvc.modules.uvr5.mdxnet import MDXNetDereverb -from rvc.modules.uvr5.vr import AudioPre, AudioPreDeEcho +from rvc.modules.uvr5.vr import AudioPreprocess logger: logging.Logger = logging.getLogger(__name__) @@ -23,37 +23,18 @@ class UVR: def uvr_wrapper( self, audio_path: Path, - save_vocal_path: Path | None = None, - save_ins_path: Path | None = None, agg: int = 10, - export_format: str = "flac", model_name: str | None = None, - temp_path: Path | None = None, + temp_dir: Path | None = None, ): - infos = [] - save_vocal_path = ( - os.getenv("save_uvr_path") if not save_vocal_path else save_vocal_path - ) - save_ins_path = ( - os.getenv("save_uvr_path") if not save_ins_path else save_ins_path - ) - + infos = list() if model_name is None: model_name = os.path.basename(glob(f"{os.getenv('weight_uvr5_root')}/*")[0]) - is_hp3 = "HP3" in model_name - if model_name == "onnx_dereverb_By_FoxJoy": - pre_fun = MDXNetDereverb(15, self.config.device) - else: - func = AudioPre if "DeEcho" not in model_name else AudioPreDeEcho - pre_fun = func( - agg=int(agg), - model_path=os.path.join( - os.getenv("weight_uvr5_root"), model_name # + ".pth" - ), - device=self.config.device, - is_half=self.config.is_half, - ) + pre_fun = AudioPreprocess( + os.path.join(os.getenv("weight_uvr5_root"), model_name), # + ".pth" + int(agg), + ) process_paths = ( [ @@ -65,12 +46,14 @@ class UVR: else audio_path ) + results = [] + for process_path in [process_paths]: print(f"path: {process_path}") info = sf.info(process_path) if not (info.channels == 2 and info.samplerate == "44100"): tmp_path = os.path.join( - temp_path or os.environ.get("TEMP"), os.path.basename(process_path) + temp_dir or os.environ.get("TEMP"), os.path.basename(process_path) ) AudioSegment.from_file(process_path).export( tmp_path, @@ -80,14 +63,16 @@ class UVR: parameters=["-ar", "44100"], ) - pre_fun._path_audio_( - process_path, - save_vocal_path, - save_ins_path, - export_format, - is_hp3=is_hp3, + results.append( + + pre_fun.process( + tmp_path or process_path, + ) + ) infos.append(f"{os.path.basename(process_path)}->Success") - yield "\n".join(infos) + if torch.cuda.is_available(): torch.cuda.empty_cache() + + return results diff --git a/rvc/modules/uvr5/vr.py b/rvc/modules/uvr5/vr.py index a08b066..caebe31 100644 --- a/rvc/modules/uvr5/vr.py +++ b/rvc/modules/uvr5/vr.py @@ -6,6 +6,7 @@ import numpy as np import soundfile as sf import torch +from rvc.configs.config import Config from rvc.lib.uvr5_pack.lib_v5 import nets_61968KB as Nets from rvc.lib.uvr5_pack.lib_v5 import spec_utils from rvc.lib.uvr5_pack.lib_v5.model_param_init import ModelParameters @@ -15,10 +16,9 @@ from rvc.lib.uvr5_pack.utils import inference logger = logging.getLogger(__name__) -class AudioPre: - def __init__(self, agg, model_path, device, is_half, tta=False): +class AudioPreprocess: + def __init__(self, model_path, agg, tta=False): self.model_path = model_path - self.device = device self.data = { # Processing Options "postprocess": False, @@ -28,336 +28,110 @@ class AudioPre: "agg": agg, "high_end_process": "mirroring", } - mp = ModelParameters("rvc/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json") - model = Nets.CascadedASPPNet(mp.param["bins"] * 2) - cpk = torch.load(model_path, map_location="cpu") - model.load_state_dict(cpk) - model.eval() - if is_half: - model = model.half().to(device) - else: - model = model.to(device) + self.config: Config = Config() + self.version = 3 if "DeEcho" not in self.model_path else 2 + self.mp: ModelParameters = ModelParameters( + f"rvc/lib/uvr5_pack/lib_v5/modelparams/4band_v{self.version}.json" + ) + self.model = ( + Nets.CascadedASPPNet(self.mp.param["bins"] * 2) + if self.version == 3 + else CascadedNet( + self.mp.param["bins"] * 2, 64 if "DeReverb" in model_path else 48 + ) + .load_state_dict(torch.load(model_path, map_location="cpu")) + .eval() + ) + if self.config.is_half: + self.model = self.model.half() + self.model.to(self.config.device) - self.mp = mp - self.model = model - - def _path_audio_( - self, music_file, ins_root=None, vocal_root=None, format="flac", is_hp3=False + def process( + self, + music_file, ): - name = os.path.basename(music_file) - if (ins_root and vocal_root) is None: - return "No save root." - else: - os.makedirs(ins_root, exist_ok=True) - os.makedirs(vocal_root, exist_ok=True) - - X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {} + x_wave, y_wave, x_spec_s, y_spec_s = {}, {}, {}, {} bands_n = len(self.mp.param["band"]) - # print(bands_n) + for d in range(bands_n, 0, -1): bp = self.mp.param["band"][d] if d == bands_n: # high-end band # librosa loading may be buggy for some audio. ffmpeg will solve this, but it's a pain - ( - X_wave[d], - _, - ) = librosa.core.load( + x_wave[d] = librosa.core.load( music_file, sr=bp["sr"], mono=False, dtype=np.float32, res_type=bp["res_type"], - ) - if X_wave[d].ndim == 1: - X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]]) + )[0] + if x_wave[d].ndim == 1: + x_wave[d] = np.asfortranarray([x_wave[d], x_wave[d]]) else: # lower bands - X_wave[d] = librosa.core.resample( - X_wave[d + 1], + x_wave[d] = librosa.core.resample( + x_wave[d + 1], orig_sr=self.mp.param["band"][d + 1]["sr"], target_sr=bp["sr"], res_type=bp["res_type"], ) # Stft of wave source - X_spec_s[d] = spec_utils.wave_to_spectrogram_mt( - X_wave[d], + x_spec_s[d] = spec_utils.wave_to_spectrogram_mt( + x_wave[d], bp["hl"], bp["n_fft"], self.mp.param["mid_side"], self.mp.param["mid_side_b2"], self.mp.param["reverse"], ) - # pdb.set_trace() - if d == bands_n and self.data["high_end_process"] != "none": - input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + ( - self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"] - ) - input_high_end = X_spec_s[d][ - :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, : - ] - X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp) + # pdb.set_trace() + + input_high_end_h = ( + self.mp.param["band"][1]["n_fft"] // 2 + - self.mp.param["band"][1]["crop_stop"] + ) + (self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]) + input_high_end = x_spec_s[1][ + :, + self.mp.param["band"][1]["n_fft"] // 2 + - input_high_end_h : self.mp.param["band"][1]["n_fft"] // 2, + :, + ] + x_spec_m = spec_utils.combine_spectrograms(x_spec_s, self.mp) aggresive_set = float(self.data["agg"] / 100) aggressiveness = { "value": aggresive_set, "split_bin": self.mp.param["band"][1]["crop_stop"], } with torch.no_grad(): - pred, X_mag, X_phase = inference( - X_spec_m, self.device, self.model, aggressiveness, self.data + pred, x_mag, x_phase = inference( + x_spec_m, self.config.device, self.model, aggressiveness, self.data ) # Postprocess if self.data["postprocess"]: - pred_inv = np.clip(X_mag - pred, 0, np.inf) + pred_inv = np.clip(x_mag - pred, 0, np.inf) pred = spec_utils.mask_silence(pred, pred_inv) - y_spec_m = pred * X_phase - v_spec_m = X_spec_m - y_spec_m + y_spec_m = pred * x_phase + v_spec_m = x_spec_m - y_spec_m - if ins_root is not None: - if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], y_spec_m, input_high_end, self.mp - ) - wav_instrument = spec_utils.cmb_spectrogram_to_wave( - y_spec_m, self.mp, input_high_end_h, input_high_end_ - ) - else: - wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp) - logger.info("%s instruments done" % name) - if is_hp3: - head = "vocal_" - else: - head = "instrument_" - if format in ["wav", "flac"]: - sf.write( - os.path.join( - ins_root, - head + f"{name}_{self.data['agg']}.{format}", - ), - (np.array(wav_instrument) * 32768).astype("int16"), - self.mp.param["sr"], - ) # - else: - path = os.path.join(ins_root, head + f"{name}_{self.data['agg']}.wav") - sf.write( - path, - (np.array(wav_instrument) * 32768).astype("int16"), - self.mp.param["sr"], - ) - if os.path.exists(path): - opt_format_path = path[:-4] + ".%s" % format - os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path)) - if os.path.exists(opt_format_path): - try: - os.remove(path) - except Exception: - pass - if vocal_root is not None: - head = "instrument_" if is_hp3 else "vocal_" - if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], v_spec_m, input_high_end, self.mp - ) - wav_vocals = spec_utils.cmb_spectrogram_to_wave( - v_spec_m, self.mp, input_high_end_h, input_high_end_ - ) - else: - wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp) - logger.info(f"{name} vocals done") - if format in ["wav", "flac"]: - sf.write( - os.path.join( - vocal_root, - head + f"{name}_{self.data['agg']}.{format}", - ), - (np.array(wav_vocals) * 32768).astype("int16"), - self.mp.param["sr"], - ) - else: - path = os.path.join(vocal_root, head + f"{name}_{self.data['agg']}.wav") - sf.write( - path, - (np.array(wav_vocals) * 32768).astype("int16"), - self.mp.param["sr"], - ) - if os.path.exists(path): - opt_format_path = path[:-4] + f".{format}" - os.system(f"ffmpeg -i {path} -vn {opt_format_path} -q:a 2 -y") - if os.path.exists(opt_format_path): - try: - os.remove(path) - except: - pass - - -class AudioPreDeEcho: - def __init__(self, agg, model_path, device, is_half, tta=False): - self.model_path = model_path - self.device = device - self.data = { - # Processing Options - "postprocess": False, - "tta": tta, - # Constants - "window_size": 512, - "agg": agg, - "high_end_process": "mirroring", - } - mp = ModelParameters("rvc/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json") - nout = 64 if "DeReverb" in model_path else 48 - model = CascadedNet(mp.param["bins"] * 2, nout) - cpk = torch.load(model_path, map_location="cpu") - model.load_state_dict(cpk) - model.eval() - if is_half: - model = model.half().to(device) - else: - model = model.to(device) - - self.mp = mp - self.model = model - - def _path_audio_( - self, music_file, vocal_root=None, ins_root=None, format="flac", is_hp3=False - ): # 3个VR模型vocal和ins是反的 - name = os.path.basename(music_file) - if (ins_root and vocal_root) is None: - return "No save root." - else: - os.makedirs(ins_root, exist_ok=True) - os.makedirs(vocal_root, exist_ok=True) - - X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {} - bands_n = len(self.mp.param["band"]) - # print(bands_n) - for d in range(bands_n, 0, -1): - bp = self.mp.param["band"][d] - if d == bands_n: # high-end band - # librosa loading may be buggy for some audio. ffmpeg will solve this, but it's a pain - ( - X_wave[d], - _, - ) = librosa.core.load( - music_file, - bp["sr"], - False, - dtype=np.float32, - res_type=bp["res_type"], - ) - if X_wave[d].ndim == 1: - X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]]) - else: # lower bands - X_wave[d] = librosa.core.resample( - X_wave[d + 1], - self.mp.param["band"][d + 1]["sr"], - bp["sr"], - res_type=bp["res_type"], - ) - # Stft of wave source - X_spec_s[d] = spec_utils.wave_to_spectrogram_mt( - X_wave[d], - bp["hl"], - bp["n_fft"], - self.mp.param["mid_side"], - self.mp.param["mid_side_b2"], - self.mp.param["reverse"], + if self.data["high_end_process"].startswith("mirroring"): + input_high_end_ = spec_utils.mirroring( + self.data["high_end_process"], y_spec_m, input_high_end, self.mp ) - # pdb.set_trace() - if d == bands_n and self.data["high_end_process"] != "none": - input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + ( - self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"] - ) - input_high_end = X_spec_s[d][ - :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, : - ] - - X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp) - aggresive_set = float(self.data["agg"] / 100) - aggressiveness = { - "value": aggresive_set, - "split_bin": self.mp.param["band"][1]["crop_stop"], - } - with torch.no_grad(): - pred, X_mag, X_phase = inference( - X_spec_m, self.device, self.model, aggressiveness, self.data + wav_instrument = spec_utils.cmb_spectrogram_to_wave( + y_spec_m, self.mp, input_high_end_h, input_high_end_ ) - # Postprocess - if self.data["postprocess"]: - pred_inv = np.clip(X_mag - pred, 0, np.inf) - pred = spec_utils.mask_silence(pred, pred_inv) - y_spec_m = pred * X_phase - v_spec_m = X_spec_m - y_spec_m + input_high_end_ = spec_utils.mirroring( + self.data["high_end_process"], v_spec_m, input_high_end, self.mp + ) + wav_vocals = spec_utils.cmb_spectrogram_to_wave( + v_spec_m, self.mp, input_high_end_h, input_high_end_ + ) + else: + wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp) + wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp) - if ins_root is not None: - if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], y_spec_m, input_high_end, self.mp - ) - wav_instrument = spec_utils.cmb_spectrogram_to_wave( - y_spec_m, self.mp, input_high_end_h, input_high_end_ - ) - else: - wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp) - logger.info("%s instruments done" % name) - if format in ["wav", "flac"]: - sf.write( - os.path.join( - ins_root, - "instrument_{}_{}.{}".format(name, self.data["agg"], format), - ), - (np.array(wav_instrument) * 32768).astype("int16"), - self.mp.param["sr"], - ) # - else: - path = os.path.join( - ins_root, "instrument_{}_{}.wav".format(name, self.data["agg"]) - ) - sf.write( - path, - (np.array(wav_instrument) * 32768).astype("int16"), - self.mp.param["sr"], - ) - if os.path.exists(path): - opt_format_path = path[:-4] + ".%s" % format - os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path)) - if os.path.exists(opt_format_path): - try: - os.remove(path) - except: - pass - if vocal_root is not None: - if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], v_spec_m, input_high_end, self.mp - ) - wav_vocals = spec_utils.cmb_spectrogram_to_wave( - v_spec_m, self.mp, input_high_end_h, input_high_end_ - ) - else: - wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp) - logger.info("%s vocals done" % name) - if format in ["wav", "flac"]: - sf.write( - os.path.join( - vocal_root, - "vocal_{}_{}.{}".format(name, self.data["agg"], format), - ), - (np.array(wav_vocals) * 32768).astype("int16"), - self.mp.param["sr"], - ) - else: - path = os.path.join( - vocal_root, "vocal_{}_{}.wav".format(name, self.data["agg"]) - ) - sf.write( - path, - (np.array(wav_vocals) * 32768).astype("int16"), - self.mp.param["sr"], - ) - if os.path.exists(path): - opt_format_path = path[:-4] + ".%s" % format - os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path)) - if os.path.exists(opt_format_path): - try: - os.remove(path) - except: - pass + return ( + (np.array(wav_instrument) * 32768).astype("int16"), + (np.array(wav_vocals) * 32768).astype("int16"), + self.mp.param["sr"], + self.data["agg"], + )