From 0c3c512396825ac418618f5235e4863dbba41e92 Mon Sep 17 00:00:00 2001
From: Ftps <ftpsflandre@gmail.com>
Date: Thu, 7 Mar 2024 21:42:07 +0900
Subject: [PATCH] rewrite vr-architecture

---
 pyproject.toml              |   2 +-
 rvc/modules/uvr5/mdxnet.py  |   2 +-
 rvc/modules/uvr5/modules.py |  53 ++----
 rvc/modules/uvr5/vr.py      | 368 +++++++-----------------------------
 4 files changed, 92 insertions(+), 333 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 5f5a450..c9bd07a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "rvc"
-version = "0.3.4"
+version = "0.3.5"
 description = "An easy-to-use Voice Conversion framework based on VITS."
 authors = ["Ftps <ftpsflandre@gmail.com>"]
 readme = "README.md"
diff --git a/rvc/modules/uvr5/mdxnet.py b/rvc/modules/uvr5/mdxnet.py
index 443bf36..f339cd3 100644
--- a/rvc/modules/uvr5/mdxnet.py
+++ b/rvc/modules/uvr5/mdxnet.py
@@ -243,7 +243,7 @@ class MDXNetDereverb:
         self.onnx = "assets/uvr5_weights/onnx_dereverb_By_FoxJoy"
         self.shifts = 10  # 'Predict with randomised equivariant stabilisation'
         self.mixing = "min_mag"  # ['default','min_mag','max_mag']
-        self.chunks = chunks
+        self.chunks = chunks  # 15
         self.margin = 44100
         self.dim_t = 9
         self.dim_f = 3072
diff --git a/rvc/modules/uvr5/modules.py b/rvc/modules/uvr5/modules.py
index cb61d54..224580c 100644
--- a/rvc/modules/uvr5/modules.py
+++ b/rvc/modules/uvr5/modules.py
@@ -10,7 +10,7 @@ from pydub import AudioSegment
 
 from rvc.configs.config import Config
 from rvc.modules.uvr5.mdxnet import MDXNetDereverb
-from rvc.modules.uvr5.vr import AudioPre, AudioPreDeEcho
+from rvc.modules.uvr5.vr import AudioPreprocess
 
 logger: logging.Logger = logging.getLogger(__name__)
 
@@ -23,37 +23,18 @@ class UVR:
     def uvr_wrapper(
         self,
         audio_path: Path,
-        save_vocal_path: Path | None = None,
-        save_ins_path: Path | None = None,
         agg: int = 10,
-        export_format: str = "flac",
         model_name: str | None = None,
-        temp_path: Path | None = None,
+        temp_dir: Path | None = None,
     ):
-        infos = []
-        save_vocal_path = (
-            os.getenv("save_uvr_path") if not save_vocal_path else save_vocal_path
-        )
-        save_ins_path = (
-            os.getenv("save_uvr_path") if not save_ins_path else save_ins_path
-        )
-
+        infos = list()
         if model_name is None:
             model_name = os.path.basename(glob(f"{os.getenv('weight_uvr5_root')}/*")[0])
-        is_hp3 = "HP3" in model_name
 
-        if model_name == "onnx_dereverb_By_FoxJoy":
-            pre_fun = MDXNetDereverb(15, self.config.device)
-        else:
-            func = AudioPre if "DeEcho" not in model_name else AudioPreDeEcho
-            pre_fun = func(
-                agg=int(agg),
-                model_path=os.path.join(
-                    os.getenv("weight_uvr5_root"), model_name  # + ".pth"
-                ),
-                device=self.config.device,
-                is_half=self.config.is_half,
-            )
+        pre_fun = AudioPreprocess(
+            os.path.join(os.getenv("weight_uvr5_root"), model_name),  # + ".pth"
+            int(agg),
+        )
 
         process_paths = (
             [
@@ -65,12 +46,16 @@ class UVR:
             else audio_path
         )
 
+        results = []
+
         for process_path in [process_paths]:
+            # Reset per file so an unbound or stale temp path is never used.
+            tmp_path = None
             print(f"path: {process_path}")
             info = sf.info(process_path)
             if not (info.channels == 2 and info.samplerate == "44100"):
                 tmp_path = os.path.join(
-                    temp_path or os.environ.get("TEMP"), os.path.basename(process_path)
+                    temp_dir or os.environ.get("TEMP"), os.path.basename(process_path)
                 )
                 AudioSegment.from_file(process_path).export(
                     tmp_path,
@@ -80,14 +65,14 @@ class UVR:
                     parameters=["-ar", "44100"],
                 )
 
-            pre_fun._path_audio_(
-                process_path,
-                save_vocal_path,
-                save_ins_path,
-                export_format,
-                is_hp3=is_hp3,
+            results.append(
+                pre_fun.process(
+                    tmp_path or process_path,
+                )
             )
             infos.append(f"{os.path.basename(process_path)}->Success")
-            yield "\n".join(infos)
+
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
+
+        return results
diff --git a/rvc/modules/uvr5/vr.py b/rvc/modules/uvr5/vr.py
index a08b066..caebe31 100644
--- a/rvc/modules/uvr5/vr.py
+++ b/rvc/modules/uvr5/vr.py
@@ -6,6 +6,7 @@ import numpy as np
 import soundfile as sf
 import torch
 
+from rvc.configs.config import Config
 from rvc.lib.uvr5_pack.lib_v5 import nets_61968KB as Nets
 from rvc.lib.uvr5_pack.lib_v5 import spec_utils
 from rvc.lib.uvr5_pack.lib_v5.model_param_init import ModelParameters
@@ -15,10 +16,9 @@ from rvc.lib.uvr5_pack.utils import inference
 logger = logging.getLogger(__name__)
 
 
-class AudioPre:
-    def __init__(self, agg, model_path, device, is_half, tta=False):
+class AudioPreprocess:
+    def __init__(self, model_path, agg, tta=False):
         self.model_path = model_path
-        self.device = device
         self.data = {
             # Processing Options
             "postprocess": False,
@@ -28,336 +28,110 @@ class AudioPre:
             "agg": agg,
             "high_end_process": "mirroring",
         }
-        mp = ModelParameters("rvc/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json")
-        model = Nets.CascadedASPPNet(mp.param["bins"] * 2)
-        cpk = torch.load(model_path, map_location="cpu")
-        model.load_state_dict(cpk)
-        model.eval()
-        if is_half:
-            model = model.half().to(device)
-        else:
-            model = model.to(device)
+        self.config: Config = Config()
+        # Pre-patch pairing: DeEcho models -> v3 params, all others -> v2.
+        self.version = 3 if "DeEcho" in self.model_path else 2
+        self.mp: ModelParameters = ModelParameters(
+            f"rvc/lib/uvr5_pack/lib_v5/modelparams/4band_v{self.version}.json"
+        )
+        bins = self.mp.param["bins"] * 2
+        if self.version == 2:
+            self.model = Nets.CascadedASPPNet(bins)
+        else:
+            self.model = CascadedNet(bins, 64 if "DeReverb" in model_path else 48)
+        # load_state_dict() does not return the module, so it cannot be chained.
+        self.model.load_state_dict(torch.load(model_path, map_location="cpu"))
+        self.model.eval()
+        if self.config.is_half:
+            self.model = self.model.half()
+        self.model.to(self.config.device)
 
-        self.mp = mp
-        self.model = model
-
-    def _path_audio_(
-        self, music_file, ins_root=None, vocal_root=None, format="flac", is_hp3=False
+    def process(
+        self,
+        music_file,
     ):
-        name = os.path.basename(music_file)
-        if (ins_root and vocal_root) is None:
-            return "No save root."
-        else:
-            os.makedirs(ins_root, exist_ok=True)
-            os.makedirs(vocal_root, exist_ok=True)
-
-        X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
+        x_wave, y_wave, x_spec_s, y_spec_s = {}, {}, {}, {}
         bands_n = len(self.mp.param["band"])
-        # print(bands_n)
+
         for d in range(bands_n, 0, -1):
             bp = self.mp.param["band"][d]
             if d == bands_n:  # high-end band
                 # librosa loading may be buggy for some audio. ffmpeg will solve this, but it's a pain
-                (
-                    X_wave[d],
-                    _,
-                ) = librosa.core.load(
+                x_wave[d] = librosa.core.load(
                     music_file,
                     sr=bp["sr"],
                     mono=False,
                     dtype=np.float32,
                     res_type=bp["res_type"],
-                )
-                if X_wave[d].ndim == 1:
-                    X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
+                )[0]
+                if x_wave[d].ndim == 1:
+                    x_wave[d] = np.asfortranarray([x_wave[d], x_wave[d]])
             else:  # lower bands
-                X_wave[d] = librosa.core.resample(
-                    X_wave[d + 1],
+                x_wave[d] = librosa.core.resample(
+                    x_wave[d + 1],
                     orig_sr=self.mp.param["band"][d + 1]["sr"],
                     target_sr=bp["sr"],
                     res_type=bp["res_type"],
                 )
             # Stft of wave source
-            X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
-                X_wave[d],
+            x_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
+                x_wave[d],
                 bp["hl"],
                 bp["n_fft"],
                 self.mp.param["mid_side"],
                 self.mp.param["mid_side_b2"],
                 self.mp.param["reverse"],
             )
-            # pdb.set_trace()
-            if d == bands_n and self.data["high_end_process"] != "none":
-                input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
-                    self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
-                )
-                input_high_end = X_spec_s[d][
-                    :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
-                ]
 
-        X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
+            # pdb.set_trace()
+
+        # The high-end slice is taken from the top band (index bands_n), the
+        # same band the pre-rewrite loop used under `if d == bands_n`.
+        top = self.mp.param["band"][bands_n]
+        input_high_end_h = (top["n_fft"] // 2 - top["crop_stop"]) + (
+            self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
+        )
+        input_high_end = x_spec_s[bands_n][
+            :, top["n_fft"] // 2 - input_high_end_h : top["n_fft"] // 2, :
+        ]
+
+        x_spec_m = spec_utils.combine_spectrograms(x_spec_s, self.mp)
         aggresive_set = float(self.data["agg"] / 100)
         aggressiveness = {
             "value": aggresive_set,
             "split_bin": self.mp.param["band"][1]["crop_stop"],
         }
         with torch.no_grad():
-            pred, X_mag, X_phase = inference(
-                X_spec_m, self.device, self.model, aggressiveness, self.data
+            pred, x_mag, x_phase = inference(
+                x_spec_m, self.config.device, self.model, aggressiveness, self.data
             )
         # Postprocess
         if self.data["postprocess"]:
-            pred_inv = np.clip(X_mag - pred, 0, np.inf)
+            pred_inv = np.clip(x_mag - pred, 0, np.inf)
             pred = spec_utils.mask_silence(pred, pred_inv)
-        y_spec_m = pred * X_phase
-        v_spec_m = X_spec_m - y_spec_m
+        y_spec_m = pred * x_phase
+        v_spec_m = x_spec_m - y_spec_m
 
-        if ins_root is not None:
-            if self.data["high_end_process"].startswith("mirroring"):
-                input_high_end_ = spec_utils.mirroring(
-                    self.data["high_end_process"], y_spec_m, input_high_end, self.mp
-                )
-                wav_instrument = spec_utils.cmb_spectrogram_to_wave(
-                    y_spec_m, self.mp, input_high_end_h, input_high_end_
-                )
-            else:
-                wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
-            logger.info("%s instruments done" % name)
-            if is_hp3:
-                head = "vocal_"
-            else:
-                head = "instrument_"
-            if format in ["wav", "flac"]:
-                sf.write(
-                    os.path.join(
-                        ins_root,
-                        head + f"{name}_{self.data['agg']}.{format}",
-                    ),
-                    (np.array(wav_instrument) * 32768).astype("int16"),
-                    self.mp.param["sr"],
-                )  #
-            else:
-                path = os.path.join(ins_root, head + f"{name}_{self.data['agg']}.wav")
-                sf.write(
-                    path,
-                    (np.array(wav_instrument) * 32768).astype("int16"),
-                    self.mp.param["sr"],
-                )
-                if os.path.exists(path):
-                    opt_format_path = path[:-4] + ".%s" % format
-                    os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path))
-                    if os.path.exists(opt_format_path):
-                        try:
-                            os.remove(path)
-                        except Exception:
-                            pass
-        if vocal_root is not None:
-            head = "instrument_" if is_hp3 else "vocal_"
-            if self.data["high_end_process"].startswith("mirroring"):
-                input_high_end_ = spec_utils.mirroring(
-                    self.data["high_end_process"], v_spec_m, input_high_end, self.mp
-                )
-                wav_vocals = spec_utils.cmb_spectrogram_to_wave(
-                    v_spec_m, self.mp, input_high_end_h, input_high_end_
-                )
-            else:
-                wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
-            logger.info(f"{name} vocals done")
-            if format in ["wav", "flac"]:
-                sf.write(
-                    os.path.join(
-                        vocal_root,
-                        head + f"{name}_{self.data['agg']}.{format}",
-                    ),
-                    (np.array(wav_vocals) * 32768).astype("int16"),
-                    self.mp.param["sr"],
-                )
-            else:
-                path = os.path.join(vocal_root, head + f"{name}_{self.data['agg']}.wav")
-                sf.write(
-                    path,
-                    (np.array(wav_vocals) * 32768).astype("int16"),
-                    self.mp.param["sr"],
-                )
-                if os.path.exists(path):
-                    opt_format_path = path[:-4] + f".{format}"
-                    os.system(f"ffmpeg -i {path} -vn {opt_format_path} -q:a 2 -y")
-                    if os.path.exists(opt_format_path):
-                        try:
-                            os.remove(path)
-                        except:
-                            pass
-
-
-class AudioPreDeEcho:
-    def __init__(self, agg, model_path, device, is_half, tta=False):
-        self.model_path = model_path
-        self.device = device
-        self.data = {
-            # Processing Options
-            "postprocess": False,
-            "tta": tta,
-            # Constants
-            "window_size": 512,
-            "agg": agg,
-            "high_end_process": "mirroring",
-        }
-        mp = ModelParameters("rvc/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json")
-        nout = 64 if "DeReverb" in model_path else 48
-        model = CascadedNet(mp.param["bins"] * 2, nout)
-        cpk = torch.load(model_path, map_location="cpu")
-        model.load_state_dict(cpk)
-        model.eval()
-        if is_half:
-            model = model.half().to(device)
-        else:
-            model = model.to(device)
-
-        self.mp = mp
-        self.model = model
-
-    def _path_audio_(
-        self, music_file, vocal_root=None, ins_root=None, format="flac", is_hp3=False
-    ):  # 3个VR模型vocal和ins是反的
-        name = os.path.basename(music_file)
-        if (ins_root and vocal_root) is None:
-            return "No save root."
-        else:
-            os.makedirs(ins_root, exist_ok=True)
-            os.makedirs(vocal_root, exist_ok=True)
-
-        X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
-        bands_n = len(self.mp.param["band"])
-        # print(bands_n)
-        for d in range(bands_n, 0, -1):
-            bp = self.mp.param["band"][d]
-            if d == bands_n:  # high-end band
-                # librosa loading may be buggy for some audio. ffmpeg will solve this, but it's a pain
-                (
-                    X_wave[d],
-                    _,
-                ) = librosa.core.load(
-                    music_file,
-                    bp["sr"],
-                    False,
-                    dtype=np.float32,
-                    res_type=bp["res_type"],
-                )
-                if X_wave[d].ndim == 1:
-                    X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
-            else:  # lower bands
-                X_wave[d] = librosa.core.resample(
-                    X_wave[d + 1],
-                    self.mp.param["band"][d + 1]["sr"],
-                    bp["sr"],
-                    res_type=bp["res_type"],
-                )
-            # Stft of wave source
-            X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
-                X_wave[d],
-                bp["hl"],
-                bp["n_fft"],
-                self.mp.param["mid_side"],
-                self.mp.param["mid_side_b2"],
-                self.mp.param["reverse"],
+        if self.data["high_end_process"].startswith("mirroring"):
+            input_high_end_ = spec_utils.mirroring(
+                self.data["high_end_process"], y_spec_m, input_high_end, self.mp
             )
-            # pdb.set_trace()
-            if d == bands_n and self.data["high_end_process"] != "none":
-                input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
-                    self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
-                )
-                input_high_end = X_spec_s[d][
-                    :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
-                ]
-
-        X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
-        aggresive_set = float(self.data["agg"] / 100)
-        aggressiveness = {
-            "value": aggresive_set,
-            "split_bin": self.mp.param["band"][1]["crop_stop"],
-        }
-        with torch.no_grad():
-            pred, X_mag, X_phase = inference(
-                X_spec_m, self.device, self.model, aggressiveness, self.data
+            wav_instrument = spec_utils.cmb_spectrogram_to_wave(
+                y_spec_m, self.mp, input_high_end_h, input_high_end_
             )
-        # Postprocess
-        if self.data["postprocess"]:
-            pred_inv = np.clip(X_mag - pred, 0, np.inf)
-            pred = spec_utils.mask_silence(pred, pred_inv)
-        y_spec_m = pred * X_phase
-        v_spec_m = X_spec_m - y_spec_m
+            input_high_end_ = spec_utils.mirroring(
+                self.data["high_end_process"], v_spec_m, input_high_end, self.mp
+            )
+            wav_vocals = spec_utils.cmb_spectrogram_to_wave(
+                v_spec_m, self.mp, input_high_end_h, input_high_end_
+            )
+        else:
+            wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
+            wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
 
-        if ins_root is not None:
-            if self.data["high_end_process"].startswith("mirroring"):
-                input_high_end_ = spec_utils.mirroring(
-                    self.data["high_end_process"], y_spec_m, input_high_end, self.mp
-                )
-                wav_instrument = spec_utils.cmb_spectrogram_to_wave(
-                    y_spec_m, self.mp, input_high_end_h, input_high_end_
-                )
-            else:
-                wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
-            logger.info("%s instruments done" % name)
-            if format in ["wav", "flac"]:
-                sf.write(
-                    os.path.join(
-                        ins_root,
-                        "instrument_{}_{}.{}".format(name, self.data["agg"], format),
-                    ),
-                    (np.array(wav_instrument) * 32768).astype("int16"),
-                    self.mp.param["sr"],
-                )  #
-            else:
-                path = os.path.join(
-                    ins_root, "instrument_{}_{}.wav".format(name, self.data["agg"])
-                )
-                sf.write(
-                    path,
-                    (np.array(wav_instrument) * 32768).astype("int16"),
-                    self.mp.param["sr"],
-                )
-                if os.path.exists(path):
-                    opt_format_path = path[:-4] + ".%s" % format
-                    os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path))
-                    if os.path.exists(opt_format_path):
-                        try:
-                            os.remove(path)
-                        except:
-                            pass
-        if vocal_root is not None:
-            if self.data["high_end_process"].startswith("mirroring"):
-                input_high_end_ = spec_utils.mirroring(
-                    self.data["high_end_process"], v_spec_m, input_high_end, self.mp
-                )
-                wav_vocals = spec_utils.cmb_spectrogram_to_wave(
-                    v_spec_m, self.mp, input_high_end_h, input_high_end_
-                )
-            else:
-                wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
-            logger.info("%s vocals done" % name)
-            if format in ["wav", "flac"]:
-                sf.write(
-                    os.path.join(
-                        vocal_root,
-                        "vocal_{}_{}.{}".format(name, self.data["agg"], format),
-                    ),
-                    (np.array(wav_vocals) * 32768).astype("int16"),
-                    self.mp.param["sr"],
-                )
-            else:
-                path = os.path.join(
-                    vocal_root, "vocal_{}_{}.wav".format(name, self.data["agg"])
-                )
-                sf.write(
-                    path,
-                    (np.array(wav_vocals) * 32768).astype("int16"),
-                    self.mp.param["sr"],
-                )
-                if os.path.exists(path):
-                    opt_format_path = path[:-4] + ".%s" % format
-                    os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path))
-                    if os.path.exists(opt_format_path):
-                        try:
-                            os.remove(path)
-                        except:
-                            pass
+        return (
+            (np.array(wav_instrument) * 32768).astype("int16"),
+            (np.array(wav_vocals) * 32768).astype("int16"),
+            self.mp.param["sr"],
+            self.data["agg"],
+        )