rewrite vr-architecture

2024-03-07 21:42:07 +09:00
parent 1b2808e332
commit 0c3c512396
4 changed files with 92 additions and 333 deletions
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "rvc"
-version = "0.3.4"
+version = "0.3.5"
 description = "An easy-to-use Voice Conversion framework based on VITS."
 authors = ["Ftps <ftpsflandre@gmail.com>"]
 readme = "README.md"
@@ -243,7 +243,7 @@ class MDXNetDereverb:
        self.onnx = "assets/uvr5_weights/onnx_dereverb_By_FoxJoy"
        self.shifts = 10  # 'Predict with randomised equivariant stabilisation'
        self.mixing = "min_mag"  # ['default','min_mag','max_mag']
-        self.chunks = chunks
+        self.chunks = chunks  # 15
        self.margin = 44100
        self.dim_t = 9
        self.dim_f = 3072
@@ -10,7 +10,7 @@ from pydub import AudioSegment

 from rvc.configs.config import Config
 from rvc.modules.uvr5.mdxnet import MDXNetDereverb
-from rvc.modules.uvr5.vr import AudioPre, AudioPreDeEcho
+from rvc.modules.uvr5.vr import AudioPreprocess

 logger: logging.Logger = logging.getLogger(__name__)

@@ -23,37 +23,18 @@ class UVR:
    def uvr_wrapper(
        self,
        audio_path: Path,
-        save_vocal_path: Path | None = None,
-        save_ins_path: Path | None = None,
        agg: int = 10,
-        export_format: str = "flac",
        model_name: str | None = None,
-        temp_path: Path | None = None,
+        temp_dir: Path | None = None,
    ):
-        infos = []
-        save_vocal_path = (
-            os.getenv("save_uvr_path") if not save_vocal_path else save_vocal_path
-        )
-        save_ins_path = (
-            os.getenv("save_uvr_path") if not save_ins_path else save_ins_path
-        )
-
+        infos = list()
        if model_name is None:
            model_name = os.path.basename(glob(f"{os.getenv('weight_uvr5_root')}/*")[0])
-        is_hp3 = "HP3" in model_name

-        if model_name == "onnx_dereverb_By_FoxJoy":
-            pre_fun = MDXNetDereverb(15, self.config.device)
-        else:
-            func = AudioPre if "DeEcho" not in model_name else AudioPreDeEcho
-            pre_fun = func(
-                agg=int(agg),
-                model_path=os.path.join(
-                    os.getenv("weight_uvr5_root"), model_name  # + ".pth"
-                ),
-                device=self.config.device,
-                is_half=self.config.is_half,
-            )
+        pre_fun = AudioPreprocess(
+            os.path.join(os.getenv("weight_uvr5_root"), model_name),  # + ".pth"
+            int(agg),
+        )

        process_paths = (
            [
@@ -65,12 +46,14 @@ class UVR:
            else audio_path
        )

+        results = []
+
        for process_path in [process_paths]:
            print(f"path: {process_path}")
            info = sf.info(process_path)
            if not (info.channels == 2 and info.samplerate == "44100"):
                tmp_path = os.path.join(
-                    temp_path or os.environ.get("TEMP"), os.path.basename(process_path)
+                    temp_dir or os.environ.get("TEMP"), os.path.basename(process_path)
                )
                AudioSegment.from_file(process_path).export(
                    tmp_path,
@@ -80,14 +63,16 @@ class UVR:
                    parameters=["-ar", "44100"],
                )

-            pre_fun._path_audio_(
-                process_path,
-                save_vocal_path,
-                save_ins_path,
-                export_format,
-                is_hp3=is_hp3,
+            results.append(
+
+                    pre_fun.process(
+                        tmp_path or process_path,
+                    )
+                
            )
            infos.append(f"{os.path.basename(process_path)}->Success")
-            yield "\n".join(infos)
+
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
+
+        return results
@@ -6,6 +6,7 @@ import numpy as np
 import soundfile as sf
 import torch

+from rvc.configs.config import Config
 from rvc.lib.uvr5_pack.lib_v5 import nets_61968KB as Nets
 from rvc.lib.uvr5_pack.lib_v5 import spec_utils
 from rvc.lib.uvr5_pack.lib_v5.model_param_init import ModelParameters
@@ -15,10 +16,9 @@ from rvc.lib.uvr5_pack.utils import inference
 logger = logging.getLogger(__name__)


-class AudioPre:
-    def __init__(self, agg, model_path, device, is_half, tta=False):
+class AudioPreprocess:
+    def __init__(self, model_path, agg, tta=False):
        self.model_path = model_path
-        self.device = device
        self.data = {
            # Processing Options
            "postprocess": False,
@@ -28,336 +28,110 @@ class AudioPre:
            "agg": agg,
            "high_end_process": "mirroring",
        }
-        mp = ModelParameters("rvc/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json")
-        model = Nets.CascadedASPPNet(mp.param["bins"] * 2)
-        cpk = torch.load(model_path, map_location="cpu")
-        model.load_state_dict(cpk)
-        model.eval()
-        if is_half:
-            model = model.half().to(device)
-        else:
-            model = model.to(device)
+        self.config: Config = Config()
+        self.version = 3 if "DeEcho" not in self.model_path else 2
+        self.mp: ModelParameters = ModelParameters(
+            f"rvc/lib/uvr5_pack/lib_v5/modelparams/4band_v{self.version}.json"
+        )
+        self.model = (
+            Nets.CascadedASPPNet(self.mp.param["bins"] * 2)
+            if self.version == 3
+            else CascadedNet(
+                self.mp.param["bins"] * 2, 64 if "DeReverb" in model_path else 48
+            )
+            .load_state_dict(torch.load(model_path, map_location="cpu"))
+            .eval()
+        )
+        if self.config.is_half:
+            self.model = self.model.half()
+        self.model.to(self.config.device)

-        self.mp = mp
-        self.model = model
-
-    def _path_audio_(
-        self, music_file, ins_root=None, vocal_root=None, format="flac", is_hp3=False
+    def process(
+        self,
+        music_file,
    ):
-        name = os.path.basename(music_file)
-        if (ins_root and vocal_root) is None:
-            return "No save root."
-        else:
-            os.makedirs(ins_root, exist_ok=True)
-            os.makedirs(vocal_root, exist_ok=True)
-
-        X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
+        x_wave, y_wave, x_spec_s, y_spec_s = {}, {}, {}, {}
        bands_n = len(self.mp.param["band"])
-        # print(bands_n)
+
        for d in range(bands_n, 0, -1):
            bp = self.mp.param["band"][d]
            if d == bands_n:  # high-end band
                # librosa loading may be buggy for some audio. ffmpeg will solve this, but it's a pain
-                (
-                    X_wave[d],
-                    _,
-                ) = librosa.core.load(
+                x_wave[d] = librosa.core.load(
                    music_file,
                    sr=bp["sr"],
                    mono=False,
                    dtype=np.float32,
                    res_type=bp["res_type"],
-                )
-                if X_wave[d].ndim == 1:
-                    X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
+                )[0]
+                if x_wave[d].ndim == 1:
+                    x_wave[d] = np.asfortranarray([x_wave[d], x_wave[d]])
            else:  # lower bands
-                X_wave[d] = librosa.core.resample(
-                    X_wave[d + 1],
+                x_wave[d] = librosa.core.resample(
+                    x_wave[d + 1],
                    orig_sr=self.mp.param["band"][d + 1]["sr"],
                    target_sr=bp["sr"],
                    res_type=bp["res_type"],
                )
            # Stft of wave source
-            X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
-                X_wave[d],
+            x_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
+                x_wave[d],
                bp["hl"],
                bp["n_fft"],
                self.mp.param["mid_side"],
                self.mp.param["mid_side_b2"],
                self.mp.param["reverse"],
            )
-            # pdb.set_trace()
-            if d == bands_n and self.data["high_end_process"] != "none":
-                input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
-                    self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
-                )
-                input_high_end = X_spec_s[d][
-                    :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
-                ]

-        X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
+            # pdb.set_trace()
+
+        input_high_end_h = (
+            self.mp.param["band"][1]["n_fft"] // 2
+            - self.mp.param["band"][1]["crop_stop"]
+        ) + (self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"])
+        input_high_end = x_spec_s[1][
+            :,
+            self.mp.param["band"][1]["n_fft"] // 2
+            - input_high_end_h : self.mp.param["band"][1]["n_fft"] // 2,
+            :,
+        ]
+        x_spec_m = spec_utils.combine_spectrograms(x_spec_s, self.mp)
        aggresive_set = float(self.data["agg"] / 100)
        aggressiveness = {
            "value": aggresive_set,
            "split_bin": self.mp.param["band"][1]["crop_stop"],
        }
        with torch.no_grad():
-            pred, X_mag, X_phase = inference(
-                X_spec_m, self.device, self.model, aggressiveness, self.data
+            pred, x_mag, x_phase = inference(
+                x_spec_m, self.config.device, self.model, aggressiveness, self.data
            )
        # Postprocess
        if self.data["postprocess"]:
-            pred_inv = np.clip(X_mag - pred, 0, np.inf)
+            pred_inv = np.clip(x_mag - pred, 0, np.inf)
            pred = spec_utils.mask_silence(pred, pred_inv)
-        y_spec_m = pred * X_phase
-        v_spec_m = X_spec_m - y_spec_m
+        y_spec_m = pred * x_phase
+        v_spec_m = x_spec_m - y_spec_m

-        if ins_root is not None:
-            if self.data["high_end_process"].startswith("mirroring"):
-                input_high_end_ = spec_utils.mirroring(
-                    self.data["high_end_process"], y_spec_m, input_high_end, self.mp
-                )
-                wav_instrument = spec_utils.cmb_spectrogram_to_wave(
-                    y_spec_m, self.mp, input_high_end_h, input_high_end_
-                )
-            else:
-                wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
-            logger.info("%s instruments done" % name)
-            if is_hp3:
-                head = "vocal_"
-            else:
-                head = "instrument_"
-            if format in ["wav", "flac"]:
-                sf.write(
-                    os.path.join(
-                        ins_root,
-                        head + f"{name}_{self.data['agg']}.{format}",
-                    ),
-                    (np.array(wav_instrument) * 32768).astype("int16"),
-                    self.mp.param["sr"],
-                )  #
-            else:
-                path = os.path.join(ins_root, head + f"{name}_{self.data['agg']}.wav")
-                sf.write(
-                    path,
-                    (np.array(wav_instrument) * 32768).astype("int16"),
-                    self.mp.param["sr"],
-                )
-                if os.path.exists(path):
-                    opt_format_path = path[:-4] + ".%s" % format
-                    os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path))
-                    if os.path.exists(opt_format_path):
-                        try:
-                            os.remove(path)
-                        except Exception:
-                            pass
-        if vocal_root is not None:
-            head = "instrument_" if is_hp3 else "vocal_"
-            if self.data["high_end_process"].startswith("mirroring"):
-                input_high_end_ = spec_utils.mirroring(
-                    self.data["high_end_process"], v_spec_m, input_high_end, self.mp
-                )
-                wav_vocals = spec_utils.cmb_spectrogram_to_wave(
-                    v_spec_m, self.mp, input_high_end_h, input_high_end_
-                )
-            else:
-                wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
-            logger.info(f"{name} vocals done")
-            if format in ["wav", "flac"]:
-                sf.write(
-                    os.path.join(
-                        vocal_root,
-                        head + f"{name}_{self.data['agg']}.{format}",
-                    ),
-                    (np.array(wav_vocals) * 32768).astype("int16"),
-                    self.mp.param["sr"],
-                )
-            else:
-                path = os.path.join(vocal_root, head + f"{name}_{self.data['agg']}.wav")
-                sf.write(
-                    path,
-                    (np.array(wav_vocals) * 32768).astype("int16"),
-                    self.mp.param["sr"],
-                )
-                if os.path.exists(path):
-                    opt_format_path = path[:-4] + f".{format}"
-                    os.system(f"ffmpeg -i {path} -vn {opt_format_path} -q:a 2 -y")
-                    if os.path.exists(opt_format_path):
-                        try:
-                            os.remove(path)
-                        except:
-                            pass
-
-
-class AudioPreDeEcho:
-    def __init__(self, agg, model_path, device, is_half, tta=False):
-        self.model_path = model_path
-        self.device = device
-        self.data = {
-            # Processing Options
-            "postprocess": False,
-            "tta": tta,
-            # Constants
-            "window_size": 512,
-            "agg": agg,
-            "high_end_process": "mirroring",
-        }
-        mp = ModelParameters("rvc/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json")
-        nout = 64 if "DeReverb" in model_path else 48
-        model = CascadedNet(mp.param["bins"] * 2, nout)
-        cpk = torch.load(model_path, map_location="cpu")
-        model.load_state_dict(cpk)
-        model.eval()
-        if is_half:
-            model = model.half().to(device)
-        else:
-            model = model.to(device)
-
-        self.mp = mp
-        self.model = model
-
-    def _path_audio_(
-        self, music_file, vocal_root=None, ins_root=None, format="flac", is_hp3=False
-    ):  # 3个VR模型vocal和ins是反的
-        name = os.path.basename(music_file)
-        if (ins_root and vocal_root) is None:
-            return "No save root."
-        else:
-            os.makedirs(ins_root, exist_ok=True)
-            os.makedirs(vocal_root, exist_ok=True)
-
-        X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
-        bands_n = len(self.mp.param["band"])
-        # print(bands_n)
-        for d in range(bands_n, 0, -1):
-            bp = self.mp.param["band"][d]
-            if d == bands_n:  # high-end band
-                # librosa loading may be buggy for some audio. ffmpeg will solve this, but it's a pain
-                (
-                    X_wave[d],
-                    _,
-                ) = librosa.core.load(
-                    music_file,
-                    bp["sr"],
-                    False,
-                    dtype=np.float32,
-                    res_type=bp["res_type"],
-                )
-                if X_wave[d].ndim == 1:
-                    X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
-            else:  # lower bands
-                X_wave[d] = librosa.core.resample(
-                    X_wave[d + 1],
-                    self.mp.param["band"][d + 1]["sr"],
-                    bp["sr"],
-                    res_type=bp["res_type"],
-                )
-            # Stft of wave source
-            X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
-                X_wave[d],
-                bp["hl"],
-                bp["n_fft"],
-                self.mp.param["mid_side"],
-                self.mp.param["mid_side_b2"],
-                self.mp.param["reverse"],
+        if self.data["high_end_process"].startswith("mirroring"):
+            input_high_end_ = spec_utils.mirroring(
+                self.data["high_end_process"], y_spec_m, input_high_end, self.mp
            )
-            # pdb.set_trace()
-            if d == bands_n and self.data["high_end_process"] != "none":
-                input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
-                    self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
-                )
-                input_high_end = X_spec_s[d][
-                    :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
-                ]
-
-        X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
-        aggresive_set = float(self.data["agg"] / 100)
-        aggressiveness = {
-            "value": aggresive_set,
-            "split_bin": self.mp.param["band"][1]["crop_stop"],
-        }
-        with torch.no_grad():
-            pred, X_mag, X_phase = inference(
-                X_spec_m, self.device, self.model, aggressiveness, self.data
+            wav_instrument = spec_utils.cmb_spectrogram_to_wave(
+                y_spec_m, self.mp, input_high_end_h, input_high_end_
            )
-        # Postprocess
-        if self.data["postprocess"]:
-            pred_inv = np.clip(X_mag - pred, 0, np.inf)
-            pred = spec_utils.mask_silence(pred, pred_inv)
-        y_spec_m = pred * X_phase
-        v_spec_m = X_spec_m - y_spec_m
+            input_high_end_ = spec_utils.mirroring(
+                self.data["high_end_process"], v_spec_m, input_high_end, self.mp
+            )
+            wav_vocals = spec_utils.cmb_spectrogram_to_wave(
+                v_spec_m, self.mp, input_high_end_h, input_high_end_
+            )
+        else:
+            wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
+            wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)

-        if ins_root is not None:
-            if self.data["high_end_process"].startswith("mirroring"):
-                input_high_end_ = spec_utils.mirroring(
-                    self.data["high_end_process"], y_spec_m, input_high_end, self.mp
-                )
-                wav_instrument = spec_utils.cmb_spectrogram_to_wave(
-                    y_spec_m, self.mp, input_high_end_h, input_high_end_
-                )
-            else:
-                wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
-            logger.info("%s instruments done" % name)
-            if format in ["wav", "flac"]:
-                sf.write(
-                    os.path.join(
-                        ins_root,
-                        "instrument_{}_{}.{}".format(name, self.data["agg"], format),
-                    ),
-                    (np.array(wav_instrument) * 32768).astype("int16"),
-                    self.mp.param["sr"],
-                )  #
-            else:
-                path = os.path.join(
-                    ins_root, "instrument_{}_{}.wav".format(name, self.data["agg"])
-                )
-                sf.write(
-                    path,
-                    (np.array(wav_instrument) * 32768).astype("int16"),
-                    self.mp.param["sr"],
-                )
-                if os.path.exists(path):
-                    opt_format_path = path[:-4] + ".%s" % format
-                    os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path))
-                    if os.path.exists(opt_format_path):
-                        try:
-                            os.remove(path)
-                        except:
-                            pass
-        if vocal_root is not None:
-            if self.data["high_end_process"].startswith("mirroring"):
-                input_high_end_ = spec_utils.mirroring(
-                    self.data["high_end_process"], v_spec_m, input_high_end, self.mp
-                )
-                wav_vocals = spec_utils.cmb_spectrogram_to_wave(
-                    v_spec_m, self.mp, input_high_end_h, input_high_end_
-                )
-            else:
-                wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
-            logger.info("%s vocals done" % name)
-            if format in ["wav", "flac"]:
-                sf.write(
-                    os.path.join(
-                        vocal_root,
-                        "vocal_{}_{}.{}".format(name, self.data["agg"], format),
-                    ),
-                    (np.array(wav_vocals) * 32768).astype("int16"),
-                    self.mp.param["sr"],
-                )
-            else:
-                path = os.path.join(
-                    vocal_root, "vocal_{}_{}.wav".format(name, self.data["agg"])
-                )
-                sf.write(
-                    path,
-                    (np.array(wav_vocals) * 32768).astype("int16"),
-                    self.mp.param["sr"],
-                )
-                if os.path.exists(path):
-                    opt_format_path = path[:-4] + ".%s" % format
-                    os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path))
-                    if os.path.exists(opt_format_path):
-                        try:
-                            os.remove(path)
-                        except:
-                            pass
+        return (
+            (np.array(wav_instrument) * 32768).astype("int16"),
+            (np.array(wav_vocals) * 32768).astype("int16"),
+            self.mp.param["sr"],
+            self.data["agg"],
+        )