rewrite vr-architecture

This commit is contained in:
Ftps
2024-03-07 21:42:07 +09:00
parent 1b2808e332
commit 0c3c512396
4 changed files with 92 additions and 333 deletions

View File

@@ -1,6 +1,6 @@
[tool.poetry]
name = "rvc"
version = "0.3.4"
version = "0.3.5"
description = "An easy-to-use Voice Conversion framework based on VITS."
authors = ["Ftps <ftpsflandre@gmail.com>"]
readme = "README.md"

View File

@@ -243,7 +243,7 @@ class MDXNetDereverb:
self.onnx = "assets/uvr5_weights/onnx_dereverb_By_FoxJoy"
self.shifts = 10 # 'Predict with randomised equivariant stabilisation'
self.mixing = "min_mag" # ['default','min_mag','max_mag']
self.chunks = chunks
self.chunks = chunks # 15
self.margin = 44100
self.dim_t = 9
self.dim_f = 3072

View File

@@ -10,7 +10,7 @@ from pydub import AudioSegment
from rvc.configs.config import Config
from rvc.modules.uvr5.mdxnet import MDXNetDereverb
from rvc.modules.uvr5.vr import AudioPre, AudioPreDeEcho
from rvc.modules.uvr5.vr import AudioPreprocess
logger: logging.Logger = logging.getLogger(__name__)
@@ -23,37 +23,18 @@ class UVR:
def uvr_wrapper(
self,
audio_path: Path,
save_vocal_path: Path | None = None,
save_ins_path: Path | None = None,
agg: int = 10,
export_format: str = "flac",
model_name: str | None = None,
temp_path: Path | None = None,
temp_dir: Path | None = None,
):
infos = []
save_vocal_path = (
os.getenv("save_uvr_path") if not save_vocal_path else save_vocal_path
)
save_ins_path = (
os.getenv("save_uvr_path") if not save_ins_path else save_ins_path
)
infos = list()
if model_name is None:
model_name = os.path.basename(glob(f"{os.getenv('weight_uvr5_root')}/*")[0])
is_hp3 = "HP3" in model_name
if model_name == "onnx_dereverb_By_FoxJoy":
pre_fun = MDXNetDereverb(15, self.config.device)
else:
func = AudioPre if "DeEcho" not in model_name else AudioPreDeEcho
pre_fun = func(
agg=int(agg),
model_path=os.path.join(
os.getenv("weight_uvr5_root"), model_name # + ".pth"
),
device=self.config.device,
is_half=self.config.is_half,
)
pre_fun = AudioPreprocess(
os.path.join(os.getenv("weight_uvr5_root"), model_name), # + ".pth"
int(agg),
)
process_paths = (
[
@@ -65,12 +46,14 @@ class UVR:
else audio_path
)
results = []
for process_path in [process_paths]:
print(f"path: {process_path}")
info = sf.info(process_path)
if not (info.channels == 2 and info.samplerate == "44100"):
tmp_path = os.path.join(
temp_path or os.environ.get("TEMP"), os.path.basename(process_path)
temp_dir or os.environ.get("TEMP"), os.path.basename(process_path)
)
AudioSegment.from_file(process_path).export(
tmp_path,
@@ -80,14 +63,16 @@ class UVR:
parameters=["-ar", "44100"],
)
pre_fun._path_audio_(
process_path,
save_vocal_path,
save_ins_path,
export_format,
is_hp3=is_hp3,
results.append(
pre_fun.process(
tmp_path or process_path,
)
)
infos.append(f"{os.path.basename(process_path)}->Success")
yield "\n".join(infos)
if torch.cuda.is_available():
torch.cuda.empty_cache()
return results

View File

@@ -6,6 +6,7 @@ import numpy as np
import soundfile as sf
import torch
from rvc.configs.config import Config
from rvc.lib.uvr5_pack.lib_v5 import nets_61968KB as Nets
from rvc.lib.uvr5_pack.lib_v5 import spec_utils
from rvc.lib.uvr5_pack.lib_v5.model_param_init import ModelParameters
@@ -15,10 +16,9 @@ from rvc.lib.uvr5_pack.utils import inference
logger = logging.getLogger(__name__)
class AudioPre:
def __init__(self, agg, model_path, device, is_half, tta=False):
class AudioPreprocess:
def __init__(self, model_path, agg, tta=False):
self.model_path = model_path
self.device = device
self.data = {
# Processing Options
"postprocess": False,
@@ -28,336 +28,110 @@ class AudioPre:
"agg": agg,
"high_end_process": "mirroring",
}
mp = ModelParameters("rvc/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json")
model = Nets.CascadedASPPNet(mp.param["bins"] * 2)
cpk = torch.load(model_path, map_location="cpu")
model.load_state_dict(cpk)
model.eval()
if is_half:
model = model.half().to(device)
else:
model = model.to(device)
self.config: Config = Config()
self.version = 3 if "DeEcho" not in self.model_path else 2
self.mp: ModelParameters = ModelParameters(
f"rvc/lib/uvr5_pack/lib_v5/modelparams/4band_v{self.version}.json"
)
self.model = (
Nets.CascadedASPPNet(self.mp.param["bins"] * 2)
if self.version == 3
else CascadedNet(
self.mp.param["bins"] * 2, 64 if "DeReverb" in model_path else 48
)
.load_state_dict(torch.load(model_path, map_location="cpu"))
.eval()
)
if self.config.is_half:
self.model = self.model.half()
self.model.to(self.config.device)
self.mp = mp
self.model = model
def _path_audio_(
self, music_file, ins_root=None, vocal_root=None, format="flac", is_hp3=False
def process(
self,
music_file,
):
name = os.path.basename(music_file)
if (ins_root and vocal_root) is None:
return "No save root."
else:
os.makedirs(ins_root, exist_ok=True)
os.makedirs(vocal_root, exist_ok=True)
X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
x_wave, y_wave, x_spec_s, y_spec_s = {}, {}, {}, {}
bands_n = len(self.mp.param["band"])
# print(bands_n)
for d in range(bands_n, 0, -1):
bp = self.mp.param["band"][d]
if d == bands_n: # high-end band
# librosa loading may be buggy for some audio. ffmpeg will solve this, but it's a pain
(
X_wave[d],
_,
) = librosa.core.load(
x_wave[d] = librosa.core.load(
music_file,
sr=bp["sr"],
mono=False,
dtype=np.float32,
res_type=bp["res_type"],
)
if X_wave[d].ndim == 1:
X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
)[0]
if x_wave[d].ndim == 1:
x_wave[d] = np.asfortranarray([x_wave[d], x_wave[d]])
else: # lower bands
X_wave[d] = librosa.core.resample(
X_wave[d + 1],
x_wave[d] = librosa.core.resample(
x_wave[d + 1],
orig_sr=self.mp.param["band"][d + 1]["sr"],
target_sr=bp["sr"],
res_type=bp["res_type"],
)
# Stft of wave source
X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
X_wave[d],
x_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
x_wave[d],
bp["hl"],
bp["n_fft"],
self.mp.param["mid_side"],
self.mp.param["mid_side_b2"],
self.mp.param["reverse"],
)
# pdb.set_trace()
if d == bands_n and self.data["high_end_process"] != "none":
input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
)
input_high_end = X_spec_s[d][
:, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
]
X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
# pdb.set_trace()
input_high_end_h = (
self.mp.param["band"][1]["n_fft"] // 2
- self.mp.param["band"][1]["crop_stop"]
) + (self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"])
input_high_end = x_spec_s[1][
:,
self.mp.param["band"][1]["n_fft"] // 2
- input_high_end_h : self.mp.param["band"][1]["n_fft"] // 2,
:,
]
x_spec_m = spec_utils.combine_spectrograms(x_spec_s, self.mp)
aggresive_set = float(self.data["agg"] / 100)
aggressiveness = {
"value": aggresive_set,
"split_bin": self.mp.param["band"][1]["crop_stop"],
}
with torch.no_grad():
pred, X_mag, X_phase = inference(
X_spec_m, self.device, self.model, aggressiveness, self.data
pred, x_mag, x_phase = inference(
x_spec_m, self.config.device, self.model, aggressiveness, self.data
)
# Postprocess
if self.data["postprocess"]:
pred_inv = np.clip(X_mag - pred, 0, np.inf)
pred_inv = np.clip(x_mag - pred, 0, np.inf)
pred = spec_utils.mask_silence(pred, pred_inv)
y_spec_m = pred * X_phase
v_spec_m = X_spec_m - y_spec_m
y_spec_m = pred * x_phase
v_spec_m = x_spec_m - y_spec_m
if ins_root is not None:
if self.data["high_end_process"].startswith("mirroring"):
input_high_end_ = spec_utils.mirroring(
self.data["high_end_process"], y_spec_m, input_high_end, self.mp
)
wav_instrument = spec_utils.cmb_spectrogram_to_wave(
y_spec_m, self.mp, input_high_end_h, input_high_end_
)
else:
wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
logger.info("%s instruments done" % name)
if is_hp3:
head = "vocal_"
else:
head = "instrument_"
if format in ["wav", "flac"]:
sf.write(
os.path.join(
ins_root,
head + f"{name}_{self.data['agg']}.{format}",
),
(np.array(wav_instrument) * 32768).astype("int16"),
self.mp.param["sr"],
) #
else:
path = os.path.join(ins_root, head + f"{name}_{self.data['agg']}.wav")
sf.write(
path,
(np.array(wav_instrument) * 32768).astype("int16"),
self.mp.param["sr"],
)
if os.path.exists(path):
opt_format_path = path[:-4] + ".%s" % format
os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path))
if os.path.exists(opt_format_path):
try:
os.remove(path)
except Exception:
pass
if vocal_root is not None:
head = "instrument_" if is_hp3 else "vocal_"
if self.data["high_end_process"].startswith("mirroring"):
input_high_end_ = spec_utils.mirroring(
self.data["high_end_process"], v_spec_m, input_high_end, self.mp
)
wav_vocals = spec_utils.cmb_spectrogram_to_wave(
v_spec_m, self.mp, input_high_end_h, input_high_end_
)
else:
wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
logger.info(f"{name} vocals done")
if format in ["wav", "flac"]:
sf.write(
os.path.join(
vocal_root,
head + f"{name}_{self.data['agg']}.{format}",
),
(np.array(wav_vocals) * 32768).astype("int16"),
self.mp.param["sr"],
)
else:
path = os.path.join(vocal_root, head + f"{name}_{self.data['agg']}.wav")
sf.write(
path,
(np.array(wav_vocals) * 32768).astype("int16"),
self.mp.param["sr"],
)
if os.path.exists(path):
opt_format_path = path[:-4] + f".{format}"
os.system(f"ffmpeg -i {path} -vn {opt_format_path} -q:a 2 -y")
if os.path.exists(opt_format_path):
try:
os.remove(path)
except:
pass
class AudioPreDeEcho:
def __init__(self, agg, model_path, device, is_half, tta=False):
self.model_path = model_path
self.device = device
self.data = {
# Processing Options
"postprocess": False,
"tta": tta,
# Constants
"window_size": 512,
"agg": agg,
"high_end_process": "mirroring",
}
mp = ModelParameters("rvc/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json")
nout = 64 if "DeReverb" in model_path else 48
model = CascadedNet(mp.param["bins"] * 2, nout)
cpk = torch.load(model_path, map_location="cpu")
model.load_state_dict(cpk)
model.eval()
if is_half:
model = model.half().to(device)
else:
model = model.to(device)
self.mp = mp
self.model = model
def _path_audio_(
self, music_file, vocal_root=None, ins_root=None, format="flac", is_hp3=False
): # 3个VR模型vocal和ins是反的
name = os.path.basename(music_file)
if (ins_root and vocal_root) is None:
return "No save root."
else:
os.makedirs(ins_root, exist_ok=True)
os.makedirs(vocal_root, exist_ok=True)
X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
bands_n = len(self.mp.param["band"])
# print(bands_n)
for d in range(bands_n, 0, -1):
bp = self.mp.param["band"][d]
if d == bands_n: # high-end band
# librosa loading may be buggy for some audio. ffmpeg will solve this, but it's a pain
(
X_wave[d],
_,
) = librosa.core.load(
music_file,
bp["sr"],
False,
dtype=np.float32,
res_type=bp["res_type"],
)
if X_wave[d].ndim == 1:
X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
else: # lower bands
X_wave[d] = librosa.core.resample(
X_wave[d + 1],
self.mp.param["band"][d + 1]["sr"],
bp["sr"],
res_type=bp["res_type"],
)
# Stft of wave source
X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
X_wave[d],
bp["hl"],
bp["n_fft"],
self.mp.param["mid_side"],
self.mp.param["mid_side_b2"],
self.mp.param["reverse"],
if self.data["high_end_process"].startswith("mirroring"):
input_high_end_ = spec_utils.mirroring(
self.data["high_end_process"], y_spec_m, input_high_end, self.mp
)
# pdb.set_trace()
if d == bands_n and self.data["high_end_process"] != "none":
input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
)
input_high_end = X_spec_s[d][
:, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
]
X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
aggresive_set = float(self.data["agg"] / 100)
aggressiveness = {
"value": aggresive_set,
"split_bin": self.mp.param["band"][1]["crop_stop"],
}
with torch.no_grad():
pred, X_mag, X_phase = inference(
X_spec_m, self.device, self.model, aggressiveness, self.data
wav_instrument = spec_utils.cmb_spectrogram_to_wave(
y_spec_m, self.mp, input_high_end_h, input_high_end_
)
# Postprocess
if self.data["postprocess"]:
pred_inv = np.clip(X_mag - pred, 0, np.inf)
pred = spec_utils.mask_silence(pred, pred_inv)
y_spec_m = pred * X_phase
v_spec_m = X_spec_m - y_spec_m
input_high_end_ = spec_utils.mirroring(
self.data["high_end_process"], v_spec_m, input_high_end, self.mp
)
wav_vocals = spec_utils.cmb_spectrogram_to_wave(
v_spec_m, self.mp, input_high_end_h, input_high_end_
)
else:
wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
if ins_root is not None:
if self.data["high_end_process"].startswith("mirroring"):
input_high_end_ = spec_utils.mirroring(
self.data["high_end_process"], y_spec_m, input_high_end, self.mp
)
wav_instrument = spec_utils.cmb_spectrogram_to_wave(
y_spec_m, self.mp, input_high_end_h, input_high_end_
)
else:
wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
logger.info("%s instruments done" % name)
if format in ["wav", "flac"]:
sf.write(
os.path.join(
ins_root,
"instrument_{}_{}.{}".format(name, self.data["agg"], format),
),
(np.array(wav_instrument) * 32768).astype("int16"),
self.mp.param["sr"],
) #
else:
path = os.path.join(
ins_root, "instrument_{}_{}.wav".format(name, self.data["agg"])
)
sf.write(
path,
(np.array(wav_instrument) * 32768).astype("int16"),
self.mp.param["sr"],
)
if os.path.exists(path):
opt_format_path = path[:-4] + ".%s" % format
os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path))
if os.path.exists(opt_format_path):
try:
os.remove(path)
except:
pass
if vocal_root is not None:
if self.data["high_end_process"].startswith("mirroring"):
input_high_end_ = spec_utils.mirroring(
self.data["high_end_process"], v_spec_m, input_high_end, self.mp
)
wav_vocals = spec_utils.cmb_spectrogram_to_wave(
v_spec_m, self.mp, input_high_end_h, input_high_end_
)
else:
wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
logger.info("%s vocals done" % name)
if format in ["wav", "flac"]:
sf.write(
os.path.join(
vocal_root,
"vocal_{}_{}.{}".format(name, self.data["agg"], format),
),
(np.array(wav_vocals) * 32768).astype("int16"),
self.mp.param["sr"],
)
else:
path = os.path.join(
vocal_root, "vocal_{}_{}.wav".format(name, self.data["agg"])
)
sf.write(
path,
(np.array(wav_vocals) * 32768).astype("int16"),
self.mp.param["sr"],
)
if os.path.exists(path):
opt_format_path = path[:-4] + ".%s" % format
os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path))
if os.path.exists(opt_format_path):
try:
os.remove(path)
except:
pass
return (
(np.array(wav_instrument) * 32768).astype("int16"),
(np.array(wav_vocals) * 32768).astype("int16"),
self.mp.param["sr"],
self.data["agg"],
)