add endpoint for Qwen3-TTS -> RVC
This commit is contained in:
0
assets-download.sh
Normal file → Executable file
0
assets-download.sh
Normal file → Executable file
2437
poetry.lock
generated
2437
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -1,46 +1,81 @@
|
||||
{
|
||||
"train": {
|
||||
"log_interval": 200,
|
||||
"seed": 1234,
|
||||
"epochs": 20000,
|
||||
"learning_rate": 1e-4,
|
||||
"betas": [0.8, 0.99],
|
||||
"eps": 1e-9,
|
||||
"batch_size": 4,
|
||||
"fp16_run": true,
|
||||
"lr_decay": 0.999875,
|
||||
"segment_size": 12800,
|
||||
"init_lr_ratio": 1,
|
||||
"warmup_epochs": 0,
|
||||
"c_mel": 45,
|
||||
"c_kl": 1.0
|
||||
},
|
||||
"data": {
|
||||
"max_wav_value": 32768.0,
|
||||
"sampling_rate": 32000,
|
||||
"filter_length": 1024,
|
||||
"hop_length": 320,
|
||||
"win_length": 1024,
|
||||
"n_mel_channels": 80,
|
||||
"mel_fmin": 0.0,
|
||||
"mel_fmax": null
|
||||
},
|
||||
"model": {
|
||||
"inter_channels": 192,
|
||||
"hidden_channels": 192,
|
||||
"filter_channels": 768,
|
||||
"n_heads": 2,
|
||||
"n_layers": 6,
|
||||
"kernel_size": 3,
|
||||
"p_dropout": 0,
|
||||
"resblock": "1",
|
||||
"resblock_kernel_sizes": [3,7,11],
|
||||
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
||||
"upsample_rates": [10,4,2,2,2],
|
||||
"upsample_initial_channel": 512,
|
||||
"upsample_kernel_sizes": [16,16,4,4,4],
|
||||
"use_spectral_norm": false,
|
||||
"gin_channels": 256,
|
||||
"spk_embed_dim": 109
|
||||
}
|
||||
}
|
||||
"train": {
|
||||
"log_interval": 200,
|
||||
"seed": 1234,
|
||||
"epochs": 20000,
|
||||
"learning_rate": 0.0001,
|
||||
"betas": [
|
||||
0.8,
|
||||
0.99
|
||||
],
|
||||
"eps": 1e-09,
|
||||
"batch_size": 4,
|
||||
"fp16_run": false,
|
||||
"lr_decay": 0.999875,
|
||||
"segment_size": 12800,
|
||||
"init_lr_ratio": 1,
|
||||
"warmup_epochs": 0,
|
||||
"c_mel": 45,
|
||||
"c_kl": 1.0
|
||||
},
|
||||
"data": {
|
||||
"max_wav_value": 32768.0,
|
||||
"sampling_rate": 32000,
|
||||
"filter_length": 1024,
|
||||
"hop_length": 320,
|
||||
"win_length": 1024,
|
||||
"n_mel_channels": 80,
|
||||
"mel_fmin": 0.0,
|
||||
"mel_fmax": null
|
||||
},
|
||||
"model": {
|
||||
"inter_channels": 192,
|
||||
"hidden_channels": 192,
|
||||
"filter_channels": 768,
|
||||
"n_heads": 2,
|
||||
"n_layers": 6,
|
||||
"kernel_size": 3,
|
||||
"p_dropout": 0,
|
||||
"resblock": "1",
|
||||
"resblock_kernel_sizes": [
|
||||
3,
|
||||
7,
|
||||
11
|
||||
],
|
||||
"resblock_dilation_sizes": [
|
||||
[
|
||||
1,
|
||||
3,
|
||||
5
|
||||
],
|
||||
[
|
||||
1,
|
||||
3,
|
||||
5
|
||||
],
|
||||
[
|
||||
1,
|
||||
3,
|
||||
5
|
||||
]
|
||||
],
|
||||
"upsample_rates": [
|
||||
10,
|
||||
4,
|
||||
2,
|
||||
2,
|
||||
2
|
||||
],
|
||||
"upsample_initial_channel": 512,
|
||||
"upsample_kernel_sizes": [
|
||||
16,
|
||||
16,
|
||||
4,
|
||||
4,
|
||||
4
|
||||
],
|
||||
"use_spectral_norm": false,
|
||||
"gin_channels": 256,
|
||||
"spk_embed_dim": 109
|
||||
}
|
||||
}
|
||||
@@ -1,46 +1,79 @@
|
||||
{
|
||||
"train": {
|
||||
"log_interval": 200,
|
||||
"seed": 1234,
|
||||
"epochs": 20000,
|
||||
"learning_rate": 1e-4,
|
||||
"betas": [0.8, 0.99],
|
||||
"eps": 1e-9,
|
||||
"batch_size": 4,
|
||||
"fp16_run": true,
|
||||
"lr_decay": 0.999875,
|
||||
"segment_size": 12800,
|
||||
"init_lr_ratio": 1,
|
||||
"warmup_epochs": 0,
|
||||
"c_mel": 45,
|
||||
"c_kl": 1.0
|
||||
},
|
||||
"data": {
|
||||
"max_wav_value": 32768.0,
|
||||
"sampling_rate": 40000,
|
||||
"filter_length": 2048,
|
||||
"hop_length": 400,
|
||||
"win_length": 2048,
|
||||
"n_mel_channels": 125,
|
||||
"mel_fmin": 0.0,
|
||||
"mel_fmax": null
|
||||
},
|
||||
"model": {
|
||||
"inter_channels": 192,
|
||||
"hidden_channels": 192,
|
||||
"filter_channels": 768,
|
||||
"n_heads": 2,
|
||||
"n_layers": 6,
|
||||
"kernel_size": 3,
|
||||
"p_dropout": 0,
|
||||
"resblock": "1",
|
||||
"resblock_kernel_sizes": [3,7,11],
|
||||
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
||||
"upsample_rates": [10,10,2,2],
|
||||
"upsample_initial_channel": 512,
|
||||
"upsample_kernel_sizes": [16,16,4,4],
|
||||
"use_spectral_norm": false,
|
||||
"gin_channels": 256,
|
||||
"spk_embed_dim": 109
|
||||
}
|
||||
}
|
||||
"train": {
|
||||
"log_interval": 200,
|
||||
"seed": 1234,
|
||||
"epochs": 20000,
|
||||
"learning_rate": 0.0001,
|
||||
"betas": [
|
||||
0.8,
|
||||
0.99
|
||||
],
|
||||
"eps": 1e-09,
|
||||
"batch_size": 4,
|
||||
"fp16_run": false,
|
||||
"lr_decay": 0.999875,
|
||||
"segment_size": 12800,
|
||||
"init_lr_ratio": 1,
|
||||
"warmup_epochs": 0,
|
||||
"c_mel": 45,
|
||||
"c_kl": 1.0
|
||||
},
|
||||
"data": {
|
||||
"max_wav_value": 32768.0,
|
||||
"sampling_rate": 40000,
|
||||
"filter_length": 2048,
|
||||
"hop_length": 400,
|
||||
"win_length": 2048,
|
||||
"n_mel_channels": 125,
|
||||
"mel_fmin": 0.0,
|
||||
"mel_fmax": null
|
||||
},
|
||||
"model": {
|
||||
"inter_channels": 192,
|
||||
"hidden_channels": 192,
|
||||
"filter_channels": 768,
|
||||
"n_heads": 2,
|
||||
"n_layers": 6,
|
||||
"kernel_size": 3,
|
||||
"p_dropout": 0,
|
||||
"resblock": "1",
|
||||
"resblock_kernel_sizes": [
|
||||
3,
|
||||
7,
|
||||
11
|
||||
],
|
||||
"resblock_dilation_sizes": [
|
||||
[
|
||||
1,
|
||||
3,
|
||||
5
|
||||
],
|
||||
[
|
||||
1,
|
||||
3,
|
||||
5
|
||||
],
|
||||
[
|
||||
1,
|
||||
3,
|
||||
5
|
||||
]
|
||||
],
|
||||
"upsample_rates": [
|
||||
10,
|
||||
10,
|
||||
2,
|
||||
2
|
||||
],
|
||||
"upsample_initial_channel": 512,
|
||||
"upsample_kernel_sizes": [
|
||||
16,
|
||||
16,
|
||||
4,
|
||||
4
|
||||
],
|
||||
"use_spectral_norm": false,
|
||||
"gin_channels": 256,
|
||||
"spk_embed_dim": 109
|
||||
}
|
||||
}
|
||||
@@ -1,46 +1,81 @@
|
||||
{
|
||||
"train": {
|
||||
"log_interval": 200,
|
||||
"seed": 1234,
|
||||
"epochs": 20000,
|
||||
"learning_rate": 1e-4,
|
||||
"betas": [0.8, 0.99],
|
||||
"eps": 1e-9,
|
||||
"batch_size": 4,
|
||||
"fp16_run": true,
|
||||
"lr_decay": 0.999875,
|
||||
"segment_size": 11520,
|
||||
"init_lr_ratio": 1,
|
||||
"warmup_epochs": 0,
|
||||
"c_mel": 45,
|
||||
"c_kl": 1.0
|
||||
},
|
||||
"data": {
|
||||
"max_wav_value": 32768.0,
|
||||
"sampling_rate": 48000,
|
||||
"filter_length": 2048,
|
||||
"hop_length": 480,
|
||||
"win_length": 2048,
|
||||
"n_mel_channels": 128,
|
||||
"mel_fmin": 0.0,
|
||||
"mel_fmax": null
|
||||
},
|
||||
"model": {
|
||||
"inter_channels": 192,
|
||||
"hidden_channels": 192,
|
||||
"filter_channels": 768,
|
||||
"n_heads": 2,
|
||||
"n_layers": 6,
|
||||
"kernel_size": 3,
|
||||
"p_dropout": 0,
|
||||
"resblock": "1",
|
||||
"resblock_kernel_sizes": [3,7,11],
|
||||
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
||||
"upsample_rates": [10,6,2,2,2],
|
||||
"upsample_initial_channel": 512,
|
||||
"upsample_kernel_sizes": [16,16,4,4,4],
|
||||
"use_spectral_norm": false,
|
||||
"gin_channels": 256,
|
||||
"spk_embed_dim": 109
|
||||
}
|
||||
}
|
||||
"train": {
|
||||
"log_interval": 200,
|
||||
"seed": 1234,
|
||||
"epochs": 20000,
|
||||
"learning_rate": 0.0001,
|
||||
"betas": [
|
||||
0.8,
|
||||
0.99
|
||||
],
|
||||
"eps": 1e-09,
|
||||
"batch_size": 4,
|
||||
"fp16_run": false,
|
||||
"lr_decay": 0.999875,
|
||||
"segment_size": 11520,
|
||||
"init_lr_ratio": 1,
|
||||
"warmup_epochs": 0,
|
||||
"c_mel": 45,
|
||||
"c_kl": 1.0
|
||||
},
|
||||
"data": {
|
||||
"max_wav_value": 32768.0,
|
||||
"sampling_rate": 48000,
|
||||
"filter_length": 2048,
|
||||
"hop_length": 480,
|
||||
"win_length": 2048,
|
||||
"n_mel_channels": 128,
|
||||
"mel_fmin": 0.0,
|
||||
"mel_fmax": null
|
||||
},
|
||||
"model": {
|
||||
"inter_channels": 192,
|
||||
"hidden_channels": 192,
|
||||
"filter_channels": 768,
|
||||
"n_heads": 2,
|
||||
"n_layers": 6,
|
||||
"kernel_size": 3,
|
||||
"p_dropout": 0,
|
||||
"resblock": "1",
|
||||
"resblock_kernel_sizes": [
|
||||
3,
|
||||
7,
|
||||
11
|
||||
],
|
||||
"resblock_dilation_sizes": [
|
||||
[
|
||||
1,
|
||||
3,
|
||||
5
|
||||
],
|
||||
[
|
||||
1,
|
||||
3,
|
||||
5
|
||||
],
|
||||
[
|
||||
1,
|
||||
3,
|
||||
5
|
||||
]
|
||||
],
|
||||
"upsample_rates": [
|
||||
10,
|
||||
6,
|
||||
2,
|
||||
2,
|
||||
2
|
||||
],
|
||||
"upsample_initial_channel": 512,
|
||||
"upsample_kernel_sizes": [
|
||||
16,
|
||||
16,
|
||||
4,
|
||||
4,
|
||||
4
|
||||
],
|
||||
"use_spectral_norm": false,
|
||||
"gin_channels": 256,
|
||||
"spk_embed_dim": 109
|
||||
}
|
||||
}
|
||||
@@ -1,46 +1,79 @@
|
||||
{
|
||||
"train": {
|
||||
"log_interval": 200,
|
||||
"seed": 1234,
|
||||
"epochs": 20000,
|
||||
"learning_rate": 1e-4,
|
||||
"betas": [0.8, 0.99],
|
||||
"eps": 1e-9,
|
||||
"batch_size": 4,
|
||||
"fp16_run": true,
|
||||
"lr_decay": 0.999875,
|
||||
"segment_size": 12800,
|
||||
"init_lr_ratio": 1,
|
||||
"warmup_epochs": 0,
|
||||
"c_mel": 45,
|
||||
"c_kl": 1.0
|
||||
},
|
||||
"data": {
|
||||
"max_wav_value": 32768.0,
|
||||
"sampling_rate": 32000,
|
||||
"filter_length": 1024,
|
||||
"hop_length": 320,
|
||||
"win_length": 1024,
|
||||
"n_mel_channels": 80,
|
||||
"mel_fmin": 0.0,
|
||||
"mel_fmax": null
|
||||
},
|
||||
"model": {
|
||||
"inter_channels": 192,
|
||||
"hidden_channels": 192,
|
||||
"filter_channels": 768,
|
||||
"n_heads": 2,
|
||||
"n_layers": 6,
|
||||
"kernel_size": 3,
|
||||
"p_dropout": 0,
|
||||
"resblock": "1",
|
||||
"resblock_kernel_sizes": [3,7,11],
|
||||
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
||||
"upsample_rates": [10,8,2,2],
|
||||
"upsample_initial_channel": 512,
|
||||
"upsample_kernel_sizes": [20,16,4,4],
|
||||
"use_spectral_norm": false,
|
||||
"gin_channels": 256,
|
||||
"spk_embed_dim": 109
|
||||
}
|
||||
}
|
||||
"train": {
|
||||
"log_interval": 200,
|
||||
"seed": 1234,
|
||||
"epochs": 20000,
|
||||
"learning_rate": 0.0001,
|
||||
"betas": [
|
||||
0.8,
|
||||
0.99
|
||||
],
|
||||
"eps": 1e-09,
|
||||
"batch_size": 4,
|
||||
"fp16_run": false,
|
||||
"lr_decay": 0.999875,
|
||||
"segment_size": 12800,
|
||||
"init_lr_ratio": 1,
|
||||
"warmup_epochs": 0,
|
||||
"c_mel": 45,
|
||||
"c_kl": 1.0
|
||||
},
|
||||
"data": {
|
||||
"max_wav_value": 32768.0,
|
||||
"sampling_rate": 32000,
|
||||
"filter_length": 1024,
|
||||
"hop_length": 320,
|
||||
"win_length": 1024,
|
||||
"n_mel_channels": 80,
|
||||
"mel_fmin": 0.0,
|
||||
"mel_fmax": null
|
||||
},
|
||||
"model": {
|
||||
"inter_channels": 192,
|
||||
"hidden_channels": 192,
|
||||
"filter_channels": 768,
|
||||
"n_heads": 2,
|
||||
"n_layers": 6,
|
||||
"kernel_size": 3,
|
||||
"p_dropout": 0,
|
||||
"resblock": "1",
|
||||
"resblock_kernel_sizes": [
|
||||
3,
|
||||
7,
|
||||
11
|
||||
],
|
||||
"resblock_dilation_sizes": [
|
||||
[
|
||||
1,
|
||||
3,
|
||||
5
|
||||
],
|
||||
[
|
||||
1,
|
||||
3,
|
||||
5
|
||||
],
|
||||
[
|
||||
1,
|
||||
3,
|
||||
5
|
||||
]
|
||||
],
|
||||
"upsample_rates": [
|
||||
10,
|
||||
8,
|
||||
2,
|
||||
2
|
||||
],
|
||||
"upsample_initial_channel": 512,
|
||||
"upsample_kernel_sizes": [
|
||||
20,
|
||||
16,
|
||||
4,
|
||||
4
|
||||
],
|
||||
"use_spectral_norm": false,
|
||||
"gin_channels": 256,
|
||||
"spk_embed_dim": 109
|
||||
}
|
||||
}
|
||||
@@ -1,46 +1,79 @@
|
||||
{
|
||||
"train": {
|
||||
"log_interval": 200,
|
||||
"seed": 1234,
|
||||
"epochs": 20000,
|
||||
"learning_rate": 1e-4,
|
||||
"betas": [0.8, 0.99],
|
||||
"eps": 1e-9,
|
||||
"batch_size": 4,
|
||||
"fp16_run": true,
|
||||
"lr_decay": 0.999875,
|
||||
"segment_size": 17280,
|
||||
"init_lr_ratio": 1,
|
||||
"warmup_epochs": 0,
|
||||
"c_mel": 45,
|
||||
"c_kl": 1.0
|
||||
},
|
||||
"data": {
|
||||
"max_wav_value": 32768.0,
|
||||
"sampling_rate": 48000,
|
||||
"filter_length": 2048,
|
||||
"hop_length": 480,
|
||||
"win_length": 2048,
|
||||
"n_mel_channels": 128,
|
||||
"mel_fmin": 0.0,
|
||||
"mel_fmax": null
|
||||
},
|
||||
"model": {
|
||||
"inter_channels": 192,
|
||||
"hidden_channels": 192,
|
||||
"filter_channels": 768,
|
||||
"n_heads": 2,
|
||||
"n_layers": 6,
|
||||
"kernel_size": 3,
|
||||
"p_dropout": 0,
|
||||
"resblock": "1",
|
||||
"resblock_kernel_sizes": [3,7,11],
|
||||
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
||||
"upsample_rates": [12,10,2,2],
|
||||
"upsample_initial_channel": 512,
|
||||
"upsample_kernel_sizes": [24,20,4,4],
|
||||
"use_spectral_norm": false,
|
||||
"gin_channels": 256,
|
||||
"spk_embed_dim": 109
|
||||
}
|
||||
}
|
||||
"train": {
|
||||
"log_interval": 200,
|
||||
"seed": 1234,
|
||||
"epochs": 20000,
|
||||
"learning_rate": 0.0001,
|
||||
"betas": [
|
||||
0.8,
|
||||
0.99
|
||||
],
|
||||
"eps": 1e-09,
|
||||
"batch_size": 4,
|
||||
"fp16_run": false,
|
||||
"lr_decay": 0.999875,
|
||||
"segment_size": 17280,
|
||||
"init_lr_ratio": 1,
|
||||
"warmup_epochs": 0,
|
||||
"c_mel": 45,
|
||||
"c_kl": 1.0
|
||||
},
|
||||
"data": {
|
||||
"max_wav_value": 32768.0,
|
||||
"sampling_rate": 48000,
|
||||
"filter_length": 2048,
|
||||
"hop_length": 480,
|
||||
"win_length": 2048,
|
||||
"n_mel_channels": 128,
|
||||
"mel_fmin": 0.0,
|
||||
"mel_fmax": null
|
||||
},
|
||||
"model": {
|
||||
"inter_channels": 192,
|
||||
"hidden_channels": 192,
|
||||
"filter_channels": 768,
|
||||
"n_heads": 2,
|
||||
"n_layers": 6,
|
||||
"kernel_size": 3,
|
||||
"p_dropout": 0,
|
||||
"resblock": "1",
|
||||
"resblock_kernel_sizes": [
|
||||
3,
|
||||
7,
|
||||
11
|
||||
],
|
||||
"resblock_dilation_sizes": [
|
||||
[
|
||||
1,
|
||||
3,
|
||||
5
|
||||
],
|
||||
[
|
||||
1,
|
||||
3,
|
||||
5
|
||||
],
|
||||
[
|
||||
1,
|
||||
3,
|
||||
5
|
||||
]
|
||||
],
|
||||
"upsample_rates": [
|
||||
12,
|
||||
10,
|
||||
2,
|
||||
2
|
||||
],
|
||||
"upsample_initial_channel": 512,
|
||||
"upsample_kernel_sizes": [
|
||||
24,
|
||||
20,
|
||||
4,
|
||||
4
|
||||
],
|
||||
"use_spectral_norm": false,
|
||||
"gin_channels": 256,
|
||||
"spk_embed_dim": 109
|
||||
}
|
||||
}
|
||||
@@ -8,7 +8,7 @@ from tqdm import tqdm
|
||||
|
||||
|
||||
def load_inputs(path, device, is_half=False):
|
||||
parm = torch.load(path, map_location=torch.device("cpu"))
|
||||
parm = torch.load(path, map_location=torch.device("cpu"), weights_only=False)
|
||||
for key in parm.keys():
|
||||
parm[key] = parm[key].to(device)
|
||||
if is_half and parm[key].dtype == torch.float32:
|
||||
|
||||
@@ -5,7 +5,7 @@ def get_rmvpe(model_path="assets/rmvpe/rmvpe.pt", device=torch.device("cpu")):
|
||||
from infer.lib.rmvpe import E2E
|
||||
|
||||
model = E2E(4, 1, (2, 2))
|
||||
ckpt = torch.load(model_path, map_location=device)
|
||||
ckpt = torch.load(model_path, map_location=device, weights_only=False)
|
||||
model.load_state_dict(ckpt)
|
||||
model.eval()
|
||||
model = model.to(device)
|
||||
|
||||
@@ -9,7 +9,7 @@ def get_synthesizer(pth_path, device=torch.device("cpu")):
|
||||
SynthesizerTrnMs768NSFsid_nono,
|
||||
)
|
||||
|
||||
cpt = torch.load(pth_path, map_location=torch.device("cpu"))
|
||||
cpt = torch.load(pth_path, map_location=torch.device("cpu"), weights_only=False)
|
||||
# tgt_sr = cpt["config"][-1]
|
||||
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
|
||||
if_f0 = cpt.get("f0", 1)
|
||||
|
||||
1330
rvc/lib/rmvpe.py
1330
rvc/lib/rmvpe.py
File diff suppressed because it is too large
Load Diff
@@ -39,7 +39,7 @@ class AudioPreprocess:
|
||||
else CascadedNet(
|
||||
self.mp.param["bins"] * 2, 64 if "DeReverb" in model_path else 48
|
||||
)
|
||||
.load_state_dict(torch.load(model_path, map_location="cpu"))
|
||||
.load_state_dict(torch.load(model_path, map_location="cpu", weights_only=False))
|
||||
.eval()
|
||||
)
|
||||
if self.config.is_half:
|
||||
|
||||
@@ -120,10 +120,10 @@ class VC:
|
||||
raise FileNotFoundError("hubert_path not found.")
|
||||
|
||||
if hasattr(input_audio_path, "name"):
|
||||
input_audio_path = input_audio_path.name
|
||||
input_audio_path = str(input_audio_path)
|
||||
elif not isinstance(input_audio_path, str):
|
||||
raise RuntimeError(f"pathlib.Path or str expected for input_audio_path. Got {type(input_audio_path)}")
|
||||
|
||||
|
||||
if not os.path.exists(input_audio_path):
|
||||
raise FileNotFoundError("input_audio_path not found.")
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import os
|
||||
import torch
|
||||
|
||||
from fairseq import checkpoint_utils
|
||||
|
||||
@@ -20,10 +21,23 @@ def get_index_path_from_model(sid):
|
||||
|
||||
|
||||
def load_hubert(config, hubert_path: str):
|
||||
models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
|
||||
[hubert_path],
|
||||
suffix="",
|
||||
)
|
||||
# PyTorch 2.6+ changed weights_only default to True, which breaks fairseq checkpoints
|
||||
# Monkey-patch torch.load to use weights_only=False for fairseq
|
||||
original_torch_load = torch.load
|
||||
|
||||
def patched_torch_load(f, map_location=None, *args, **kwargs):
|
||||
kwargs.setdefault('weights_only', False)
|
||||
return original_torch_load(f, map_location=map_location, *args, **kwargs)
|
||||
|
||||
torch.load = patched_torch_load
|
||||
try:
|
||||
models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
|
||||
[hubert_path],
|
||||
suffix="",
|
||||
)
|
||||
finally:
|
||||
torch.load = original_torch_load
|
||||
|
||||
hubert_model = models[0]
|
||||
hubert_model = hubert_model.to(config.device)
|
||||
hubert_model = hubert_model.half() if config.is_half else hubert_model.float()
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
import json
|
||||
import logging
|
||||
import tempfile
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
@@ -11,6 +13,9 @@ from base64 import b64encode
|
||||
from rvc.modules.vc.modules import VC
|
||||
import glob
|
||||
import os
|
||||
import soundfile as sf
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter()
|
||||
from dotenv import load_dotenv
|
||||
@@ -74,3 +79,114 @@ def inference(
|
||||
"audio": b64encode(wv.read()).decode("utf-8"),
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@router.post("/tts-inference")
def tts_inference(
    text: str = Body(..., description="The text to synthesize"),
    language: str = Body(
        "Chinese",
        description="Language code",
        enum=[
            "Chinese",
            "English",
            "Japanese",
            "Korean",
            "German",
            "French",
            "Russian",
            "Portuguese",
            "Spanish",
            "Italian",
        ],
    ),
    speaker: str = Body("Vivian", description="Speaker/voice profile name"),
    instruct: str = Body("", description="Natural language instruction for controlling timbre, emotion, and prosody"),
    modelpath: Path
    | UploadFile = Body(
        ...,
        enum=[
            os.path.basename(file)
            for file in glob.glob(f"{os.getenv('weight_root')}/*")
        ],
    ),
    res_type: str = Query("blob", enum=["blob", "json"]),
    sid: int = 0,
    f0_up_key: int = 0,
    f0_method: str = Query(
        "rmvpe", enum=["pm", "harvest", "dio", "rmvpe", "rmvpe_gpu"]
    ),
    f0_file: Path | None = None,
    index_file: Path | None = None,
    index_rate: float = 0.75,
    filter_radius: int = 3,
    resample_sr: int = 0,
    rms_mix_rate: float = 0.25,
    protect: float = 0.33,
):
    """
    Perform TTS using Qwen3-TTS followed by voice conversion inference.

    First generates speech from text using Qwen3-TTS, then applies voice conversion
    to transform the generated speech to the target voice.

    The synthesized speech is written to a temporary ``.wav`` file, which is
    handed to ``VC.vc_inference`` by path and removed again before the response
    is returned, whether or not synthesis/conversion succeeded.

    Args:
        text: Text to synthesize.
        language: Language name passed to the TTS model.
        speaker: Speaker/voice profile name passed to the TTS model.
        instruct: Free-form instruction passed to the TTS model.
        modelpath: Voice-conversion model passed to ``VC.get_vc``.
        res_type: ``"blob"`` streams a WAV attachment; any other value returns
            JSON with the timings and the base64-encoded WAV.
        sid, f0_up_key, f0_method, f0_file, index_file, index_rate,
        filter_radius, resample_sr, rms_mix_rate, protect: Forwarded
            unchanged, in this order, to ``VC.vc_inference``.

    Returns:
        A ``StreamingResponse`` (``audio/wav``) or a ``JSONResponse`` with
        ``"time"`` and ``"audio"`` keys, depending on ``res_type``.
    """
    from qwen_tts import Qwen3TTSModel
    import torch

    # Evaluate device availability once; all three loader options depend on it.
    use_cuda = torch.cuda.is_available()

    # Load Qwen3-TTS model
    # NOTE(review): the model is loaded on every request, which is expensive;
    # consider a process-wide cache once thread-safety of the model is confirmed.
    tts_model = Qwen3TTSModel.from_pretrained(
        "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
        device_map="cuda:0" if use_cuda else "cpu",
        dtype=torch.bfloat16 if use_cuda else torch.float32,
        attn_implementation="flash_attention_2" if use_cuda else None,
    )

    # Generate TTS audio
    wavs, sr = tts_model.generate_custom_voice(
        text=text,
        language=language,
        speaker=speaker,
        instruct=instruct,
    )

    # Reserve a temporary path and close our handle right away: the file is
    # written and read by path below, and some platforms (notably Windows) do
    # not allow a second open while the original handle is still open.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp_path = tmp.name

    try:
        # Writing happens inside the try so that a failed write still reaches
        # the cleanup in ``finally`` instead of leaking the temp file.
        sf.write(tmp_path, wavs[0], sr)

        # Run voice conversion on the generated audio
        vc = VC()
        vc.get_vc(modelpath)
        tgt_sr, audio_opt, times, _ = vc.vc_inference(
            sid,
            tmp_path,
            f0_up_key,
            f0_method,
            f0_file,
            index_file,
            index_rate,
            filter_radius,
            resample_sr,
            rms_mix_rate,
            protect,
        )
        # The converted audio lives entirely in memory, so deleting the temp
        # file in ``finally`` does not affect the response body.
        wavfile.write(wv := BytesIO(), tgt_sr, audio_opt)
        print(times)
        if res_type == "blob":
            return responses.StreamingResponse(
                wv,
                media_type="audio/wav",
                headers={"Content-Disposition": "attachment; filename=tts_inference.wav"},
            )
        else:
            return JSONResponse(
                {
                    "time": json.loads(json.dumps(times)),
                    "audio": b64encode(wv.read()).decode("utf-8"),
                }
            )
    finally:
        # Clean up temporary file
        os.unlink(tmp_path)
|
||||
|
||||
Reference in New Issue
Block a user