add endpoint for Qwen3-TTS -> RVC
This commit is contained in:
0
assets-download.sh
Normal file → Executable file
0
assets-download.sh
Normal file → Executable file
2437
poetry.lock
generated
2437
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -1,46 +1,81 @@
|
|||||||
{
|
{
|
||||||
"train": {
|
"train": {
|
||||||
"log_interval": 200,
|
"log_interval": 200,
|
||||||
"seed": 1234,
|
"seed": 1234,
|
||||||
"epochs": 20000,
|
"epochs": 20000,
|
||||||
"learning_rate": 1e-4,
|
"learning_rate": 0.0001,
|
||||||
"betas": [0.8, 0.99],
|
"betas": [
|
||||||
"eps": 1e-9,
|
0.8,
|
||||||
"batch_size": 4,
|
0.99
|
||||||
"fp16_run": true,
|
],
|
||||||
"lr_decay": 0.999875,
|
"eps": 1e-09,
|
||||||
"segment_size": 12800,
|
"batch_size": 4,
|
||||||
"init_lr_ratio": 1,
|
"fp16_run": false,
|
||||||
"warmup_epochs": 0,
|
"lr_decay": 0.999875,
|
||||||
"c_mel": 45,
|
"segment_size": 12800,
|
||||||
"c_kl": 1.0
|
"init_lr_ratio": 1,
|
||||||
},
|
"warmup_epochs": 0,
|
||||||
"data": {
|
"c_mel": 45,
|
||||||
"max_wav_value": 32768.0,
|
"c_kl": 1.0
|
||||||
"sampling_rate": 32000,
|
},
|
||||||
"filter_length": 1024,
|
"data": {
|
||||||
"hop_length": 320,
|
"max_wav_value": 32768.0,
|
||||||
"win_length": 1024,
|
"sampling_rate": 32000,
|
||||||
"n_mel_channels": 80,
|
"filter_length": 1024,
|
||||||
"mel_fmin": 0.0,
|
"hop_length": 320,
|
||||||
"mel_fmax": null
|
"win_length": 1024,
|
||||||
},
|
"n_mel_channels": 80,
|
||||||
"model": {
|
"mel_fmin": 0.0,
|
||||||
"inter_channels": 192,
|
"mel_fmax": null
|
||||||
"hidden_channels": 192,
|
},
|
||||||
"filter_channels": 768,
|
"model": {
|
||||||
"n_heads": 2,
|
"inter_channels": 192,
|
||||||
"n_layers": 6,
|
"hidden_channels": 192,
|
||||||
"kernel_size": 3,
|
"filter_channels": 768,
|
||||||
"p_dropout": 0,
|
"n_heads": 2,
|
||||||
"resblock": "1",
|
"n_layers": 6,
|
||||||
"resblock_kernel_sizes": [3,7,11],
|
"kernel_size": 3,
|
||||||
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
"p_dropout": 0,
|
||||||
"upsample_rates": [10,4,2,2,2],
|
"resblock": "1",
|
||||||
"upsample_initial_channel": 512,
|
"resblock_kernel_sizes": [
|
||||||
"upsample_kernel_sizes": [16,16,4,4,4],
|
3,
|
||||||
"use_spectral_norm": false,
|
7,
|
||||||
"gin_channels": 256,
|
11
|
||||||
"spk_embed_dim": 109
|
],
|
||||||
}
|
"resblock_dilation_sizes": [
|
||||||
}
|
[
|
||||||
|
1,
|
||||||
|
3,
|
||||||
|
5
|
||||||
|
],
|
||||||
|
[
|
||||||
|
1,
|
||||||
|
3,
|
||||||
|
5
|
||||||
|
],
|
||||||
|
[
|
||||||
|
1,
|
||||||
|
3,
|
||||||
|
5
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"upsample_rates": [
|
||||||
|
10,
|
||||||
|
4,
|
||||||
|
2,
|
||||||
|
2,
|
||||||
|
2
|
||||||
|
],
|
||||||
|
"upsample_initial_channel": 512,
|
||||||
|
"upsample_kernel_sizes": [
|
||||||
|
16,
|
||||||
|
16,
|
||||||
|
4,
|
||||||
|
4,
|
||||||
|
4
|
||||||
|
],
|
||||||
|
"use_spectral_norm": false,
|
||||||
|
"gin_channels": 256,
|
||||||
|
"spk_embed_dim": 109
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,46 +1,79 @@
|
|||||||
{
|
{
|
||||||
"train": {
|
"train": {
|
||||||
"log_interval": 200,
|
"log_interval": 200,
|
||||||
"seed": 1234,
|
"seed": 1234,
|
||||||
"epochs": 20000,
|
"epochs": 20000,
|
||||||
"learning_rate": 1e-4,
|
"learning_rate": 0.0001,
|
||||||
"betas": [0.8, 0.99],
|
"betas": [
|
||||||
"eps": 1e-9,
|
0.8,
|
||||||
"batch_size": 4,
|
0.99
|
||||||
"fp16_run": true,
|
],
|
||||||
"lr_decay": 0.999875,
|
"eps": 1e-09,
|
||||||
"segment_size": 12800,
|
"batch_size": 4,
|
||||||
"init_lr_ratio": 1,
|
"fp16_run": false,
|
||||||
"warmup_epochs": 0,
|
"lr_decay": 0.999875,
|
||||||
"c_mel": 45,
|
"segment_size": 12800,
|
||||||
"c_kl": 1.0
|
"init_lr_ratio": 1,
|
||||||
},
|
"warmup_epochs": 0,
|
||||||
"data": {
|
"c_mel": 45,
|
||||||
"max_wav_value": 32768.0,
|
"c_kl": 1.0
|
||||||
"sampling_rate": 40000,
|
},
|
||||||
"filter_length": 2048,
|
"data": {
|
||||||
"hop_length": 400,
|
"max_wav_value": 32768.0,
|
||||||
"win_length": 2048,
|
"sampling_rate": 40000,
|
||||||
"n_mel_channels": 125,
|
"filter_length": 2048,
|
||||||
"mel_fmin": 0.0,
|
"hop_length": 400,
|
||||||
"mel_fmax": null
|
"win_length": 2048,
|
||||||
},
|
"n_mel_channels": 125,
|
||||||
"model": {
|
"mel_fmin": 0.0,
|
||||||
"inter_channels": 192,
|
"mel_fmax": null
|
||||||
"hidden_channels": 192,
|
},
|
||||||
"filter_channels": 768,
|
"model": {
|
||||||
"n_heads": 2,
|
"inter_channels": 192,
|
||||||
"n_layers": 6,
|
"hidden_channels": 192,
|
||||||
"kernel_size": 3,
|
"filter_channels": 768,
|
||||||
"p_dropout": 0,
|
"n_heads": 2,
|
||||||
"resblock": "1",
|
"n_layers": 6,
|
||||||
"resblock_kernel_sizes": [3,7,11],
|
"kernel_size": 3,
|
||||||
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
"p_dropout": 0,
|
||||||
"upsample_rates": [10,10,2,2],
|
"resblock": "1",
|
||||||
"upsample_initial_channel": 512,
|
"resblock_kernel_sizes": [
|
||||||
"upsample_kernel_sizes": [16,16,4,4],
|
3,
|
||||||
"use_spectral_norm": false,
|
7,
|
||||||
"gin_channels": 256,
|
11
|
||||||
"spk_embed_dim": 109
|
],
|
||||||
}
|
"resblock_dilation_sizes": [
|
||||||
}
|
[
|
||||||
|
1,
|
||||||
|
3,
|
||||||
|
5
|
||||||
|
],
|
||||||
|
[
|
||||||
|
1,
|
||||||
|
3,
|
||||||
|
5
|
||||||
|
],
|
||||||
|
[
|
||||||
|
1,
|
||||||
|
3,
|
||||||
|
5
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"upsample_rates": [
|
||||||
|
10,
|
||||||
|
10,
|
||||||
|
2,
|
||||||
|
2
|
||||||
|
],
|
||||||
|
"upsample_initial_channel": 512,
|
||||||
|
"upsample_kernel_sizes": [
|
||||||
|
16,
|
||||||
|
16,
|
||||||
|
4,
|
||||||
|
4
|
||||||
|
],
|
||||||
|
"use_spectral_norm": false,
|
||||||
|
"gin_channels": 256,
|
||||||
|
"spk_embed_dim": 109
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,46 +1,81 @@
|
|||||||
{
|
{
|
||||||
"train": {
|
"train": {
|
||||||
"log_interval": 200,
|
"log_interval": 200,
|
||||||
"seed": 1234,
|
"seed": 1234,
|
||||||
"epochs": 20000,
|
"epochs": 20000,
|
||||||
"learning_rate": 1e-4,
|
"learning_rate": 0.0001,
|
||||||
"betas": [0.8, 0.99],
|
"betas": [
|
||||||
"eps": 1e-9,
|
0.8,
|
||||||
"batch_size": 4,
|
0.99
|
||||||
"fp16_run": true,
|
],
|
||||||
"lr_decay": 0.999875,
|
"eps": 1e-09,
|
||||||
"segment_size": 11520,
|
"batch_size": 4,
|
||||||
"init_lr_ratio": 1,
|
"fp16_run": false,
|
||||||
"warmup_epochs": 0,
|
"lr_decay": 0.999875,
|
||||||
"c_mel": 45,
|
"segment_size": 11520,
|
||||||
"c_kl": 1.0
|
"init_lr_ratio": 1,
|
||||||
},
|
"warmup_epochs": 0,
|
||||||
"data": {
|
"c_mel": 45,
|
||||||
"max_wav_value": 32768.0,
|
"c_kl": 1.0
|
||||||
"sampling_rate": 48000,
|
},
|
||||||
"filter_length": 2048,
|
"data": {
|
||||||
"hop_length": 480,
|
"max_wav_value": 32768.0,
|
||||||
"win_length": 2048,
|
"sampling_rate": 48000,
|
||||||
"n_mel_channels": 128,
|
"filter_length": 2048,
|
||||||
"mel_fmin": 0.0,
|
"hop_length": 480,
|
||||||
"mel_fmax": null
|
"win_length": 2048,
|
||||||
},
|
"n_mel_channels": 128,
|
||||||
"model": {
|
"mel_fmin": 0.0,
|
||||||
"inter_channels": 192,
|
"mel_fmax": null
|
||||||
"hidden_channels": 192,
|
},
|
||||||
"filter_channels": 768,
|
"model": {
|
||||||
"n_heads": 2,
|
"inter_channels": 192,
|
||||||
"n_layers": 6,
|
"hidden_channels": 192,
|
||||||
"kernel_size": 3,
|
"filter_channels": 768,
|
||||||
"p_dropout": 0,
|
"n_heads": 2,
|
||||||
"resblock": "1",
|
"n_layers": 6,
|
||||||
"resblock_kernel_sizes": [3,7,11],
|
"kernel_size": 3,
|
||||||
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
"p_dropout": 0,
|
||||||
"upsample_rates": [10,6,2,2,2],
|
"resblock": "1",
|
||||||
"upsample_initial_channel": 512,
|
"resblock_kernel_sizes": [
|
||||||
"upsample_kernel_sizes": [16,16,4,4,4],
|
3,
|
||||||
"use_spectral_norm": false,
|
7,
|
||||||
"gin_channels": 256,
|
11
|
||||||
"spk_embed_dim": 109
|
],
|
||||||
}
|
"resblock_dilation_sizes": [
|
||||||
}
|
[
|
||||||
|
1,
|
||||||
|
3,
|
||||||
|
5
|
||||||
|
],
|
||||||
|
[
|
||||||
|
1,
|
||||||
|
3,
|
||||||
|
5
|
||||||
|
],
|
||||||
|
[
|
||||||
|
1,
|
||||||
|
3,
|
||||||
|
5
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"upsample_rates": [
|
||||||
|
10,
|
||||||
|
6,
|
||||||
|
2,
|
||||||
|
2,
|
||||||
|
2
|
||||||
|
],
|
||||||
|
"upsample_initial_channel": 512,
|
||||||
|
"upsample_kernel_sizes": [
|
||||||
|
16,
|
||||||
|
16,
|
||||||
|
4,
|
||||||
|
4,
|
||||||
|
4
|
||||||
|
],
|
||||||
|
"use_spectral_norm": false,
|
||||||
|
"gin_channels": 256,
|
||||||
|
"spk_embed_dim": 109
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,46 +1,79 @@
|
|||||||
{
|
{
|
||||||
"train": {
|
"train": {
|
||||||
"log_interval": 200,
|
"log_interval": 200,
|
||||||
"seed": 1234,
|
"seed": 1234,
|
||||||
"epochs": 20000,
|
"epochs": 20000,
|
||||||
"learning_rate": 1e-4,
|
"learning_rate": 0.0001,
|
||||||
"betas": [0.8, 0.99],
|
"betas": [
|
||||||
"eps": 1e-9,
|
0.8,
|
||||||
"batch_size": 4,
|
0.99
|
||||||
"fp16_run": true,
|
],
|
||||||
"lr_decay": 0.999875,
|
"eps": 1e-09,
|
||||||
"segment_size": 12800,
|
"batch_size": 4,
|
||||||
"init_lr_ratio": 1,
|
"fp16_run": false,
|
||||||
"warmup_epochs": 0,
|
"lr_decay": 0.999875,
|
||||||
"c_mel": 45,
|
"segment_size": 12800,
|
||||||
"c_kl": 1.0
|
"init_lr_ratio": 1,
|
||||||
},
|
"warmup_epochs": 0,
|
||||||
"data": {
|
"c_mel": 45,
|
||||||
"max_wav_value": 32768.0,
|
"c_kl": 1.0
|
||||||
"sampling_rate": 32000,
|
},
|
||||||
"filter_length": 1024,
|
"data": {
|
||||||
"hop_length": 320,
|
"max_wav_value": 32768.0,
|
||||||
"win_length": 1024,
|
"sampling_rate": 32000,
|
||||||
"n_mel_channels": 80,
|
"filter_length": 1024,
|
||||||
"mel_fmin": 0.0,
|
"hop_length": 320,
|
||||||
"mel_fmax": null
|
"win_length": 1024,
|
||||||
},
|
"n_mel_channels": 80,
|
||||||
"model": {
|
"mel_fmin": 0.0,
|
||||||
"inter_channels": 192,
|
"mel_fmax": null
|
||||||
"hidden_channels": 192,
|
},
|
||||||
"filter_channels": 768,
|
"model": {
|
||||||
"n_heads": 2,
|
"inter_channels": 192,
|
||||||
"n_layers": 6,
|
"hidden_channels": 192,
|
||||||
"kernel_size": 3,
|
"filter_channels": 768,
|
||||||
"p_dropout": 0,
|
"n_heads": 2,
|
||||||
"resblock": "1",
|
"n_layers": 6,
|
||||||
"resblock_kernel_sizes": [3,7,11],
|
"kernel_size": 3,
|
||||||
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
"p_dropout": 0,
|
||||||
"upsample_rates": [10,8,2,2],
|
"resblock": "1",
|
||||||
"upsample_initial_channel": 512,
|
"resblock_kernel_sizes": [
|
||||||
"upsample_kernel_sizes": [20,16,4,4],
|
3,
|
||||||
"use_spectral_norm": false,
|
7,
|
||||||
"gin_channels": 256,
|
11
|
||||||
"spk_embed_dim": 109
|
],
|
||||||
}
|
"resblock_dilation_sizes": [
|
||||||
}
|
[
|
||||||
|
1,
|
||||||
|
3,
|
||||||
|
5
|
||||||
|
],
|
||||||
|
[
|
||||||
|
1,
|
||||||
|
3,
|
||||||
|
5
|
||||||
|
],
|
||||||
|
[
|
||||||
|
1,
|
||||||
|
3,
|
||||||
|
5
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"upsample_rates": [
|
||||||
|
10,
|
||||||
|
8,
|
||||||
|
2,
|
||||||
|
2
|
||||||
|
],
|
||||||
|
"upsample_initial_channel": 512,
|
||||||
|
"upsample_kernel_sizes": [
|
||||||
|
20,
|
||||||
|
16,
|
||||||
|
4,
|
||||||
|
4
|
||||||
|
],
|
||||||
|
"use_spectral_norm": false,
|
||||||
|
"gin_channels": 256,
|
||||||
|
"spk_embed_dim": 109
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,46 +1,79 @@
|
|||||||
{
|
{
|
||||||
"train": {
|
"train": {
|
||||||
"log_interval": 200,
|
"log_interval": 200,
|
||||||
"seed": 1234,
|
"seed": 1234,
|
||||||
"epochs": 20000,
|
"epochs": 20000,
|
||||||
"learning_rate": 1e-4,
|
"learning_rate": 0.0001,
|
||||||
"betas": [0.8, 0.99],
|
"betas": [
|
||||||
"eps": 1e-9,
|
0.8,
|
||||||
"batch_size": 4,
|
0.99
|
||||||
"fp16_run": true,
|
],
|
||||||
"lr_decay": 0.999875,
|
"eps": 1e-09,
|
||||||
"segment_size": 17280,
|
"batch_size": 4,
|
||||||
"init_lr_ratio": 1,
|
"fp16_run": false,
|
||||||
"warmup_epochs": 0,
|
"lr_decay": 0.999875,
|
||||||
"c_mel": 45,
|
"segment_size": 17280,
|
||||||
"c_kl": 1.0
|
"init_lr_ratio": 1,
|
||||||
},
|
"warmup_epochs": 0,
|
||||||
"data": {
|
"c_mel": 45,
|
||||||
"max_wav_value": 32768.0,
|
"c_kl": 1.0
|
||||||
"sampling_rate": 48000,
|
},
|
||||||
"filter_length": 2048,
|
"data": {
|
||||||
"hop_length": 480,
|
"max_wav_value": 32768.0,
|
||||||
"win_length": 2048,
|
"sampling_rate": 48000,
|
||||||
"n_mel_channels": 128,
|
"filter_length": 2048,
|
||||||
"mel_fmin": 0.0,
|
"hop_length": 480,
|
||||||
"mel_fmax": null
|
"win_length": 2048,
|
||||||
},
|
"n_mel_channels": 128,
|
||||||
"model": {
|
"mel_fmin": 0.0,
|
||||||
"inter_channels": 192,
|
"mel_fmax": null
|
||||||
"hidden_channels": 192,
|
},
|
||||||
"filter_channels": 768,
|
"model": {
|
||||||
"n_heads": 2,
|
"inter_channels": 192,
|
||||||
"n_layers": 6,
|
"hidden_channels": 192,
|
||||||
"kernel_size": 3,
|
"filter_channels": 768,
|
||||||
"p_dropout": 0,
|
"n_heads": 2,
|
||||||
"resblock": "1",
|
"n_layers": 6,
|
||||||
"resblock_kernel_sizes": [3,7,11],
|
"kernel_size": 3,
|
||||||
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
"p_dropout": 0,
|
||||||
"upsample_rates": [12,10,2,2],
|
"resblock": "1",
|
||||||
"upsample_initial_channel": 512,
|
"resblock_kernel_sizes": [
|
||||||
"upsample_kernel_sizes": [24,20,4,4],
|
3,
|
||||||
"use_spectral_norm": false,
|
7,
|
||||||
"gin_channels": 256,
|
11
|
||||||
"spk_embed_dim": 109
|
],
|
||||||
}
|
"resblock_dilation_sizes": [
|
||||||
}
|
[
|
||||||
|
1,
|
||||||
|
3,
|
||||||
|
5
|
||||||
|
],
|
||||||
|
[
|
||||||
|
1,
|
||||||
|
3,
|
||||||
|
5
|
||||||
|
],
|
||||||
|
[
|
||||||
|
1,
|
||||||
|
3,
|
||||||
|
5
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"upsample_rates": [
|
||||||
|
12,
|
||||||
|
10,
|
||||||
|
2,
|
||||||
|
2
|
||||||
|
],
|
||||||
|
"upsample_initial_channel": 512,
|
||||||
|
"upsample_kernel_sizes": [
|
||||||
|
24,
|
||||||
|
20,
|
||||||
|
4,
|
||||||
|
4
|
||||||
|
],
|
||||||
|
"use_spectral_norm": false,
|
||||||
|
"gin_channels": 256,
|
||||||
|
"spk_embed_dim": 109
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -8,7 +8,7 @@ from tqdm import tqdm
|
|||||||
|
|
||||||
|
|
||||||
def load_inputs(path, device, is_half=False):
|
def load_inputs(path, device, is_half=False):
|
||||||
parm = torch.load(path, map_location=torch.device("cpu"))
|
parm = torch.load(path, map_location=torch.device("cpu"), weights_only=False)
|
||||||
for key in parm.keys():
|
for key in parm.keys():
|
||||||
parm[key] = parm[key].to(device)
|
parm[key] = parm[key].to(device)
|
||||||
if is_half and parm[key].dtype == torch.float32:
|
if is_half and parm[key].dtype == torch.float32:
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ def get_rmvpe(model_path="assets/rmvpe/rmvpe.pt", device=torch.device("cpu")):
|
|||||||
from infer.lib.rmvpe import E2E
|
from infer.lib.rmvpe import E2E
|
||||||
|
|
||||||
model = E2E(4, 1, (2, 2))
|
model = E2E(4, 1, (2, 2))
|
||||||
ckpt = torch.load(model_path, map_location=device)
|
ckpt = torch.load(model_path, map_location=device, weights_only=False)
|
||||||
model.load_state_dict(ckpt)
|
model.load_state_dict(ckpt)
|
||||||
model.eval()
|
model.eval()
|
||||||
model = model.to(device)
|
model = model.to(device)
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ def get_synthesizer(pth_path, device=torch.device("cpu")):
|
|||||||
SynthesizerTrnMs768NSFsid_nono,
|
SynthesizerTrnMs768NSFsid_nono,
|
||||||
)
|
)
|
||||||
|
|
||||||
cpt = torch.load(pth_path, map_location=torch.device("cpu"))
|
cpt = torch.load(pth_path, map_location=torch.device("cpu"), weights_only=False)
|
||||||
# tgt_sr = cpt["config"][-1]
|
# tgt_sr = cpt["config"][-1]
|
||||||
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
|
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
|
||||||
if_f0 = cpt.get("f0", 1)
|
if_f0 = cpt.get("f0", 1)
|
||||||
|
|||||||
1330
rvc/lib/rmvpe.py
1330
rvc/lib/rmvpe.py
File diff suppressed because it is too large
Load Diff
@@ -39,7 +39,7 @@ class AudioPreprocess:
|
|||||||
else CascadedNet(
|
else CascadedNet(
|
||||||
self.mp.param["bins"] * 2, 64 if "DeReverb" in model_path else 48
|
self.mp.param["bins"] * 2, 64 if "DeReverb" in model_path else 48
|
||||||
)
|
)
|
||||||
.load_state_dict(torch.load(model_path, map_location="cpu"))
|
.load_state_dict(torch.load(model_path, map_location="cpu", weights_only=False))
|
||||||
.eval()
|
.eval()
|
||||||
)
|
)
|
||||||
if self.config.is_half:
|
if self.config.is_half:
|
||||||
|
|||||||
@@ -120,10 +120,10 @@ class VC:
|
|||||||
raise FileNotFoundError("hubert_path not found.")
|
raise FileNotFoundError("hubert_path not found.")
|
||||||
|
|
||||||
if hasattr(input_audio_path, "name"):
|
if hasattr(input_audio_path, "name"):
|
||||||
input_audio_path = input_audio_path.name
|
input_audio_path = str(input_audio_path)
|
||||||
elif not isinstance(input_audio_path, str):
|
elif not isinstance(input_audio_path, str):
|
||||||
raise RuntimeError(f"pathlib.Path or str expected for input_audio_path. Got {type(input_audio_path)}")
|
raise RuntimeError(f"pathlib.Path or str expected for input_audio_path. Got {type(input_audio_path)}")
|
||||||
|
|
||||||
if not os.path.exists(input_audio_path):
|
if not os.path.exists(input_audio_path):
|
||||||
raise FileNotFoundError("input_audio_path not found.")
|
raise FileNotFoundError("input_audio_path not found.")
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
import os
|
import os
|
||||||
|
import torch
|
||||||
|
|
||||||
from fairseq import checkpoint_utils
|
from fairseq import checkpoint_utils
|
||||||
|
|
||||||
@@ -20,10 +21,23 @@ def get_index_path_from_model(sid):
|
|||||||
|
|
||||||
|
|
||||||
def load_hubert(config, hubert_path: str):
|
def load_hubert(config, hubert_path: str):
|
||||||
models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
|
# PyTorch 2.6+ changed weights_only default to True, which breaks fairseq checkpoints
|
||||||
[hubert_path],
|
# Monkey-patch torch.load to use weights_only=False for fairseq
|
||||||
suffix="",
|
original_torch_load = torch.load
|
||||||
)
|
|
||||||
|
def patched_torch_load(f, map_location=None, *args, **kwargs):
|
||||||
|
kwargs.setdefault('weights_only', False)
|
||||||
|
return original_torch_load(f, map_location=map_location, *args, **kwargs)
|
||||||
|
|
||||||
|
torch.load = patched_torch_load
|
||||||
|
try:
|
||||||
|
models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
|
||||||
|
[hubert_path],
|
||||||
|
suffix="",
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
torch.load = original_torch_load
|
||||||
|
|
||||||
hubert_model = models[0]
|
hubert_model = models[0]
|
||||||
hubert_model = hubert_model.to(config.device)
|
hubert_model = hubert_model.to(config.device)
|
||||||
hubert_model = hubert_model.half() if config.is_half else hubert_model.float()
|
hubert_model = hubert_model.half() if config.is_half else hubert_model.float()
|
||||||
|
|||||||
@@ -1,4 +1,6 @@
|
|||||||
import json
|
import json
|
||||||
|
import logging
|
||||||
|
import tempfile
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
@@ -11,6 +13,9 @@ from base64 import b64encode
|
|||||||
from rvc.modules.vc.modules import VC
|
from rvc.modules.vc.modules import VC
|
||||||
import glob
|
import glob
|
||||||
import os
|
import os
|
||||||
|
import soundfile as sf
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
router = APIRouter()
|
router = APIRouter()
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
@@ -74,3 +79,114 @@ def inference(
|
|||||||
"audio": b64encode(wv.read()).decode("utf-8"),
|
"audio": b64encode(wv.read()).decode("utf-8"),
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Process-wide cache for the Qwen3-TTS model; filled lazily by _get_tts_model().
_TTS_MODEL = None


def _get_tts_model():
    """Return the shared Qwen3-TTS model, loading it on first use.

    The model is loaded once and kept for the lifetime of the process instead
    of being rebuilt on every request. No lock is taken: two concurrent first
    requests may each load a copy, and the last assignment wins.
    """
    global _TTS_MODEL
    if _TTS_MODEL is None:
        # Imported lazily so the module can be imported without qwen_tts/torch
        # being touched until the endpoint is actually used.
        from qwen_tts import Qwen3TTSModel
        import torch

        use_cuda = torch.cuda.is_available()
        _TTS_MODEL = Qwen3TTSModel.from_pretrained(
            "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
            device_map="cuda:0" if use_cuda else "cpu",
            dtype=torch.bfloat16 if use_cuda else torch.float32,
            attn_implementation="flash_attention_2" if use_cuda else None,
        )
    return _TTS_MODEL


@router.post("/tts-inference")
def tts_inference(
    text: str = Body(..., description="The text to synthesize"),
    language: str = Body(
        "Chinese",
        description="Language code",
        enum=[
            "Chinese",
            "English",
            "Japanese",
            "Korean",
            "German",
            "French",
            "Russian",
            "Portuguese",
            "Spanish",
            "Italian",
        ],
    ),
    speaker: str = Body("Vivian", description="Speaker/voice profile name"),
    instruct: str = Body("", description="Natural language instruction for controlling timbre, emotion, and prosody"),
    modelpath: Path
    | UploadFile = Body(
        ...,
        enum=[
            os.path.basename(file)
            for file in glob.glob(f"{os.getenv('weight_root')}/*")
        ],
    ),
    res_type: str = Query("blob", enum=["blob", "json"]),
    sid: int = 0,
    f0_up_key: int = 0,
    f0_method: str = Query(
        "rmvpe", enum=["pm", "harvest", "dio", "rmvpe", "rmvpe_gpu"]
    ),
    f0_file: Path | None = None,
    index_file: Path | None = None,
    index_rate: float = 0.75,
    filter_radius: int = 3,
    resample_sr: int = 0,
    rms_mix_rate: float = 0.25,
    protect: float = 0.33,
):
    """
    Perform TTS using Qwen3-TTS followed by voice conversion inference.

    First generates speech from text using Qwen3-TTS, then applies voice conversion
    to transform the generated speech to the target voice.

    The synthesized speech is written to a temporary WAV file, which is passed
    by path to ``VC.vc_inference`` and removed again before the function
    returns or raises.

    Returns:
        A ``StreamingResponse`` with ``audio/wav`` content when ``res_type`` is
        ``"blob"``; otherwise a ``JSONResponse`` with the keys ``"time"`` and
        ``"audio"`` (base64-encoded WAV bytes).
    """
    # Generate TTS audio with the cached model (loaded on the first request).
    wavs, sr = _get_tts_model().generate_custom_voice(
        text=text,
        language=language,
        speaker=speaker,
        instruct=instruct,
    )

    # Only reserve a unique path here and close our own handle right away:
    # reopening a still-open NamedTemporaryFile by name is platform-dependent
    # (it fails on Windows), and soundfile opens the file by path below.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp_path = tmp.name

    try:
        # Written inside the try so the temp file is removed even if the write fails.
        sf.write(tmp_path, wavs[0], sr)

        # Run voice conversion on the generated audio
        vc = VC()
        vc.get_vc(modelpath)
        tgt_sr, audio_opt, times, _ = vc.vc_inference(
            sid,
            tmp_path,
            f0_up_key,
            f0_method,
            f0_file,
            index_file,
            index_rate,
            filter_radius,
            resample_sr,
            rms_mix_rate,
            protect,
        )

        wv = BytesIO()
        wavfile.write(wv, tgt_sr, audio_opt)
        # Rewind explicitly so the response does not depend on where the
        # writer left the stream position.
        wv.seek(0)
        logger.info("tts-inference timings: %s", times)

        if res_type == "blob":
            return responses.StreamingResponse(
                wv,
                media_type="audio/wav",
                headers={"Content-Disposition": "attachment; filename=tts_inference.wav"},
            )
        return JSONResponse(
            {
                "time": json.loads(json.dumps(times)),
                "audio": b64encode(wv.getvalue()).decode("utf-8"),
            }
        )
    finally:
        # Clean up temporary file; a missing file must not mask the real error.
        try:
            os.unlink(tmp_path)
        except FileNotFoundError:
            pass
|
||||||
|
|||||||
Reference in New Issue
Block a user