add endpoint for Qwen3-TTS -> RVC

This commit is contained in:
2026-03-01 15:32:48 -08:00
parent 7b284a6346
commit 1f9832ac0d
15 changed files with 1199 additions and 3337 deletions

0
assets-download.sh Normal file → Executable file
View File

2437
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -1,46 +1,81 @@
{ {
"train": { "train": {
"log_interval": 200, "log_interval": 200,
"seed": 1234, "seed": 1234,
"epochs": 20000, "epochs": 20000,
"learning_rate": 1e-4, "learning_rate": 0.0001,
"betas": [0.8, 0.99], "betas": [
"eps": 1e-9, 0.8,
"batch_size": 4, 0.99
"fp16_run": true, ],
"lr_decay": 0.999875, "eps": 1e-09,
"segment_size": 12800, "batch_size": 4,
"init_lr_ratio": 1, "fp16_run": false,
"warmup_epochs": 0, "lr_decay": 0.999875,
"c_mel": 45, "segment_size": 12800,
"c_kl": 1.0 "init_lr_ratio": 1,
}, "warmup_epochs": 0,
"data": { "c_mel": 45,
"max_wav_value": 32768.0, "c_kl": 1.0
"sampling_rate": 32000, },
"filter_length": 1024, "data": {
"hop_length": 320, "max_wav_value": 32768.0,
"win_length": 1024, "sampling_rate": 32000,
"n_mel_channels": 80, "filter_length": 1024,
"mel_fmin": 0.0, "hop_length": 320,
"mel_fmax": null "win_length": 1024,
}, "n_mel_channels": 80,
"model": { "mel_fmin": 0.0,
"inter_channels": 192, "mel_fmax": null
"hidden_channels": 192, },
"filter_channels": 768, "model": {
"n_heads": 2, "inter_channels": 192,
"n_layers": 6, "hidden_channels": 192,
"kernel_size": 3, "filter_channels": 768,
"p_dropout": 0, "n_heads": 2,
"resblock": "1", "n_layers": 6,
"resblock_kernel_sizes": [3,7,11], "kernel_size": 3,
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], "p_dropout": 0,
"upsample_rates": [10,4,2,2,2], "resblock": "1",
"upsample_initial_channel": 512, "resblock_kernel_sizes": [
"upsample_kernel_sizes": [16,16,4,4,4], 3,
"use_spectral_norm": false, 7,
"gin_channels": 256, 11
"spk_embed_dim": 109 ],
} "resblock_dilation_sizes": [
} [
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates": [
10,
4,
2,
2,
2
],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
16,
16,
4,
4,
4
],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}

View File

@@ -1,46 +1,79 @@
{ {
"train": { "train": {
"log_interval": 200, "log_interval": 200,
"seed": 1234, "seed": 1234,
"epochs": 20000, "epochs": 20000,
"learning_rate": 1e-4, "learning_rate": 0.0001,
"betas": [0.8, 0.99], "betas": [
"eps": 1e-9, 0.8,
"batch_size": 4, 0.99
"fp16_run": true, ],
"lr_decay": 0.999875, "eps": 1e-09,
"segment_size": 12800, "batch_size": 4,
"init_lr_ratio": 1, "fp16_run": false,
"warmup_epochs": 0, "lr_decay": 0.999875,
"c_mel": 45, "segment_size": 12800,
"c_kl": 1.0 "init_lr_ratio": 1,
}, "warmup_epochs": 0,
"data": { "c_mel": 45,
"max_wav_value": 32768.0, "c_kl": 1.0
"sampling_rate": 40000, },
"filter_length": 2048, "data": {
"hop_length": 400, "max_wav_value": 32768.0,
"win_length": 2048, "sampling_rate": 40000,
"n_mel_channels": 125, "filter_length": 2048,
"mel_fmin": 0.0, "hop_length": 400,
"mel_fmax": null "win_length": 2048,
}, "n_mel_channels": 125,
"model": { "mel_fmin": 0.0,
"inter_channels": 192, "mel_fmax": null
"hidden_channels": 192, },
"filter_channels": 768, "model": {
"n_heads": 2, "inter_channels": 192,
"n_layers": 6, "hidden_channels": 192,
"kernel_size": 3, "filter_channels": 768,
"p_dropout": 0, "n_heads": 2,
"resblock": "1", "n_layers": 6,
"resblock_kernel_sizes": [3,7,11], "kernel_size": 3,
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], "p_dropout": 0,
"upsample_rates": [10,10,2,2], "resblock": "1",
"upsample_initial_channel": 512, "resblock_kernel_sizes": [
"upsample_kernel_sizes": [16,16,4,4], 3,
"use_spectral_norm": false, 7,
"gin_channels": 256, 11
"spk_embed_dim": 109 ],
} "resblock_dilation_sizes": [
} [
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates": [
10,
10,
2,
2
],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
16,
16,
4,
4
],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}

View File

@@ -1,46 +1,81 @@
{ {
"train": { "train": {
"log_interval": 200, "log_interval": 200,
"seed": 1234, "seed": 1234,
"epochs": 20000, "epochs": 20000,
"learning_rate": 1e-4, "learning_rate": 0.0001,
"betas": [0.8, 0.99], "betas": [
"eps": 1e-9, 0.8,
"batch_size": 4, 0.99
"fp16_run": true, ],
"lr_decay": 0.999875, "eps": 1e-09,
"segment_size": 11520, "batch_size": 4,
"init_lr_ratio": 1, "fp16_run": false,
"warmup_epochs": 0, "lr_decay": 0.999875,
"c_mel": 45, "segment_size": 11520,
"c_kl": 1.0 "init_lr_ratio": 1,
}, "warmup_epochs": 0,
"data": { "c_mel": 45,
"max_wav_value": 32768.0, "c_kl": 1.0
"sampling_rate": 48000, },
"filter_length": 2048, "data": {
"hop_length": 480, "max_wav_value": 32768.0,
"win_length": 2048, "sampling_rate": 48000,
"n_mel_channels": 128, "filter_length": 2048,
"mel_fmin": 0.0, "hop_length": 480,
"mel_fmax": null "win_length": 2048,
}, "n_mel_channels": 128,
"model": { "mel_fmin": 0.0,
"inter_channels": 192, "mel_fmax": null
"hidden_channels": 192, },
"filter_channels": 768, "model": {
"n_heads": 2, "inter_channels": 192,
"n_layers": 6, "hidden_channels": 192,
"kernel_size": 3, "filter_channels": 768,
"p_dropout": 0, "n_heads": 2,
"resblock": "1", "n_layers": 6,
"resblock_kernel_sizes": [3,7,11], "kernel_size": 3,
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], "p_dropout": 0,
"upsample_rates": [10,6,2,2,2], "resblock": "1",
"upsample_initial_channel": 512, "resblock_kernel_sizes": [
"upsample_kernel_sizes": [16,16,4,4,4], 3,
"use_spectral_norm": false, 7,
"gin_channels": 256, 11
"spk_embed_dim": 109 ],
} "resblock_dilation_sizes": [
} [
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates": [
10,
6,
2,
2,
2
],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
16,
16,
4,
4,
4
],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}

View File

@@ -1,46 +1,79 @@
{ {
"train": { "train": {
"log_interval": 200, "log_interval": 200,
"seed": 1234, "seed": 1234,
"epochs": 20000, "epochs": 20000,
"learning_rate": 1e-4, "learning_rate": 0.0001,
"betas": [0.8, 0.99], "betas": [
"eps": 1e-9, 0.8,
"batch_size": 4, 0.99
"fp16_run": true, ],
"lr_decay": 0.999875, "eps": 1e-09,
"segment_size": 12800, "batch_size": 4,
"init_lr_ratio": 1, "fp16_run": false,
"warmup_epochs": 0, "lr_decay": 0.999875,
"c_mel": 45, "segment_size": 12800,
"c_kl": 1.0 "init_lr_ratio": 1,
}, "warmup_epochs": 0,
"data": { "c_mel": 45,
"max_wav_value": 32768.0, "c_kl": 1.0
"sampling_rate": 32000, },
"filter_length": 1024, "data": {
"hop_length": 320, "max_wav_value": 32768.0,
"win_length": 1024, "sampling_rate": 32000,
"n_mel_channels": 80, "filter_length": 1024,
"mel_fmin": 0.0, "hop_length": 320,
"mel_fmax": null "win_length": 1024,
}, "n_mel_channels": 80,
"model": { "mel_fmin": 0.0,
"inter_channels": 192, "mel_fmax": null
"hidden_channels": 192, },
"filter_channels": 768, "model": {
"n_heads": 2, "inter_channels": 192,
"n_layers": 6, "hidden_channels": 192,
"kernel_size": 3, "filter_channels": 768,
"p_dropout": 0, "n_heads": 2,
"resblock": "1", "n_layers": 6,
"resblock_kernel_sizes": [3,7,11], "kernel_size": 3,
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], "p_dropout": 0,
"upsample_rates": [10,8,2,2], "resblock": "1",
"upsample_initial_channel": 512, "resblock_kernel_sizes": [
"upsample_kernel_sizes": [20,16,4,4], 3,
"use_spectral_norm": false, 7,
"gin_channels": 256, 11
"spk_embed_dim": 109 ],
} "resblock_dilation_sizes": [
} [
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates": [
10,
8,
2,
2
],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
20,
16,
4,
4
],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}

View File

@@ -1,46 +1,79 @@
{ {
"train": { "train": {
"log_interval": 200, "log_interval": 200,
"seed": 1234, "seed": 1234,
"epochs": 20000, "epochs": 20000,
"learning_rate": 1e-4, "learning_rate": 0.0001,
"betas": [0.8, 0.99], "betas": [
"eps": 1e-9, 0.8,
"batch_size": 4, 0.99
"fp16_run": true, ],
"lr_decay": 0.999875, "eps": 1e-09,
"segment_size": 17280, "batch_size": 4,
"init_lr_ratio": 1, "fp16_run": false,
"warmup_epochs": 0, "lr_decay": 0.999875,
"c_mel": 45, "segment_size": 17280,
"c_kl": 1.0 "init_lr_ratio": 1,
}, "warmup_epochs": 0,
"data": { "c_mel": 45,
"max_wav_value": 32768.0, "c_kl": 1.0
"sampling_rate": 48000, },
"filter_length": 2048, "data": {
"hop_length": 480, "max_wav_value": 32768.0,
"win_length": 2048, "sampling_rate": 48000,
"n_mel_channels": 128, "filter_length": 2048,
"mel_fmin": 0.0, "hop_length": 480,
"mel_fmax": null "win_length": 2048,
}, "n_mel_channels": 128,
"model": { "mel_fmin": 0.0,
"inter_channels": 192, "mel_fmax": null
"hidden_channels": 192, },
"filter_channels": 768, "model": {
"n_heads": 2, "inter_channels": 192,
"n_layers": 6, "hidden_channels": 192,
"kernel_size": 3, "filter_channels": 768,
"p_dropout": 0, "n_heads": 2,
"resblock": "1", "n_layers": 6,
"resblock_kernel_sizes": [3,7,11], "kernel_size": 3,
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], "p_dropout": 0,
"upsample_rates": [12,10,2,2], "resblock": "1",
"upsample_initial_channel": 512, "resblock_kernel_sizes": [
"upsample_kernel_sizes": [24,20,4,4], 3,
"use_spectral_norm": false, 7,
"gin_channels": 256, 11
"spk_embed_dim": 109 ],
} "resblock_dilation_sizes": [
} [
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates": [
12,
10,
2,
2
],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
24,
20,
4,
4
],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}

View File

@@ -8,7 +8,7 @@ from tqdm import tqdm
def load_inputs(path, device, is_half=False): def load_inputs(path, device, is_half=False):
parm = torch.load(path, map_location=torch.device("cpu")) parm = torch.load(path, map_location=torch.device("cpu"), weights_only=False)
for key in parm.keys(): for key in parm.keys():
parm[key] = parm[key].to(device) parm[key] = parm[key].to(device)
if is_half and parm[key].dtype == torch.float32: if is_half and parm[key].dtype == torch.float32:

View File

@@ -5,7 +5,7 @@ def get_rmvpe(model_path="assets/rmvpe/rmvpe.pt", device=torch.device("cpu")):
from infer.lib.rmvpe import E2E from infer.lib.rmvpe import E2E
model = E2E(4, 1, (2, 2)) model = E2E(4, 1, (2, 2))
ckpt = torch.load(model_path, map_location=device) ckpt = torch.load(model_path, map_location=device, weights_only=False)
model.load_state_dict(ckpt) model.load_state_dict(ckpt)
model.eval() model.eval()
model = model.to(device) model = model.to(device)

View File

@@ -9,7 +9,7 @@ def get_synthesizer(pth_path, device=torch.device("cpu")):
SynthesizerTrnMs768NSFsid_nono, SynthesizerTrnMs768NSFsid_nono,
) )
cpt = torch.load(pth_path, map_location=torch.device("cpu")) cpt = torch.load(pth_path, map_location=torch.device("cpu"), weights_only=False)
# tgt_sr = cpt["config"][-1] # tgt_sr = cpt["config"][-1]
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
if_f0 = cpt.get("f0", 1) if_f0 = cpt.get("f0", 1)

File diff suppressed because it is too large Load Diff

View File

@@ -39,7 +39,7 @@ class AudioPreprocess:
else CascadedNet( else CascadedNet(
self.mp.param["bins"] * 2, 64 if "DeReverb" in model_path else 48 self.mp.param["bins"] * 2, 64 if "DeReverb" in model_path else 48
) )
.load_state_dict(torch.load(model_path, map_location="cpu")) .load_state_dict(torch.load(model_path, map_location="cpu", weights_only=False))
.eval() .eval()
) )
if self.config.is_half: if self.config.is_half:

View File

@@ -120,10 +120,10 @@ class VC:
raise FileNotFoundError("hubert_path not found.") raise FileNotFoundError("hubert_path not found.")
if hasattr(input_audio_path, "name"): if hasattr(input_audio_path, "name"):
input_audio_path = input_audio_path.name input_audio_path = str(input_audio_path)
elif not isinstance(input_audio_path, str): elif not isinstance(input_audio_path, str):
raise RuntimeError(f"pathlib.Path or str expected for input_audio_path. Got {type(input_audio_path)}") raise RuntimeError(f"pathlib.Path or str expected for input_audio_path. Got {type(input_audio_path)}")
if not os.path.exists(input_audio_path): if not os.path.exists(input_audio_path):
raise FileNotFoundError("input_audio_path not found.") raise FileNotFoundError("input_audio_path not found.")

View File

@@ -1,4 +1,5 @@
import os import os
import torch
from fairseq import checkpoint_utils from fairseq import checkpoint_utils
@@ -20,10 +21,23 @@ def get_index_path_from_model(sid):
def load_hubert(config, hubert_path: str): def load_hubert(config, hubert_path: str):
models, _, _ = checkpoint_utils.load_model_ensemble_and_task( # PyTorch 2.6+ changed weights_only default to True, which breaks fairseq checkpoints
[hubert_path], # Monkey-patch torch.load to use weights_only=False for fairseq
suffix="", original_torch_load = torch.load
)
def patched_torch_load(f, map_location=None, *args, **kwargs):
kwargs.setdefault('weights_only', False)
return original_torch_load(f, map_location=map_location, *args, **kwargs)
torch.load = patched_torch_load
try:
models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
[hubert_path],
suffix="",
)
finally:
torch.load = original_torch_load
hubert_model = models[0] hubert_model = models[0]
hubert_model = hubert_model.to(config.device) hubert_model = hubert_model.to(config.device)
hubert_model = hubert_model.half() if config.is_half else hubert_model.float() hubert_model = hubert_model.half() if config.is_half else hubert_model.float()

View File

@@ -1,4 +1,6 @@
import json import json
import logging
import tempfile
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
@@ -11,6 +13,9 @@ from base64 import b64encode
from rvc.modules.vc.modules import VC from rvc.modules.vc.modules import VC
import glob import glob
import os import os
import soundfile as sf
logger = logging.getLogger(__name__)
router = APIRouter() router = APIRouter()
from dotenv import load_dotenv from dotenv import load_dotenv
@@ -74,3 +79,114 @@ def inference(
"audio": b64encode(wv.read()).decode("utf-8"), "audio": b64encode(wv.read()).decode("utf-8"),
} }
) )
def _get_tts_model():
    """Lazily load and memoise the Qwen3-TTS model.

    Loading the checkpoint takes seconds and significant (V)RAM, so it must
    not be repeated on every request; the instance is cached on the function
    after the first call.
    """
    model = getattr(_get_tts_model, "_model", None)
    if model is None:
        from qwen_tts import Qwen3TTSModel
        import torch

        use_cuda = torch.cuda.is_available()
        model = Qwen3TTSModel.from_pretrained(
            "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
            device_map="cuda:0" if use_cuda else "cpu",
            dtype=torch.bfloat16 if use_cuda else torch.float32,
            # NOTE(review): assumes flash-attn is installed whenever CUDA is
            # available — confirm, otherwise fall back to default attention.
            attn_implementation="flash_attention_2" if use_cuda else None,
        )
        _get_tts_model._model = model
    return model


@router.post("/tts-inference")
def tts_inference(
    text: str = Body(..., description="The text to synthesize"),
    language: str = Body(
        "Chinese",
        description="Language code",
        enum=[
            "Chinese",
            "English",
            "Japanese",
            "Korean",
            "German",
            "French",
            "Russian",
            "Portuguese",
            "Spanish",
            "Italian",
        ],
    ),
    speaker: str = Body("Vivian", description="Speaker/voice profile name"),
    instruct: str = Body("", description="Natural language instruction for controlling timbre, emotion, and prosody"),
    modelpath: Path
    | UploadFile = Body(
        ...,
        enum=[
            os.path.basename(file)
            for file in glob.glob(f"{os.getenv('weight_root')}/*")
        ],
    ),
    res_type: str = Query("blob", enum=["blob", "json"]),
    sid: int = 0,
    f0_up_key: int = 0,
    f0_method: str = Query(
        "rmvpe", enum=["pm", "harvest", "dio", "rmvpe", "rmvpe_gpu"]
    ),
    f0_file: Path | None = None,
    index_file: Path | None = None,
    index_rate: float = 0.75,
    filter_radius: int = 3,
    resample_sr: int = 0,
    rms_mix_rate: float = 0.25,
    protect: float = 0.33,
):
    """
    Synthesize ``text`` with Qwen3-TTS, then convert the result to the target
    voice with RVC.

    The TTS model is loaded once and cached (see ``_get_tts_model``); the RVC
    parameters and the blob/json response shapes mirror the plain /inference
    route so clients can switch between the two endpoints.

    Returns:
        A streamed WAV attachment when ``res_type == "blob"``, otherwise a
        JSON object with the timing info and the base64-encoded WAV audio.
    """
    # Text -> speech. First call loads the model; subsequent calls reuse it.
    wavs, sr = _get_tts_model().generate_custom_voice(
        text=text,
        language=language,
        speaker=speaker,
        instruct=instruct,
    )

    # RVC's inference path expects an audio file on disk, so stage the TTS
    # output in a temporary WAV file. Writing happens inside the try so the
    # file is removed even if soundfile fails mid-write.
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp_path = tmp.name
    try:
        sf.write(tmp_path, wavs[0], sr)
        tmp.close()

        # Voice-convert the generated speech to the requested model's voice.
        vc = VC()
        vc.get_vc(modelpath)
        tgt_sr, audio_opt, times, _ = vc.vc_inference(
            sid,
            tmp_path,
            f0_up_key,
            f0_method,
            f0_file,
            index_file,
            index_rate,
            filter_radius,
            resample_sr,
            rms_mix_rate,
            protect,
        )

        # scipy's wavfile.write seeks file-like objects back to the start, so
        # ``wv`` is positioned for streaming / reading below.
        wavfile.write(wv := BytesIO(), tgt_sr, audio_opt)
        logger.info("tts-inference timings: %s", times)
        if res_type == "blob":
            return responses.StreamingResponse(
                wv,
                media_type="audio/wav",
                headers={"Content-Disposition": "attachment; filename=tts_inference.wav"},
            )
        return JSONResponse(
            {
                # Round-trip through json to guarantee serializability of the
                # timing structure (same pattern as the /inference route).
                "time": json.loads(json.dumps(times)),
                "audio": b64encode(wv.read()).decode("utf-8"),
            }
        )
    finally:
        # Clean up the temporary TTS file in every path.
        tmp.close()
        os.unlink(tmp_path)