add endpoint for Qwen3-TTS -> RVC

This commit is contained in:
2026-03-01 15:32:48 -08:00
parent 7b284a6346
commit 1f9832ac0d
15 changed files with 1199 additions and 3337 deletions

0
assets-download.sh Normal file → Executable file
View File

2437
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -1,46 +1,81 @@
{
"train": {
"log_interval": 200,
"seed": 1234,
"epochs": 20000,
"learning_rate": 1e-4,
"betas": [0.8, 0.99],
"eps": 1e-9,
"batch_size": 4,
"fp16_run": true,
"lr_decay": 0.999875,
"segment_size": 12800,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0
},
"data": {
"max_wav_value": 32768.0,
"sampling_rate": 32000,
"filter_length": 1024,
"hop_length": 320,
"win_length": 1024,
"n_mel_channels": 80,
"mel_fmin": 0.0,
"mel_fmax": null
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0,
"resblock": "1",
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"upsample_rates": [10,4,2,2,2],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [16,16,4,4,4],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}
"train": {
"log_interval": 200,
"seed": 1234,
"epochs": 20000,
"learning_rate": 0.0001,
"betas": [
0.8,
0.99
],
"eps": 1e-09,
"batch_size": 4,
"fp16_run": false,
"lr_decay": 0.999875,
"segment_size": 12800,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0
},
"data": {
"max_wav_value": 32768.0,
"sampling_rate": 32000,
"filter_length": 1024,
"hop_length": 320,
"win_length": 1024,
"n_mel_channels": 80,
"mel_fmin": 0.0,
"mel_fmax": null
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0,
"resblock": "1",
"resblock_kernel_sizes": [
3,
7,
11
],
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates": [
10,
4,
2,
2,
2
],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
16,
16,
4,
4,
4
],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}

View File

@@ -1,46 +1,79 @@
{
"train": {
"log_interval": 200,
"seed": 1234,
"epochs": 20000,
"learning_rate": 1e-4,
"betas": [0.8, 0.99],
"eps": 1e-9,
"batch_size": 4,
"fp16_run": true,
"lr_decay": 0.999875,
"segment_size": 12800,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0
},
"data": {
"max_wav_value": 32768.0,
"sampling_rate": 40000,
"filter_length": 2048,
"hop_length": 400,
"win_length": 2048,
"n_mel_channels": 125,
"mel_fmin": 0.0,
"mel_fmax": null
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0,
"resblock": "1",
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"upsample_rates": [10,10,2,2],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [16,16,4,4],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}
"train": {
"log_interval": 200,
"seed": 1234,
"epochs": 20000,
"learning_rate": 0.0001,
"betas": [
0.8,
0.99
],
"eps": 1e-09,
"batch_size": 4,
"fp16_run": false,
"lr_decay": 0.999875,
"segment_size": 12800,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0
},
"data": {
"max_wav_value": 32768.0,
"sampling_rate": 40000,
"filter_length": 2048,
"hop_length": 400,
"win_length": 2048,
"n_mel_channels": 125,
"mel_fmin": 0.0,
"mel_fmax": null
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0,
"resblock": "1",
"resblock_kernel_sizes": [
3,
7,
11
],
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates": [
10,
10,
2,
2
],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
16,
16,
4,
4
],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}

View File

@@ -1,46 +1,81 @@
{
"train": {
"log_interval": 200,
"seed": 1234,
"epochs": 20000,
"learning_rate": 1e-4,
"betas": [0.8, 0.99],
"eps": 1e-9,
"batch_size": 4,
"fp16_run": true,
"lr_decay": 0.999875,
"segment_size": 11520,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0
},
"data": {
"max_wav_value": 32768.0,
"sampling_rate": 48000,
"filter_length": 2048,
"hop_length": 480,
"win_length": 2048,
"n_mel_channels": 128,
"mel_fmin": 0.0,
"mel_fmax": null
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0,
"resblock": "1",
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"upsample_rates": [10,6,2,2,2],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [16,16,4,4,4],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}
"train": {
"log_interval": 200,
"seed": 1234,
"epochs": 20000,
"learning_rate": 0.0001,
"betas": [
0.8,
0.99
],
"eps": 1e-09,
"batch_size": 4,
"fp16_run": false,
"lr_decay": 0.999875,
"segment_size": 11520,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0
},
"data": {
"max_wav_value": 32768.0,
"sampling_rate": 48000,
"filter_length": 2048,
"hop_length": 480,
"win_length": 2048,
"n_mel_channels": 128,
"mel_fmin": 0.0,
"mel_fmax": null
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0,
"resblock": "1",
"resblock_kernel_sizes": [
3,
7,
11
],
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates": [
10,
6,
2,
2,
2
],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
16,
16,
4,
4,
4
],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}

View File

@@ -1,46 +1,79 @@
{
"train": {
"log_interval": 200,
"seed": 1234,
"epochs": 20000,
"learning_rate": 1e-4,
"betas": [0.8, 0.99],
"eps": 1e-9,
"batch_size": 4,
"fp16_run": true,
"lr_decay": 0.999875,
"segment_size": 12800,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0
},
"data": {
"max_wav_value": 32768.0,
"sampling_rate": 32000,
"filter_length": 1024,
"hop_length": 320,
"win_length": 1024,
"n_mel_channels": 80,
"mel_fmin": 0.0,
"mel_fmax": null
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0,
"resblock": "1",
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"upsample_rates": [10,8,2,2],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [20,16,4,4],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}
"train": {
"log_interval": 200,
"seed": 1234,
"epochs": 20000,
"learning_rate": 0.0001,
"betas": [
0.8,
0.99
],
"eps": 1e-09,
"batch_size": 4,
"fp16_run": false,
"lr_decay": 0.999875,
"segment_size": 12800,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0
},
"data": {
"max_wav_value": 32768.0,
"sampling_rate": 32000,
"filter_length": 1024,
"hop_length": 320,
"win_length": 1024,
"n_mel_channels": 80,
"mel_fmin": 0.0,
"mel_fmax": null
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0,
"resblock": "1",
"resblock_kernel_sizes": [
3,
7,
11
],
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates": [
10,
8,
2,
2
],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
20,
16,
4,
4
],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}

View File

@@ -1,46 +1,79 @@
{
"train": {
"log_interval": 200,
"seed": 1234,
"epochs": 20000,
"learning_rate": 1e-4,
"betas": [0.8, 0.99],
"eps": 1e-9,
"batch_size": 4,
"fp16_run": true,
"lr_decay": 0.999875,
"segment_size": 17280,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0
},
"data": {
"max_wav_value": 32768.0,
"sampling_rate": 48000,
"filter_length": 2048,
"hop_length": 480,
"win_length": 2048,
"n_mel_channels": 128,
"mel_fmin": 0.0,
"mel_fmax": null
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0,
"resblock": "1",
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"upsample_rates": [12,10,2,2],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [24,20,4,4],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}
"train": {
"log_interval": 200,
"seed": 1234,
"epochs": 20000,
"learning_rate": 0.0001,
"betas": [
0.8,
0.99
],
"eps": 1e-09,
"batch_size": 4,
"fp16_run": false,
"lr_decay": 0.999875,
"segment_size": 17280,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0
},
"data": {
"max_wav_value": 32768.0,
"sampling_rate": 48000,
"filter_length": 2048,
"hop_length": 480,
"win_length": 2048,
"n_mel_channels": 128,
"mel_fmin": 0.0,
"mel_fmax": null
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0,
"resblock": "1",
"resblock_kernel_sizes": [
3,
7,
11
],
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates": [
12,
10,
2,
2
],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
24,
20,
4,
4
],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}

View File

@@ -8,7 +8,7 @@ from tqdm import tqdm
def load_inputs(path, device, is_half=False):
parm = torch.load(path, map_location=torch.device("cpu"))
parm = torch.load(path, map_location=torch.device("cpu"), weights_only=False)
for key in parm.keys():
parm[key] = parm[key].to(device)
if is_half and parm[key].dtype == torch.float32:

View File

@@ -5,7 +5,7 @@ def get_rmvpe(model_path="assets/rmvpe/rmvpe.pt", device=torch.device("cpu")):
from infer.lib.rmvpe import E2E
model = E2E(4, 1, (2, 2))
ckpt = torch.load(model_path, map_location=device)
ckpt = torch.load(model_path, map_location=device, weights_only=False)
model.load_state_dict(ckpt)
model.eval()
model = model.to(device)

View File

@@ -9,7 +9,7 @@ def get_synthesizer(pth_path, device=torch.device("cpu")):
SynthesizerTrnMs768NSFsid_nono,
)
cpt = torch.load(pth_path, map_location=torch.device("cpu"))
cpt = torch.load(pth_path, map_location=torch.device("cpu"), weights_only=False)
# tgt_sr = cpt["config"][-1]
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
if_f0 = cpt.get("f0", 1)

File diff suppressed because it is too large Load Diff

View File

@@ -39,7 +39,7 @@ class AudioPreprocess:
else CascadedNet(
self.mp.param["bins"] * 2, 64 if "DeReverb" in model_path else 48
)
.load_state_dict(torch.load(model_path, map_location="cpu"))
.load_state_dict(torch.load(model_path, map_location="cpu", weights_only=False))
.eval()
)
if self.config.is_half:

View File

@@ -120,10 +120,10 @@ class VC:
raise FileNotFoundError("hubert_path not found.")
if hasattr(input_audio_path, "name"):
input_audio_path = input_audio_path.name
input_audio_path = str(input_audio_path)
elif not isinstance(input_audio_path, str):
raise RuntimeError(f"pathlib.Path or str expected for input_audio_path. Got {type(input_audio_path)}")
if not os.path.exists(input_audio_path):
raise FileNotFoundError("input_audio_path not found.")

View File

@@ -1,4 +1,5 @@
import os
import torch
from fairseq import checkpoint_utils
@@ -20,10 +21,23 @@ def get_index_path_from_model(sid):
def load_hubert(config, hubert_path: str):
models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
[hubert_path],
suffix="",
)
# PyTorch 2.6+ changed weights_only default to True, which breaks fairseq checkpoints
# Monkey-patch torch.load to use weights_only=False for fairseq
original_torch_load = torch.load
def patched_torch_load(f, map_location=None, *args, **kwargs):
kwargs.setdefault('weights_only', False)
return original_torch_load(f, map_location=map_location, *args, **kwargs)
torch.load = patched_torch_load
try:
models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
[hubert_path],
suffix="",
)
finally:
torch.load = original_torch_load
hubert_model = models[0]
hubert_model = hubert_model.to(config.device)
hubert_model = hubert_model.half() if config.is_half else hubert_model.float()

View File

@@ -1,4 +1,6 @@
import json
import logging
import tempfile
from io import BytesIO
from pathlib import Path
@@ -11,6 +13,9 @@ from base64 import b64encode
from rvc.modules.vc.modules import VC
import glob
import os
import soundfile as sf
logger = logging.getLogger(__name__)
router = APIRouter()
from dotenv import load_dotenv
@@ -74,3 +79,114 @@ def inference(
"audio": b64encode(wv.read()).decode("utf-8"),
}
)
def _get_tts_model():
    """Lazily load and cache the Qwen3-TTS model.

    Loading the model is expensive (weight load + device transfer), so it is
    done once per process and memoized on the function object instead of being
    re-loaded on every request, which the previous version did.
    """
    model = getattr(_get_tts_model, "_cached_model", None)
    if model is None:
        from qwen_tts import Qwen3TTSModel
        import torch

        use_cuda = torch.cuda.is_available()
        model = Qwen3TTSModel.from_pretrained(
            "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
            device_map="cuda:0" if use_cuda else "cpu",
            dtype=torch.bfloat16 if use_cuda else torch.float32,
            attn_implementation="flash_attention_2" if use_cuda else None,
        )
        _get_tts_model._cached_model = model
    return model


@router.post("/tts-inference")
def tts_inference(
    text: str = Body(..., description="The text to synthesize"),
    language: str = Body(
        "Chinese",
        description="Language code",
        enum=[
            "Chinese",
            "English",
            "Japanese",
            "Korean",
            "German",
            "French",
            "Russian",
            "Portuguese",
            "Spanish",
            "Italian",
        ],
    ),
    speaker: str = Body("Vivian", description="Speaker/voice profile name"),
    instruct: str = Body("", description="Natural language instruction for controlling timbre, emotion, and prosody"),
    modelpath: Path
    | UploadFile = Body(
        ...,
        enum=[
            os.path.basename(file)
            for file in glob.glob(f"{os.getenv('weight_root')}/*")
        ],
    ),
    res_type: str = Query("blob", enum=["blob", "json"]),
    sid: int = 0,
    f0_up_key: int = 0,
    f0_method: str = Query(
        "rmvpe", enum=["pm", "harvest", "dio", "rmvpe", "rmvpe_gpu"]
    ),
    f0_file: Path | None = None,
    index_file: Path | None = None,
    index_rate: float = 0.75,
    filter_radius: int = 3,
    resample_sr: int = 0,
    rms_mix_rate: float = 0.25,
    protect: float = 0.33,
):
    """
    Perform TTS using Qwen3-TTS followed by voice conversion inference.

    First generates speech from text using Qwen3-TTS, then applies RVC voice
    conversion to transform the generated speech to the target voice.

    Returns a streaming WAV attachment when ``res_type == "blob"``, otherwise
    a JSON payload with the timing info and base64-encoded WAV audio.
    """
    tts_model = _get_tts_model()

    # Generate TTS audio. generate_custom_voice returns a batch of waveforms
    # plus their sample rate; only a single utterance is synthesized here,
    # hence wavs[0] below.
    wavs, sr = tts_model.generate_custom_voice(
        text=text,
        language=language,
        speaker=speaker,
        instruct=instruct,
    )

    # vc_inference expects a file path, so stage the TTS output in a temp
    # file. delete=False because the closed handle must not remove the file
    # (and a second open of an undeleted NamedTemporaryFile fails on
    # Windows); we unlink it ourselves in the finally block. The write
    # happens inside the try so a failed write cannot leak the file.
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp_path = tmp.name
    tmp.close()
    try:
        sf.write(tmp_path, wavs[0], sr)

        # Run voice conversion on the generated audio.
        vc = VC()
        vc.get_vc(modelpath)
        tgt_sr, audio_opt, times, _ = vc.vc_inference(
            sid,
            tmp_path,
            f0_up_key,
            f0_method,
            f0_file,
            index_file,
            index_rate,
            filter_radius,
            resample_sr,
            rms_mix_rate,
            protect,
        )
        wavfile.write(wv := BytesIO(), tgt_sr, audio_opt)
        # Rewind: wavfile.write leaves the cursor at end-of-buffer, so
        # without this seek both response variants would emit an empty body.
        wv.seek(0)
        logger.info("tts_inference timings: %s", times)
        if res_type == "blob":
            return responses.StreamingResponse(
                wv,
                media_type="audio/wav",
                headers={"Content-Disposition": "attachment; filename=tts_inference.wav"},
            )
        else:
            return JSONResponse(
                {
                    # round-trip through json to coerce any non-JSON-native
                    # values (e.g. tuples) into plain JSON types
                    "time": json.loads(json.dumps(times)),
                    "audio": b64encode(wv.read()).decode("utf-8"),
                }
            )
    finally:
        # Clean up the temporary TTS file.
        os.unlink(tmp_path)