add endpoint for Qwen3-TTS -> RVC

This commit is contained in:
2026-03-01 15:32:48 -08:00
parent 7b284a6346
commit 1f9832ac0d
15 changed files with 1199 additions and 3337 deletions

0
assets-download.sh Normal file → Executable file
View File

2437
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -1,46 +1,81 @@
{ {
"train": { "train": {
"log_interval": 200, "log_interval": 200,
"seed": 1234, "seed": 1234,
"epochs": 20000, "epochs": 20000,
"learning_rate": 1e-4, "learning_rate": 0.0001,
"betas": [0.8, 0.99], "betas": [
"eps": 1e-9, 0.8,
"batch_size": 4, 0.99
"fp16_run": true, ],
"lr_decay": 0.999875, "eps": 1e-09,
"segment_size": 12800, "batch_size": 4,
"init_lr_ratio": 1, "fp16_run": false,
"warmup_epochs": 0, "lr_decay": 0.999875,
"c_mel": 45, "segment_size": 12800,
"c_kl": 1.0 "init_lr_ratio": 1,
}, "warmup_epochs": 0,
"data": { "c_mel": 45,
"max_wav_value": 32768.0, "c_kl": 1.0
"sampling_rate": 32000, },
"filter_length": 1024, "data": {
"hop_length": 320, "max_wav_value": 32768.0,
"win_length": 1024, "sampling_rate": 32000,
"n_mel_channels": 80, "filter_length": 1024,
"mel_fmin": 0.0, "hop_length": 320,
"mel_fmax": null "win_length": 1024,
}, "n_mel_channels": 80,
"model": { "mel_fmin": 0.0,
"inter_channels": 192, "mel_fmax": null
"hidden_channels": 192, },
"filter_channels": 768, "model": {
"n_heads": 2, "inter_channels": 192,
"n_layers": 6, "hidden_channels": 192,
"kernel_size": 3, "filter_channels": 768,
"p_dropout": 0, "n_heads": 2,
"resblock": "1", "n_layers": 6,
"resblock_kernel_sizes": [3,7,11], "kernel_size": 3,
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], "p_dropout": 0,
"upsample_rates": [10,4,2,2,2], "resblock": "1",
"upsample_initial_channel": 512, "resblock_kernel_sizes": [
"upsample_kernel_sizes": [16,16,4,4,4], 3,
"use_spectral_norm": false, 7,
"gin_channels": 256, 11
"spk_embed_dim": 109 ],
} "resblock_dilation_sizes": [
} [
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates": [
10,
4,
2,
2,
2
],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
16,
16,
4,
4,
4
],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}

View File

@@ -1,46 +1,79 @@
{ {
"train": { "train": {
"log_interval": 200, "log_interval": 200,
"seed": 1234, "seed": 1234,
"epochs": 20000, "epochs": 20000,
"learning_rate": 1e-4, "learning_rate": 0.0001,
"betas": [0.8, 0.99], "betas": [
"eps": 1e-9, 0.8,
"batch_size": 4, 0.99
"fp16_run": true, ],
"lr_decay": 0.999875, "eps": 1e-09,
"segment_size": 12800, "batch_size": 4,
"init_lr_ratio": 1, "fp16_run": false,
"warmup_epochs": 0, "lr_decay": 0.999875,
"c_mel": 45, "segment_size": 12800,
"c_kl": 1.0 "init_lr_ratio": 1,
}, "warmup_epochs": 0,
"data": { "c_mel": 45,
"max_wav_value": 32768.0, "c_kl": 1.0
"sampling_rate": 40000, },
"filter_length": 2048, "data": {
"hop_length": 400, "max_wav_value": 32768.0,
"win_length": 2048, "sampling_rate": 40000,
"n_mel_channels": 125, "filter_length": 2048,
"mel_fmin": 0.0, "hop_length": 400,
"mel_fmax": null "win_length": 2048,
}, "n_mel_channels": 125,
"model": { "mel_fmin": 0.0,
"inter_channels": 192, "mel_fmax": null
"hidden_channels": 192, },
"filter_channels": 768, "model": {
"n_heads": 2, "inter_channels": 192,
"n_layers": 6, "hidden_channels": 192,
"kernel_size": 3, "filter_channels": 768,
"p_dropout": 0, "n_heads": 2,
"resblock": "1", "n_layers": 6,
"resblock_kernel_sizes": [3,7,11], "kernel_size": 3,
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], "p_dropout": 0,
"upsample_rates": [10,10,2,2], "resblock": "1",
"upsample_initial_channel": 512, "resblock_kernel_sizes": [
"upsample_kernel_sizes": [16,16,4,4], 3,
"use_spectral_norm": false, 7,
"gin_channels": 256, 11
"spk_embed_dim": 109 ],
} "resblock_dilation_sizes": [
} [
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates": [
10,
10,
2,
2
],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
16,
16,
4,
4
],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}

View File

@@ -1,46 +1,81 @@
{ {
"train": { "train": {
"log_interval": 200, "log_interval": 200,
"seed": 1234, "seed": 1234,
"epochs": 20000, "epochs": 20000,
"learning_rate": 1e-4, "learning_rate": 0.0001,
"betas": [0.8, 0.99], "betas": [
"eps": 1e-9, 0.8,
"batch_size": 4, 0.99
"fp16_run": true, ],
"lr_decay": 0.999875, "eps": 1e-09,
"segment_size": 11520, "batch_size": 4,
"init_lr_ratio": 1, "fp16_run": false,
"warmup_epochs": 0, "lr_decay": 0.999875,
"c_mel": 45, "segment_size": 11520,
"c_kl": 1.0 "init_lr_ratio": 1,
}, "warmup_epochs": 0,
"data": { "c_mel": 45,
"max_wav_value": 32768.0, "c_kl": 1.0
"sampling_rate": 48000, },
"filter_length": 2048, "data": {
"hop_length": 480, "max_wav_value": 32768.0,
"win_length": 2048, "sampling_rate": 48000,
"n_mel_channels": 128, "filter_length": 2048,
"mel_fmin": 0.0, "hop_length": 480,
"mel_fmax": null "win_length": 2048,
}, "n_mel_channels": 128,
"model": { "mel_fmin": 0.0,
"inter_channels": 192, "mel_fmax": null
"hidden_channels": 192, },
"filter_channels": 768, "model": {
"n_heads": 2, "inter_channels": 192,
"n_layers": 6, "hidden_channels": 192,
"kernel_size": 3, "filter_channels": 768,
"p_dropout": 0, "n_heads": 2,
"resblock": "1", "n_layers": 6,
"resblock_kernel_sizes": [3,7,11], "kernel_size": 3,
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], "p_dropout": 0,
"upsample_rates": [10,6,2,2,2], "resblock": "1",
"upsample_initial_channel": 512, "resblock_kernel_sizes": [
"upsample_kernel_sizes": [16,16,4,4,4], 3,
"use_spectral_norm": false, 7,
"gin_channels": 256, 11
"spk_embed_dim": 109 ],
} "resblock_dilation_sizes": [
} [
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates": [
10,
6,
2,
2,
2
],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
16,
16,
4,
4,
4
],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}

View File

@@ -1,46 +1,79 @@
{ {
"train": { "train": {
"log_interval": 200, "log_interval": 200,
"seed": 1234, "seed": 1234,
"epochs": 20000, "epochs": 20000,
"learning_rate": 1e-4, "learning_rate": 0.0001,
"betas": [0.8, 0.99], "betas": [
"eps": 1e-9, 0.8,
"batch_size": 4, 0.99
"fp16_run": true, ],
"lr_decay": 0.999875, "eps": 1e-09,
"segment_size": 12800, "batch_size": 4,
"init_lr_ratio": 1, "fp16_run": false,
"warmup_epochs": 0, "lr_decay": 0.999875,
"c_mel": 45, "segment_size": 12800,
"c_kl": 1.0 "init_lr_ratio": 1,
}, "warmup_epochs": 0,
"data": { "c_mel": 45,
"max_wav_value": 32768.0, "c_kl": 1.0
"sampling_rate": 32000, },
"filter_length": 1024, "data": {
"hop_length": 320, "max_wav_value": 32768.0,
"win_length": 1024, "sampling_rate": 32000,
"n_mel_channels": 80, "filter_length": 1024,
"mel_fmin": 0.0, "hop_length": 320,
"mel_fmax": null "win_length": 1024,
}, "n_mel_channels": 80,
"model": { "mel_fmin": 0.0,
"inter_channels": 192, "mel_fmax": null
"hidden_channels": 192, },
"filter_channels": 768, "model": {
"n_heads": 2, "inter_channels": 192,
"n_layers": 6, "hidden_channels": 192,
"kernel_size": 3, "filter_channels": 768,
"p_dropout": 0, "n_heads": 2,
"resblock": "1", "n_layers": 6,
"resblock_kernel_sizes": [3,7,11], "kernel_size": 3,
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], "p_dropout": 0,
"upsample_rates": [10,8,2,2], "resblock": "1",
"upsample_initial_channel": 512, "resblock_kernel_sizes": [
"upsample_kernel_sizes": [20,16,4,4], 3,
"use_spectral_norm": false, 7,
"gin_channels": 256, 11
"spk_embed_dim": 109 ],
} "resblock_dilation_sizes": [
} [
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates": [
10,
8,
2,
2
],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
20,
16,
4,
4
],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}

View File

@@ -1,46 +1,79 @@
{ {
"train": { "train": {
"log_interval": 200, "log_interval": 200,
"seed": 1234, "seed": 1234,
"epochs": 20000, "epochs": 20000,
"learning_rate": 1e-4, "learning_rate": 0.0001,
"betas": [0.8, 0.99], "betas": [
"eps": 1e-9, 0.8,
"batch_size": 4, 0.99
"fp16_run": true, ],
"lr_decay": 0.999875, "eps": 1e-09,
"segment_size": 17280, "batch_size": 4,
"init_lr_ratio": 1, "fp16_run": false,
"warmup_epochs": 0, "lr_decay": 0.999875,
"c_mel": 45, "segment_size": 17280,
"c_kl": 1.0 "init_lr_ratio": 1,
}, "warmup_epochs": 0,
"data": { "c_mel": 45,
"max_wav_value": 32768.0, "c_kl": 1.0
"sampling_rate": 48000, },
"filter_length": 2048, "data": {
"hop_length": 480, "max_wav_value": 32768.0,
"win_length": 2048, "sampling_rate": 48000,
"n_mel_channels": 128, "filter_length": 2048,
"mel_fmin": 0.0, "hop_length": 480,
"mel_fmax": null "win_length": 2048,
}, "n_mel_channels": 128,
"model": { "mel_fmin": 0.0,
"inter_channels": 192, "mel_fmax": null
"hidden_channels": 192, },
"filter_channels": 768, "model": {
"n_heads": 2, "inter_channels": 192,
"n_layers": 6, "hidden_channels": 192,
"kernel_size": 3, "filter_channels": 768,
"p_dropout": 0, "n_heads": 2,
"resblock": "1", "n_layers": 6,
"resblock_kernel_sizes": [3,7,11], "kernel_size": 3,
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], "p_dropout": 0,
"upsample_rates": [12,10,2,2], "resblock": "1",
"upsample_initial_channel": 512, "resblock_kernel_sizes": [
"upsample_kernel_sizes": [24,20,4,4], 3,
"use_spectral_norm": false, 7,
"gin_channels": 256, 11
"spk_embed_dim": 109 ],
} "resblock_dilation_sizes": [
} [
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates": [
12,
10,
2,
2
],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
24,
20,
4,
4
],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}

View File

@@ -8,7 +8,7 @@ from tqdm import tqdm
def load_inputs(path, device, is_half=False): def load_inputs(path, device, is_half=False):
parm = torch.load(path, map_location=torch.device("cpu")) parm = torch.load(path, map_location=torch.device("cpu"), weights_only=False)
for key in parm.keys(): for key in parm.keys():
parm[key] = parm[key].to(device) parm[key] = parm[key].to(device)
if is_half and parm[key].dtype == torch.float32: if is_half and parm[key].dtype == torch.float32:

View File

@@ -5,7 +5,7 @@ def get_rmvpe(model_path="assets/rmvpe/rmvpe.pt", device=torch.device("cpu")):
from infer.lib.rmvpe import E2E from infer.lib.rmvpe import E2E
model = E2E(4, 1, (2, 2)) model = E2E(4, 1, (2, 2))
ckpt = torch.load(model_path, map_location=device) ckpt = torch.load(model_path, map_location=device, weights_only=False)
model.load_state_dict(ckpt) model.load_state_dict(ckpt)
model.eval() model.eval()
model = model.to(device) model = model.to(device)

View File

@@ -9,7 +9,7 @@ def get_synthesizer(pth_path, device=torch.device("cpu")):
SynthesizerTrnMs768NSFsid_nono, SynthesizerTrnMs768NSFsid_nono,
) )
cpt = torch.load(pth_path, map_location=torch.device("cpu")) cpt = torch.load(pth_path, map_location=torch.device("cpu"), weights_only=False)
# tgt_sr = cpt["config"][-1] # tgt_sr = cpt["config"][-1]
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
if_f0 = cpt.get("f0", 1) if_f0 = cpt.get("f0", 1)

File diff suppressed because it is too large Load Diff

View File

@@ -39,7 +39,7 @@ class AudioPreprocess:
else CascadedNet( else CascadedNet(
self.mp.param["bins"] * 2, 64 if "DeReverb" in model_path else 48 self.mp.param["bins"] * 2, 64 if "DeReverb" in model_path else 48
) )
.load_state_dict(torch.load(model_path, map_location="cpu")) .load_state_dict(torch.load(model_path, map_location="cpu", weights_only=False))
.eval() .eval()
) )
if self.config.is_half: if self.config.is_half:

View File

@@ -120,10 +120,10 @@ class VC:
raise FileNotFoundError("hubert_path not found.") raise FileNotFoundError("hubert_path not found.")
if hasattr(input_audio_path, "name"): if hasattr(input_audio_path, "name"):
input_audio_path = input_audio_path.name input_audio_path = str(input_audio_path)
elif not isinstance(input_audio_path, str): elif not isinstance(input_audio_path, str):
raise RuntimeError(f"pathlib.Path or str expected for input_audio_path. Got {type(input_audio_path)}") raise RuntimeError(f"pathlib.Path or str expected for input_audio_path. Got {type(input_audio_path)}")
if not os.path.exists(input_audio_path): if not os.path.exists(input_audio_path):
raise FileNotFoundError("input_audio_path not found.") raise FileNotFoundError("input_audio_path not found.")

View File

@@ -1,4 +1,5 @@
import os import os
import torch
from fairseq import checkpoint_utils from fairseq import checkpoint_utils
@@ -20,10 +21,23 @@ def get_index_path_from_model(sid):
def load_hubert(config, hubert_path: str): def load_hubert(config, hubert_path: str):
models, _, _ = checkpoint_utils.load_model_ensemble_and_task( # PyTorch 2.6+ changed weights_only default to True, which breaks fairseq checkpoints
[hubert_path], # Monkey-patch torch.load to use weights_only=False for fairseq
suffix="", original_torch_load = torch.load
)
def patched_torch_load(f, map_location=None, *args, **kwargs):
kwargs.setdefault('weights_only', False)
return original_torch_load(f, map_location=map_location, *args, **kwargs)
torch.load = patched_torch_load
try:
models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
[hubert_path],
suffix="",
)
finally:
torch.load = original_torch_load
hubert_model = models[0] hubert_model = models[0]
hubert_model = hubert_model.to(config.device) hubert_model = hubert_model.to(config.device)
hubert_model = hubert_model.half() if config.is_half else hubert_model.float() hubert_model = hubert_model.half() if config.is_half else hubert_model.float()

View File

@@ -1,4 +1,6 @@
import json import json
import logging
import tempfile
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
@@ -11,6 +13,9 @@ from base64 import b64encode
from rvc.modules.vc.modules import VC from rvc.modules.vc.modules import VC
import glob import glob
import os import os
import soundfile as sf
logger = logging.getLogger(__name__)
router = APIRouter() router = APIRouter()
from dotenv import load_dotenv from dotenv import load_dotenv
@@ -74,3 +79,114 @@ def inference(
"audio": b64encode(wv.read()).decode("utf-8"), "audio": b64encode(wv.read()).decode("utf-8"),
} }
) )
def _get_tts_model():
    """Lazily load and memoise the Qwen3-TTS model.

    Loading the checkpoint takes seconds and significant (V)RAM, so it must
    not be repeated on every request; the instance is cached on the function
    after the first call.
    """
    model = getattr(_get_tts_model, "_model", None)
    if model is None:
        from qwen_tts import Qwen3TTSModel
        import torch

        use_cuda = torch.cuda.is_available()
        model = Qwen3TTSModel.from_pretrained(
            "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
            device_map="cuda:0" if use_cuda else "cpu",
            dtype=torch.bfloat16 if use_cuda else torch.float32,
            # NOTE(review): assumes flash-attn is installed whenever CUDA is
            # available — confirm, otherwise fall back to default attention.
            attn_implementation="flash_attention_2" if use_cuda else None,
        )
        _get_tts_model._model = model
    return model


@router.post("/tts-inference")
def tts_inference(
    text: str = Body(..., description="The text to synthesize"),
    language: str = Body(
        "Chinese",
        description="Language code",
        enum=[
            "Chinese",
            "English",
            "Japanese",
            "Korean",
            "German",
            "French",
            "Russian",
            "Portuguese",
            "Spanish",
            "Italian",
        ],
    ),
    speaker: str = Body("Vivian", description="Speaker/voice profile name"),
    instruct: str = Body("", description="Natural language instruction for controlling timbre, emotion, and prosody"),
    modelpath: Path
    | UploadFile = Body(
        ...,
        enum=[
            os.path.basename(file)
            for file in glob.glob(f"{os.getenv('weight_root')}/*")
        ],
    ),
    res_type: str = Query("blob", enum=["blob", "json"]),
    sid: int = 0,
    f0_up_key: int = 0,
    f0_method: str = Query(
        "rmvpe", enum=["pm", "harvest", "dio", "rmvpe", "rmvpe_gpu"]
    ),
    f0_file: Path | None = None,
    index_file: Path | None = None,
    index_rate: float = 0.75,
    filter_radius: int = 3,
    resample_sr: int = 0,
    rms_mix_rate: float = 0.25,
    protect: float = 0.33,
):
    """
    Synthesize ``text`` with Qwen3-TTS, then convert the result to the target
    voice with RVC.

    The TTS model is loaded once and cached (see ``_get_tts_model``); the RVC
    parameters and the blob/json response shapes mirror the plain /inference
    route so clients can switch between the two endpoints.

    Returns:
        A streamed WAV attachment when ``res_type == "blob"``, otherwise a
        JSON object with the timing info and the base64-encoded WAV audio.
    """
    # Text -> speech. First call loads the model; subsequent calls reuse it.
    wavs, sr = _get_tts_model().generate_custom_voice(
        text=text,
        language=language,
        speaker=speaker,
        instruct=instruct,
    )

    # RVC's inference path expects an audio file on disk, so stage the TTS
    # output in a temporary WAV file. Writing happens inside the try so the
    # file is removed even if soundfile fails mid-write.
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp_path = tmp.name
    try:
        sf.write(tmp_path, wavs[0], sr)
        tmp.close()

        # Voice-convert the generated speech to the requested model's voice.
        vc = VC()
        vc.get_vc(modelpath)
        tgt_sr, audio_opt, times, _ = vc.vc_inference(
            sid,
            tmp_path,
            f0_up_key,
            f0_method,
            f0_file,
            index_file,
            index_rate,
            filter_radius,
            resample_sr,
            rms_mix_rate,
            protect,
        )

        # scipy's wavfile.write seeks file-like objects back to the start, so
        # ``wv`` is positioned for streaming / reading below.
        wavfile.write(wv := BytesIO(), tgt_sr, audio_opt)
        logger.info("tts-inference timings: %s", times)
        if res_type == "blob":
            return responses.StreamingResponse(
                wv,
                media_type="audio/wav",
                headers={"Content-Disposition": "attachment; filename=tts_inference.wav"},
            )
        return JSONResponse(
            {
                # Round-trip through json to guarantee serializability of the
                # timing structure (same pattern as the /inference route).
                "time": json.loads(json.dumps(times)),
                "audio": b64encode(wv.read()).decode("utf-8"),
            }
        )
    finally:
        # Clean up the temporary TTS file in every path.
        tmp.close()
        os.unlink(tmp_path)