add endpoint for Qwen3-TTS -> RVC

This commit is contained in:
2026-03-01 15:32:48 -08:00
parent 7b284a6346
commit 1f9832ac0d
15 changed files with 1199 additions and 3337 deletions

0
assets-download.sh Normal file → Executable file
View File

2437
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -1,46 +1,81 @@
{
"train": {
"log_interval": 200,
"seed": 1234,
"epochs": 20000,
"learning_rate": 1e-4,
"betas": [0.8, 0.99],
"eps": 1e-9,
"batch_size": 4,
"fp16_run": true,
"lr_decay": 0.999875,
"segment_size": 12800,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0
},
"data": {
"max_wav_value": 32768.0,
"sampling_rate": 32000,
"filter_length": 1024,
"hop_length": 320,
"win_length": 1024,
"n_mel_channels": 80,
"mel_fmin": 0.0,
"mel_fmax": null
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0,
"resblock": "1",
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"upsample_rates": [10,4,2,2,2],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [16,16,4,4,4],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}
"train": {
"log_interval": 200,
"seed": 1234,
"epochs": 20000,
"learning_rate": 0.0001,
"betas": [
0.8,
0.99
],
"eps": 1e-09,
"batch_size": 4,
"fp16_run": false,
"lr_decay": 0.999875,
"segment_size": 12800,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0
},
"data": {
"max_wav_value": 32768.0,
"sampling_rate": 32000,
"filter_length": 1024,
"hop_length": 320,
"win_length": 1024,
"n_mel_channels": 80,
"mel_fmin": 0.0,
"mel_fmax": null
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0,
"resblock": "1",
"resblock_kernel_sizes": [
3,
7,
11
],
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates": [
10,
4,
2,
2,
2
],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
16,
16,
4,
4,
4
],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}

View File

@@ -1,46 +1,79 @@
{
"train": {
"log_interval": 200,
"seed": 1234,
"epochs": 20000,
"learning_rate": 1e-4,
"betas": [0.8, 0.99],
"eps": 1e-9,
"batch_size": 4,
"fp16_run": true,
"lr_decay": 0.999875,
"segment_size": 12800,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0
},
"data": {
"max_wav_value": 32768.0,
"sampling_rate": 40000,
"filter_length": 2048,
"hop_length": 400,
"win_length": 2048,
"n_mel_channels": 125,
"mel_fmin": 0.0,
"mel_fmax": null
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0,
"resblock": "1",
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"upsample_rates": [10,10,2,2],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [16,16,4,4],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}
"train": {
"log_interval": 200,
"seed": 1234,
"epochs": 20000,
"learning_rate": 0.0001,
"betas": [
0.8,
0.99
],
"eps": 1e-09,
"batch_size": 4,
"fp16_run": false,
"lr_decay": 0.999875,
"segment_size": 12800,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0
},
"data": {
"max_wav_value": 32768.0,
"sampling_rate": 40000,
"filter_length": 2048,
"hop_length": 400,
"win_length": 2048,
"n_mel_channels": 125,
"mel_fmin": 0.0,
"mel_fmax": null
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0,
"resblock": "1",
"resblock_kernel_sizes": [
3,
7,
11
],
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates": [
10,
10,
2,
2
],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
16,
16,
4,
4
],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}

View File

@@ -1,46 +1,81 @@
{
"train": {
"log_interval": 200,
"seed": 1234,
"epochs": 20000,
"learning_rate": 1e-4,
"betas": [0.8, 0.99],
"eps": 1e-9,
"batch_size": 4,
"fp16_run": true,
"lr_decay": 0.999875,
"segment_size": 11520,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0
},
"data": {
"max_wav_value": 32768.0,
"sampling_rate": 48000,
"filter_length": 2048,
"hop_length": 480,
"win_length": 2048,
"n_mel_channels": 128,
"mel_fmin": 0.0,
"mel_fmax": null
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0,
"resblock": "1",
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"upsample_rates": [10,6,2,2,2],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [16,16,4,4,4],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}
"train": {
"log_interval": 200,
"seed": 1234,
"epochs": 20000,
"learning_rate": 0.0001,
"betas": [
0.8,
0.99
],
"eps": 1e-09,
"batch_size": 4,
"fp16_run": false,
"lr_decay": 0.999875,
"segment_size": 11520,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0
},
"data": {
"max_wav_value": 32768.0,
"sampling_rate": 48000,
"filter_length": 2048,
"hop_length": 480,
"win_length": 2048,
"n_mel_channels": 128,
"mel_fmin": 0.0,
"mel_fmax": null
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0,
"resblock": "1",
"resblock_kernel_sizes": [
3,
7,
11
],
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates": [
10,
6,
2,
2,
2
],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
16,
16,
4,
4,
4
],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}

View File

@@ -1,46 +1,79 @@
{
"train": {
"log_interval": 200,
"seed": 1234,
"epochs": 20000,
"learning_rate": 1e-4,
"betas": [0.8, 0.99],
"eps": 1e-9,
"batch_size": 4,
"fp16_run": true,
"lr_decay": 0.999875,
"segment_size": 12800,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0
},
"data": {
"max_wav_value": 32768.0,
"sampling_rate": 32000,
"filter_length": 1024,
"hop_length": 320,
"win_length": 1024,
"n_mel_channels": 80,
"mel_fmin": 0.0,
"mel_fmax": null
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0,
"resblock": "1",
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"upsample_rates": [10,8,2,2],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [20,16,4,4],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}
"train": {
"log_interval": 200,
"seed": 1234,
"epochs": 20000,
"learning_rate": 0.0001,
"betas": [
0.8,
0.99
],
"eps": 1e-09,
"batch_size": 4,
"fp16_run": false,
"lr_decay": 0.999875,
"segment_size": 12800,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0
},
"data": {
"max_wav_value": 32768.0,
"sampling_rate": 32000,
"filter_length": 1024,
"hop_length": 320,
"win_length": 1024,
"n_mel_channels": 80,
"mel_fmin": 0.0,
"mel_fmax": null
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0,
"resblock": "1",
"resblock_kernel_sizes": [
3,
7,
11
],
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates": [
10,
8,
2,
2
],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
20,
16,
4,
4
],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}

View File

@@ -1,46 +1,79 @@
{
"train": {
"log_interval": 200,
"seed": 1234,
"epochs": 20000,
"learning_rate": 1e-4,
"betas": [0.8, 0.99],
"eps": 1e-9,
"batch_size": 4,
"fp16_run": true,
"lr_decay": 0.999875,
"segment_size": 17280,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0
},
"data": {
"max_wav_value": 32768.0,
"sampling_rate": 48000,
"filter_length": 2048,
"hop_length": 480,
"win_length": 2048,
"n_mel_channels": 128,
"mel_fmin": 0.0,
"mel_fmax": null
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0,
"resblock": "1",
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"upsample_rates": [12,10,2,2],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [24,20,4,4],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}
"train": {
"log_interval": 200,
"seed": 1234,
"epochs": 20000,
"learning_rate": 0.0001,
"betas": [
0.8,
0.99
],
"eps": 1e-09,
"batch_size": 4,
"fp16_run": false,
"lr_decay": 0.999875,
"segment_size": 17280,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0
},
"data": {
"max_wav_value": 32768.0,
"sampling_rate": 48000,
"filter_length": 2048,
"hop_length": 480,
"win_length": 2048,
"n_mel_channels": 128,
"mel_fmin": 0.0,
"mel_fmax": null
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0,
"resblock": "1",
"resblock_kernel_sizes": [
3,
7,
11
],
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates": [
12,
10,
2,
2
],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
24,
20,
4,
4
],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}

View File

@@ -8,7 +8,7 @@ from tqdm import tqdm
def load_inputs(path, device, is_half=False):
parm = torch.load(path, map_location=torch.device("cpu"))
parm = torch.load(path, map_location=torch.device("cpu"), weights_only=False)
for key in parm.keys():
parm[key] = parm[key].to(device)
if is_half and parm[key].dtype == torch.float32:

View File

@@ -5,7 +5,7 @@ def get_rmvpe(model_path="assets/rmvpe/rmvpe.pt", device=torch.device("cpu")):
from infer.lib.rmvpe import E2E
model = E2E(4, 1, (2, 2))
ckpt = torch.load(model_path, map_location=device)
ckpt = torch.load(model_path, map_location=device, weights_only=False)
model.load_state_dict(ckpt)
model.eval()
model = model.to(device)

View File

@@ -9,7 +9,7 @@ def get_synthesizer(pth_path, device=torch.device("cpu")):
SynthesizerTrnMs768NSFsid_nono,
)
cpt = torch.load(pth_path, map_location=torch.device("cpu"))
cpt = torch.load(pth_path, map_location=torch.device("cpu"), weights_only=False)
# tgt_sr = cpt["config"][-1]
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
if_f0 = cpt.get("f0", 1)

File diff suppressed because it is too large Load Diff

View File

@@ -39,7 +39,7 @@ class AudioPreprocess:
else CascadedNet(
self.mp.param["bins"] * 2, 64 if "DeReverb" in model_path else 48
)
.load_state_dict(torch.load(model_path, map_location="cpu"))
.load_state_dict(torch.load(model_path, map_location="cpu", weights_only=False))
.eval()
)
if self.config.is_half:

View File

@@ -120,10 +120,10 @@ class VC:
raise FileNotFoundError("hubert_path not found.")
if hasattr(input_audio_path, "name"):
input_audio_path = input_audio_path.name
input_audio_path = str(input_audio_path)
elif not isinstance(input_audio_path, str):
raise RuntimeError(f"pathlib.Path or str expected for input_audio_path. Got {type(input_audio_path)}")
if not os.path.exists(input_audio_path):
raise FileNotFoundError("input_audio_path not found.")

View File

@@ -1,4 +1,5 @@
import os
import torch
from fairseq import checkpoint_utils
@@ -20,10 +21,23 @@ def get_index_path_from_model(sid):
def load_hubert(config, hubert_path: str):
models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
[hubert_path],
suffix="",
)
# PyTorch 2.6+ changed weights_only default to True, which breaks fairseq checkpoints
# Monkey-patch torch.load to use weights_only=False for fairseq
original_torch_load = torch.load
def patched_torch_load(f, map_location=None, *args, **kwargs):
kwargs.setdefault('weights_only', False)
return original_torch_load(f, map_location=map_location, *args, **kwargs)
torch.load = patched_torch_load
try:
models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
[hubert_path],
suffix="",
)
finally:
torch.load = original_torch_load
hubert_model = models[0]
hubert_model = hubert_model.to(config.device)
hubert_model = hubert_model.half() if config.is_half else hubert_model.float()

View File

@@ -1,4 +1,6 @@
import json
import logging
import tempfile
from io import BytesIO
from pathlib import Path
@@ -11,6 +13,9 @@ from base64 import b64encode
from rvc.modules.vc.modules import VC
import glob
import os
import soundfile as sf
logger = logging.getLogger(__name__)
router = APIRouter()
from dotenv import load_dotenv
@@ -74,3 +79,114 @@ def inference(
"audio": b64encode(wv.read()).decode("utf-8"),
}
)
def _get_tts_model():
    """Lazily load and cache the Qwen3-TTS model.

    Loading the model is expensive (weight load + device transfer), so it is
    done once per process and memoized on the function object instead of being
    re-loaded on every request, which the previous version did.
    """
    model = getattr(_get_tts_model, "_cached_model", None)
    if model is None:
        from qwen_tts import Qwen3TTSModel
        import torch

        use_cuda = torch.cuda.is_available()
        model = Qwen3TTSModel.from_pretrained(
            "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
            device_map="cuda:0" if use_cuda else "cpu",
            dtype=torch.bfloat16 if use_cuda else torch.float32,
            attn_implementation="flash_attention_2" if use_cuda else None,
        )
        _get_tts_model._cached_model = model
    return model


@router.post("/tts-inference")
def tts_inference(
    text: str = Body(..., description="The text to synthesize"),
    language: str = Body(
        "Chinese",
        description="Language code",
        enum=[
            "Chinese",
            "English",
            "Japanese",
            "Korean",
            "German",
            "French",
            "Russian",
            "Portuguese",
            "Spanish",
            "Italian",
        ],
    ),
    speaker: str = Body("Vivian", description="Speaker/voice profile name"),
    instruct: str = Body("", description="Natural language instruction for controlling timbre, emotion, and prosody"),
    modelpath: Path
    | UploadFile = Body(
        ...,
        enum=[
            os.path.basename(file)
            for file in glob.glob(f"{os.getenv('weight_root')}/*")
        ],
    ),
    res_type: str = Query("blob", enum=["blob", "json"]),
    sid: int = 0,
    f0_up_key: int = 0,
    f0_method: str = Query(
        "rmvpe", enum=["pm", "harvest", "dio", "rmvpe", "rmvpe_gpu"]
    ),
    f0_file: Path | None = None,
    index_file: Path | None = None,
    index_rate: float = 0.75,
    filter_radius: int = 3,
    resample_sr: int = 0,
    rms_mix_rate: float = 0.25,
    protect: float = 0.33,
):
    """
    Perform TTS using Qwen3-TTS followed by voice conversion inference.

    First generates speech from text using Qwen3-TTS, then applies RVC voice
    conversion to transform the generated speech to the target voice.

    Returns a streaming WAV attachment when ``res_type == "blob"``, otherwise
    a JSON payload with the timing info and base64-encoded WAV audio.
    """
    tts_model = _get_tts_model()

    # Generate TTS audio. generate_custom_voice returns a batch of waveforms
    # plus their sample rate; only a single utterance is synthesized here,
    # hence wavs[0] below.
    wavs, sr = tts_model.generate_custom_voice(
        text=text,
        language=language,
        speaker=speaker,
        instruct=instruct,
    )

    # vc_inference expects a file path, so stage the TTS output in a temp
    # file. delete=False because the closed handle must not remove the file
    # (and a second open of an undeleted NamedTemporaryFile fails on
    # Windows); we unlink it ourselves in the finally block. The write
    # happens inside the try so a failed write cannot leak the file.
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp_path = tmp.name
    tmp.close()
    try:
        sf.write(tmp_path, wavs[0], sr)

        # Run voice conversion on the generated audio.
        vc = VC()
        vc.get_vc(modelpath)
        tgt_sr, audio_opt, times, _ = vc.vc_inference(
            sid,
            tmp_path,
            f0_up_key,
            f0_method,
            f0_file,
            index_file,
            index_rate,
            filter_radius,
            resample_sr,
            rms_mix_rate,
            protect,
        )
        wavfile.write(wv := BytesIO(), tgt_sr, audio_opt)
        # Rewind: wavfile.write leaves the cursor at end-of-buffer, so
        # without this seek both response variants would emit an empty body.
        wv.seek(0)
        logger.info("tts_inference timings: %s", times)
        if res_type == "blob":
            return responses.StreamingResponse(
                wv,
                media_type="audio/wav",
                headers={"Content-Disposition": "attachment; filename=tts_inference.wav"},
            )
        else:
            return JSONResponse(
                {
                    # round-trip through json to coerce any non-JSON-native
                    # values (e.g. tuples) into plain JSON types
                    "time": json.loads(json.dumps(times)),
                    "audio": b64encode(wv.read()).decode("utf-8"),
                }
            )
    finally:
        # Clean up the temporary TTS file.
        os.unlink(tmp_path)