add inference cli

2024-01-20 22:48:15 +09:00
parent 95d989827d
commit 6d759b4b96
13 changed files with 276 additions and 4 deletions
@@ -19,3 +19,29 @@ An easy-to-use Voice Conversion framework based on VITS.<br><br>

 > [!NOTE]
 > Currently under development... Provided as a library and API in rvc
+
+## Installation and usage
+
+### CLI Usage
+
+#### Inference Audio
+
+```sh
+rvc infer -m {model.pth} -i {input.wav} -o {output.wav}
+```
+
+| option        | type         | default value | description                                                                                                                                                                                                                                    | require |
+|---------------|--------------|---------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------|
+| modelPath     | Path         |               | Model path or filename (reads in the directory set in env)                                                                                                                                                                                     | *       |
+| inputPath     | Path         |               | Input audio path or folder                                                                                                                                                                                                                     | *       |
+| outputPath    | Path         |               | Output audio path or folder                                                                                                                                                                                                                    | *       |
+| sid           | int          | 0             | Speaker/Singer ID                                                                                                                                                                                                                              |         |
+| f0_up_key     | int          | 0             | Transpose (integer, number of semitones, raise by an octave: 12, lower by an octave: -12)                                                                                                                                                      |         |
+| f0_method     | str          | rmvpe         | pitch extraction algorithm (pm, harvest, crepe, rmvpe                                                                                                                                                                                          |         |
+| f0_file       | Path \| None | None          | F0 curve file (optional). One pitch per line. Replaces the default F0 and pitch modulation                                                                                                                                                     |         |
+| index_file    | Path \| None | None          | Path to the feature index file                                                                                                                                                                                                                 |         |
+| index_rate    | float        | 0.75          | Search feature ratio (controls accent strength, too high has artifacting)                                                                                                                                                                      |         |
+| filter_radius | int          | 3             | If >=3: apply median filtering to the harvested pitch results. The value represents the filter radius and can reduce breathiness                                                                                                               |         |
+| resample_sr   | int          | 0             | Resample the output audio in post-processing to the final sample rate. Set to 0 for no resampling                                                                                                                                              |         |
+| rms_mix_rate  | float        | 0.25          | Adjust the volume envelope scaling. Closer to 0, the more it mimics the volume of the original vocals. Can help mask noise and make volume sound more natural when set relatively low. Closer to 1 will be more of a consistently loud volume  |         |
+| protect       | float        | 0.33          | Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy                                 |         |
@@ -1257,6 +1257,26 @@ files = [
 [package.dependencies]
 numpy = ">=1.7.0"

+[[package]]
+name = "protobuf"
+version = "4.25.2"
+description = ""
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "protobuf-4.25.2-cp310-abi3-win32.whl", hash = "sha256:b50c949608682b12efb0b2717f53256f03636af5f60ac0c1d900df6213910fd6"},
+    {file = "protobuf-4.25.2-cp310-abi3-win_amd64.whl", hash = "sha256:8f62574857ee1de9f770baf04dde4165e30b15ad97ba03ceac65f760ff018ac9"},
+    {file = "protobuf-4.25.2-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:2db9f8fa64fbdcdc93767d3cf81e0f2aef176284071507e3ede160811502fd3d"},
+    {file = "protobuf-4.25.2-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:10894a2885b7175d3984f2be8d9850712c57d5e7587a2410720af8be56cdaf62"},
+    {file = "protobuf-4.25.2-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:fc381d1dd0516343f1440019cedf08a7405f791cd49eef4ae1ea06520bc1c020"},
+    {file = "protobuf-4.25.2-cp38-cp38-win32.whl", hash = "sha256:33a1aeef4b1927431d1be780e87b641e322b88d654203a9e9d93f218ee359e61"},
+    {file = "protobuf-4.25.2-cp38-cp38-win_amd64.whl", hash = "sha256:47f3de503fe7c1245f6f03bea7e8d3ec11c6c4a2ea9ef910e3221c8a15516d62"},
+    {file = "protobuf-4.25.2-cp39-cp39-win32.whl", hash = "sha256:5e5c933b4c30a988b52e0b7c02641760a5ba046edc5e43d3b94a74c9fc57c1b3"},
+    {file = "protobuf-4.25.2-cp39-cp39-win_amd64.whl", hash = "sha256:d66a769b8d687df9024f2985d5137a337f957a0916cf5464d1513eee96a63ff0"},
+    {file = "protobuf-4.25.2-py3-none-any.whl", hash = "sha256:a8b7a98d4ce823303145bf3c1a8bdb0f2f4642a414b196f04ad9853ed0c8f830"},
+    {file = "protobuf-4.25.2.tar.gz", hash = "sha256:fe599e175cb347efc8ee524bcd4b902d11f7262c0e569ececcb89995c15f0a5e"},
+]
+
 [[package]]
 name = "pycparser"
 version = "2.21"
@@ -1748,6 +1768,22 @@ files = [
 [package.extras]
 widechars = ["wcwidth"]

+[[package]]
+name = "tensorboardx"
+version = "2.6.2.2"
+description = "TensorBoardX lets you watch Tensors Flow without Tensorflow"
+optional = false
+python-versions = "*"
+files = [
+    {file = "tensorboardX-2.6.2.2-py2.py3-none-any.whl", hash = "sha256:160025acbf759ede23fd3526ae9d9bfbfd8b68eb16c38a010ebe326dc6395db8"},
+    {file = "tensorboardX-2.6.2.2.tar.gz", hash = "sha256:c6476d7cd0d529b0b72f4acadb1269f9ed8b22f441e87a84f2a3b940bb87b666"},
+]
+
+[package.dependencies]
+numpy = "*"
+packaging = "*"
+protobuf = ">=3.20"
+
 [[package]]
 name = "threadpoolctl"
 version = "3.2.0"
@@ -1899,4 +1935,4 @@ zstd = ["zstandard (>=0.18.0)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "3.11.2"
-content-hash = "e6513e4097292f4085650c7b341b531b648554dd5bcdd7441e73b5eb3ef543fd"
+content-hash = "4696062dd04776d959869b2425e72e760c3fc7eb64e7dbe3fc3f6d510ada0542"
@@ -19,6 +19,7 @@ faiss-cpu = "^1.7.4"
 python-dotenv = "^1.0.0"
 pydub = "^0.25.1"
 click = "^8.1.7"
+tensorboardx = "^2.6.2.2"

 [project.scripts]
 rvc = "rvc:cli"
@@ -0,0 +1,30 @@
+import re
+from typing import Optional, Pattern
+
+import click
+
+from rvc.cli.handler.infer import infer
+from rvc.cli.handler.train import train
+from rvc.cli.handler.uvr5 import uvr
+
+from rvc.cli.utils.dlmodel import dlmodel
+from rvc.cli.utils.env import env
+from rvc.cli.utils.initialize import initialize
+
+
+@click.group(
+    context_settings={"help_option_names": ["-h", "--help"]},
+    help="rvc cli feature list",
+)
+def cli():
+    pass
+
+
+if __name__ == "__main__":
+    cli.add_command(infer)
+    cli.add_command(train)
+    cli.add_command(uvr)
+    cli.add_command(dlmodel)
+    cli.add_command(env)
+    cli.add_command(initialize)
+    cli()
@@ -0,0 +1,131 @@
+import logging
+from pathlib import Path
+
+import click
+from dotenv import load_dotenv
+from scipy.io import wavfile
+
+from rvc.modules.vc.modules import VC
+
+
+logging.getLogger("numba").setLevel(logging.WARNING)
+
+
+@click.command(
+    context_settings={"help_option_names": ["-h", "--help"]},
+    help="inference audio",
+)
+@click.option(
+    "-m",
+    "--modelPath",
+    is_flag=False,
+    type=str,
+    help="Model path or filename (reads in the directory set in env)",
+    required=True,
+)
+@click.option(
+    "-i",
+    "--inputPath",
+    is_flag=False,
+    type=Path,
+    help="input audio path or folder",
+    required=True,
+)
+@click.option(
+    "-o",
+    "--outputPath",
+    is_flag=False,
+    type=Path,
+    help="output audio path or folder",
+    required=True,
+)
+@click.option(
+    "-s", "--sid", is_flag=False, type=int, help="Speaker/Singer id", default=0
+)
+@click.option("-fu", "--f0upkey", is_flag=False, type=int, help="Transpose", default=0)
+@click.option(
+    "-fm",
+    "--f0method",
+    is_flag=False,
+    type=str,
+    help="Pitch extraction algorith",
+    default="rmvpe",
+)
+@click.option(
+    "-ff", "--f0file", is_flag=False, type=Path, help="F0 curve file (optional)"
+)
+@click.option("-if", "--indexFile", is_flag=False, type=Path, help="Feature index file")
+@click.option(
+    "-ir",
+    "--indexRate",
+    is_flag=False,
+    type=float,
+    help="Search feature ratio",
+    default=0.75,
+)
+@click.option(
+    "-fr",
+    "--filterRadius",
+    is_flag=False,
+    type=int,
+    help="Apply median filtering",
+    default=3,
+)
+@click.option(
+    "-rsr",
+    "--resamplesr",
+    is_flag=False,
+    type=int,
+    help="Resample the output audio",
+    default=0,
+)
+@click.option(
+    "-rmr",
+    "--rmsmixrate",
+    is_flag=False,
+    type=float,
+    help="Adjust the volume envelope scaling",
+    default=0.25,
+)
+@click.option(
+    "-p",
+    "--protect",
+    is_flag=False,
+    type=float,
+    help="Protect voiceless consonants and breath sounds",
+    default=0.33,
+)
+def infer(
+    modelpath,
+    inputpath,
+    outputpath,
+    sid,
+    f0upkey,
+    f0method,
+    f0file,
+    indexfile,
+    indexrate,
+    filterradius,
+    resamplesr,
+    rmsmixrate,
+    protect,
+):
+    load_dotenv()
+    vc = VC()
+    vc.get_vc(modelpath)
+    tgt_sr, audio_opt, times, _ = vc.vc_single(
+        sid,
+        inputpath,
+        f0upkey,
+        f0method,
+        f0file,
+        indexfile,
+        indexrate,
+        filterradius,
+        resamplesr,
+        rmsmixrate,
+        protect,
+    )
+    wavfile.write(outputpath, tgt_sr, audio_opt)
+    click.echo(times)
+    click.echo(f"Finish inference. Check {outputpath}")
@@ -0,0 +1,6 @@
+import click
+
+
+# Placeholder `train` subcommand: registered with click but currently a no-op.
+# Documented with a comment on purpose: click would show a docstring as the
+# command's help text, which would change the CLI output.
+@click.command()
+def train():
+    pass
@@ -0,0 +1,6 @@
+import click
+
+
+# Placeholder `uvr` subcommand: registered with click but currently a no-op.
+# Documented with a comment on purpose: click would show a docstring as the
+# command's help text, which would change the CLI output.
+@click.command()
+def uvr():
+    pass
@@ -0,0 +1,8 @@
+import urllib
+import click
+
+
+# Placeholder `dlmodel` subcommand: currently a no-op. The inline note below
+# records the intended purpose (downloading models); nothing is downloaded yet.
+@click.command()
+def dlmodel() -> None:
+    # Download models [harvest, uvr5, and more ]
+    pass
@@ -0,0 +1,13 @@
+"""
+Set up or clean up the environment file
+usage: rvc env [set / cleanup]
+Default: [nowDir/.env]
+
+"""
+
+import click
+
+
+# Placeholder `env` subcommand: registered with click but currently a no-op.
+# Documented with a comment on purpose: click would show a docstring as the
+# command's help text, which would change the CLI output.
+@click.command()
+def env():
+    pass
@@ -0,0 +1,14 @@
+"""
+Usage: rvc init
+Download models and set up the environment file
+
+"""
+import click
+
+
+import click
+
+
+# Placeholder `initialize` subcommand: registered with click but currently a
+# no-op. Documented with a comment on purpose: click would show a docstring
+# as the command's help text, which would change the CLI output.
+@click.command()
+def initialize():
+    pass
@@ -82,7 +82,8 @@ class Config:
            action="store_true",
            help="torch_dml",
        )
-        cmd_opts: argparse.Namespace = parser.parse_args()
+        cmd_opts: argparse.Namespace
+        cmd_opts, _ = parser.parse_known_args()

        cmd_opts.port = cmd_opts.port if 0 <= cmd_opts.port <= 65535 else 7865

@@ -87,7 +87,7 @@ class UVR:
                export_format,
                is_hp3=is_hp3,
            )
-            infos.append(f"{os.path.basename(process_path)}->Success" )
+            infos.append(f"{os.path.basename(process_path)}->Success")
            yield "\n".join(infos)
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
@@ -45,7 +45,7 @@ class VC:
            to_return_protect[1] if self.if_f0 != 0 and to_return_protect else 0.33,
        ]

-        person = f'{os.getenv("weight_root")}/{sid}'
+        person = sid if os.path.exists(sid) else f'{os.getenv("weight_root")}/{sid}'
        logger.info(f"Loading: {person}")

        self.cpt = torch.load(person, map_location="cpu")