New Features:

- Add Crepe Full/Tiny (ONNX)
- Remove test connection for local (non-HTTPS) launch
Refactor:
- RVC: comment out module importer
w-okada 2023-07-07 02:17:29 +09:00
parent 099f82cc60
commit d5561c2212
32 changed files with 2068 additions and 117 deletions

View File

@@ -21,7 +21,7 @@
{
"name": "configArea",
"options": {
"detectors": ["dio", "harvest", "crepe"],
"detectors": ["dio", "harvest", "crepe", "crepe_full", "crepe_tiny"],
"inputChunkNums": [8, 16, 24, 32, 40, 48, 64, 80, 96, 112, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 2048]
}
}

View File

@@ -1 +1,10 @@
<!doctype html><html style="width:100%;height:100%;overflow:hidden"><head><meta charset="utf-8"/><title>Voice Changer Client Demo</title><script defer="defer" src="index.js"></script></head><body style="width:100%;height:100%;margin:0"><div id="app" style="width:100%;height:100%"></div></body></html>
<!DOCTYPE html>
<html style="width: 100%; height: 100%; overflow: hidden">
<head>
<meta charset="utf-8" />
<title>Voice Changer Client Demo</title>
<script defer src="index.js"></script></head>
<body style="width: 100%; height: 100%; margin: 0px">
<div id="app" style="width: 100%; height: 100%"></div>
</body>
</html>

File diff suppressed because one or more lines are too long

View File

@@ -1,31 +0,0 @@
/*! regenerator-runtime -- Copyright (c) 2014-present, Facebook, Inc. -- license (MIT): https://github.com/facebook/regenerator/blob/main/LICENSE */
/**
* @license React
* react-dom.production.min.js
*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
/**
* @license React
* react.production.min.js
*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
/**
* @license React
* scheduler.production.min.js
*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/

View File

@@ -21,7 +21,7 @@
{
"name": "configArea",
"options": {
"detectors": ["dio", "harvest", "crepe"],
"detectors": ["dio", "harvest", "crepe", "crepe_full", "crepe_tiny"],
"inputChunkNums": [8, 16, 24, 32, 40, 48, 64, 80, 96, 112, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 2048]
}
}

View File

@@ -42,8 +42,9 @@ export type CrossFadeOverlapSize = typeof CrossFadeOverlapSize[keyof typeof Cros
export const F0Detector = {
"dio": "dio",
"harvest": "harvest",
// "parselmouth": "parselmouth",
"crepe": "crepe",
"crepe_full": "crepe_full",
"crepe_tiny": "crepe_tiny",
} as const
export type F0Detector = typeof F0Detector[keyof typeof F0Detector]

View File

@@ -34,6 +34,7 @@ def setupArgParser():
parser.add_argument("--logLevel", type=str, default="error", help="Log level info|critical|error. (default: error)")
parser.add_argument("-p", type=int, default=18888, help="port")
parser.add_argument("--https", type=strtobool, default=False, help="use https")
parser.add_argument("--test_connect", type=str, default="8.8.8.8", help="test connect to detect ip in https mode. default 8.8.8.8")
parser.add_argument("--httpsKey", type=str, default="ssl.key", help="path for the key of https")
parser.add_argument("--httpsCert", type=str, default="ssl.cert", help="path for the cert of https")
parser.add_argument("--httpsSelfSigned", type=strtobool, default=True, help="generate self-signed certificate")
@@ -48,6 +49,8 @@ def setupArgParser():
parser.add_argument("--hubert_base_jp", type=str, help="path to hubert_base_jp model(pytorch)")
parser.add_argument("--hubert_soft", type=str, help="path to hubert_soft model(pytorch)")
parser.add_argument("--nsf_hifigan", type=str, help="path to nsf_hifigan model(pytorch)")
parser.add_argument("--crepe_onnx_full", type=str, help="path to crepe_onnx_full")
parser.add_argument("--crepe_onnx_tiny", type=str, help="path to crepe_onnx_tiny")
return parser
@@ -85,6 +88,9 @@ voiceChangerParams = VoiceChangerParams(
hubert_base_jp=args.hubert_base_jp,
hubert_soft=args.hubert_soft,
nsf_hifigan=args.nsf_hifigan,
crepe_onnx_full=args.crepe_onnx_full,
crepe_onnx_tiny=args.crepe_onnx_tiny,
sample_mode=args.sample_mode,
)
@@ -120,6 +126,7 @@ if __name__ == "__mp_main__":
if __name__ == "__main__":
mp.freeze_support()
printMessage(f"PYTHON:{sys.version}", level=2)
printMessage("Voice Changerを起動しています。", level=2)
# ダウンロード(Weight)
try:
@@ -195,10 +202,10 @@ if __name__ == "__main__":
else:
printMessage(f"http://localhost:{EX_PORT}/", level=1)
else: # launched directly with python
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
s.connect(("8.8.8.8", 80))
hostname = s.getsockname()[0]
if args.https == 1:
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
s.connect((args.test_connect, 80))
hostname = s.getsockname()[0]
printMessage(f"https://localhost:{PORT}/", level=1)
printMessage(f"https://{hostname}:{PORT}/", level=1)
else:
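
The hunk above makes two changes: the LAN-IP probe now runs only in HTTPS mode, and the probe host is configurable via --test_connect. A minimal standalone sketch of the trick, using the 8.8.8.8 default from the diff:

import socket

# "Connecting" a UDP socket sends no packets; it only selects a route,
# so getsockname() reveals the local IP the OS would use to reach the host.
test_connect = "8.8.8.8"  # overridable with --test_connect
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
s.connect((test_connect, 80))
hostname = s.getsockname()[0]
s.close()
print(hostname)  # e.g. 192.168.x.x on a typical LAN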

View File

@@ -73,15 +73,13 @@ class EnumInferenceTypes(Enum):
onnxRVCNono = "onnxRVCNono"
class EnumPitchExtractorTypes(Enum):
harvest = "harvest"
dio = "dio"
crepe = "crepe"
class EnumFrameworkTypes(Enum):
pyTorch = "pyTorch"
onnx = "onnx"
PitchExtractorType: TypeAlias = Literal[
"harvest",
"dio",
"crepe",
"crepe_full",
"crepe_tiny",
]
class ServerAudioDeviceTypes(Enum):

View File

@@ -11,6 +11,8 @@ def downloadWeight(voiceChangerParams: VoiceChangerParams):
hubert_base_jp = voiceChangerParams.hubert_base_jp
hubert_soft = voiceChangerParams.hubert_soft
nsf_hifigan = voiceChangerParams.nsf_hifigan
crepe_onnx_full = voiceChangerParams.crepe_onnx_full
crepe_onnx_tiny = voiceChangerParams.crepe_onnx_tiny
# file exists check (currently only for rvc)
downloadParams = []
@@ -57,6 +59,24 @@ def downloadWeight(voiceChangerParams: VoiceChangerParams):
}
)
if os.path.exists(crepe_onnx_full) is False:
downloadParams.append(
{
"url": "https://huggingface.co/wok000/weights/resolve/main/crepe/onnx/full.onnx",
"saveTo": crepe_onnx_full,
"position": 5,
}
)
if os.path.exists(crepe_onnx_tiny) is False:
downloadParams.append(
{
"url": "https://huggingface.co/wok000/weights/resolve/main/crepe/onnx/tiny.onnx",
"saveTo": crepe_onnx_tiny,
"position": 6,
}
)
with ThreadPoolExecutor() as pool:
pool.map(download, downloadParams)

View File

@@ -300,5 +300,5 @@ class MMVCv15:
if file_path.find(remove_path + os.path.sep) >= 0:
# print("remove", key, file_path)
sys.modules.pop(key)
except: # type:ignore
except: # NOQA
pass

View File

@@ -1,5 +1,5 @@
import sys
import os
# import sys
# import os
from dataclasses import asdict
import numpy as np
import torch
@@ -7,18 +7,18 @@ import torchaudio
from data.ModelSlot import RVCModelSlot
# avoiding parse arg error in RVC
sys.argv = ["MMVCServerSIO.py"]
# # avoiding parse arg error in RVC
# sys.argv = ["MMVCServerSIO.py"]
if sys.platform.startswith("darwin"):
baseDir = [x for x in sys.path if x.endswith("Contents/MacOS")]
if len(baseDir) != 1:
print("baseDir should be only one ", baseDir)
sys.exit()
modulePath = os.path.join(baseDir[0], "RVC")
sys.path.append(modulePath)
else:
sys.path.append("RVC")
# if sys.platform.startswith("darwin"):
# baseDir = [x for x in sys.path if x.endswith("Contents/MacOS")]
# if len(baseDir) != 1:
# print("baseDir should be only one ", baseDir)
# sys.exit()
# modulePath = os.path.join(baseDir[0], "RVC")
# sys.path.append(modulePath)
# else:
# sys.path.append("RVC")
from voice_changer.RVC.RVCSettings import RVCSettings
@@ -39,9 +39,10 @@ class RVC(VoiceChangerModel):
print("[Voice Changer] [RVC] Creating instance ")
self.deviceManager = DeviceManager.get_instance()
EmbedderManager.initialize(params)
PitchExtractorManager.initialize(params)
self.settings = RVCSettings()
self.params = params
self.pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector)
self.pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu)
self.pipeline: Pipeline | None = None
@@ -76,7 +77,7 @@
elif key in self.settings.strData:
setattr(self.settings, key, str(val))
if key == "f0Detector" and self.pipeline is not None:
pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector)
pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu)
self.pipeline.setPitchExtractor(pitchExtractor)
else:
return False
@@ -112,7 +113,7 @@
self.audio_buffer = newData
if self.slotInfo.f0:
self.pitchf_buffer = np.zeros(new_feature_length)
self.feature_buffer = np.zeros([new_feature_length, self.slotInfo.embChannels])
self.feature_buffer = np.zeros([new_feature_length, self.slotInfo.embChannels])
convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
@@ -201,21 +202,21 @@
def __del__(self):
del self.pipeline
print("---------- REMOVING ---------------")
# print("---------- REMOVING ---------------")
remove_path = os.path.join("RVC")
sys.path = [x for x in sys.path if x.endswith(remove_path) is False]
# remove_path = os.path.join("RVC")
# sys.path = [x for x in sys.path if x.endswith(remove_path) is False]
for key in list(sys.modules):
val = sys.modules.get(key)
try:
file_path = val.__file__
if file_path.find("RVC" + os.path.sep) >= 0:
# print("remove", key, file_path)
sys.modules.pop(key)
except Exception: # type:ignore
# print(e)
pass
# for key in list(sys.modules):
# val = sys.modules.get(key)
# try:
# file_path = val.__file__
# if file_path.find("RVC" + os.path.sep) >= 0:
# # print("remove", key, file_path)
# sys.modules.pop(key)
# except Exception: # type:ignore
# # print(e)
# pass
def export2onnx(self):
modelSlot = self.slotInfo

View File

@@ -4,6 +4,7 @@ from const import EnumInferenceTypes
from voice_changer.RVC.inferencer.OnnxRVCInferencer import OnnxRVCInferencer
class OnnxRVCInferencerNono(OnnxRVCInferencer):
def loadModel(self, file: str, gpu: int):
super().loadModel(file, gpu)

View File

@@ -73,9 +73,9 @@ class Pipeline(object):
def exec(
self,
sid,
audio, # torch.tensor [n]
pitchf, # np.array [m]
feature, # np.array [m, feat]
audio, # torch.tensor [n]
pitchf, # np.array [m]
feature, # np.array [m, feat]
f0_up_key,
index_rate,
if_f0,
@@ -208,14 +208,13 @@
# apply silent front for inference
if type(self.inferencer) in [OnnxRVCInferencer, OnnxRVCInferencerNono]:
npyOffset = math.floor(silence_front * 16000) // 360
feats = feats[:, npyOffset * 2 :, :]
feats = feats[:, npyOffset * 2 :, :] # NOQA
feats_len = feats.shape[1]
if pitch is not None and pitchf is not None:
pitch = pitch[:, -feats_len:]
pitchf = pitchf[:, -feats_len:]
p_len = torch.tensor([feats_len], device=self.device).long()
# Run inference
try:
with torch.no_grad():

View File

@@ -34,7 +34,7 @@ def createPipeline(modelSlot: RVCModelSlot, gpu: int, f0Detector: str):
traceback.print_exc()
# pitchExtractor
pitchExtractor = PitchExtractorManager.getPitchExtractor(f0Detector)
pitchExtractor = PitchExtractorManager.getPitchExtractor(f0Detector, gpu)
# index, feature
index = _loadIndex(modelSlot)
@@ -67,7 +67,7 @@ def _loadIndex(modelSlot: RVCModelSlot):
try:
print("Try loading...", modelSlot.indexFile)
index = faiss.read_index(modelSlot.indexFile)
except:
except: # NOQA
print("[Voice Changer] load index failed. Use no index.")
traceback.print_exc()
return None

View File

@@ -0,0 +1,68 @@
import numpy as np
from const import PitchExtractorType
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
import onnxruntime
from voice_changer.RVC.pitchExtractor import onnxcrepe
class CrepeOnnxPitchExtractor(PitchExtractor):
def __init__(self, pitchExtractorType: PitchExtractorType, file: str, gpu: int):
self.pitchExtractorType = pitchExtractorType
super().__init__()
(
onnxProviders,
onnxProviderOptions,
) = DeviceManager.get_instance().getOnnxExecutionProvider(gpu)
self.onnx_session = onnxruntime.InferenceSession(
file, providers=onnxProviders, provider_options=onnxProviderOptions
)
def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
n_frames = int(len(audio) // window) + 1
start_frame = int(silence_front * sr / window)
real_silence_front = start_frame * window / sr
silence_front_offset = int(np.round(real_silence_front * sr))
audio = audio[silence_front_offset:]
f0_min = 50
f0_max = 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
precision = 10.0
audio_num = audio.cpu()
onnx_f0, onnx_pd = onnxcrepe.predict(
self.onnx_session,
audio_num,
sr,
precision=precision,
fmin=f0_min,
fmax=f0_max,
batch_size=256,
return_periodicity=True,
decoder=onnxcrepe.decode.weighted_argmax,
)
f0 = onnxcrepe.filter.median(onnx_f0, 3)
pd = onnxcrepe.filter.median(onnx_pd, 3)
f0[pd < 0.1] = 0
f0 = f0.squeeze()
f0 = np.pad(f0, (start_frame, n_frames - f0.shape[0] - start_frame), 'constant', constant_values=(0, 0))
f0 *= pow(2, f0_up_key / 12)
pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]]
f0bak = pitchf.copy()
f0_mel = 1127.0 * np.log(1.0 + f0bak / 700.0)
f0_mel = np.clip(
(f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0
)
pitch_coarse = f0_mel.astype(int)
return pitch_coarse, pitchf
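
The tail of extract() above quantizes f0 into 255 coarse mel-spaced bins. A standalone check of that mapping, with illustrative frequencies:

import numpy as np

f0_min, f0_max = 50, 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)

f0 = np.array([0.0, 110.0, 440.0])  # Hz; 0 marks unvoiced frames
f0_mel = 1127.0 * np.log(1.0 + f0 / 700.0)
coarse = np.clip(
    (f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0
).astype(int)
print(coarse)  # [  1  23 122]: unvoiced clamps to bin 1, 440 Hz lands mid-range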

View File

@@ -1,16 +1,16 @@
import torchcrepe
import torch
import numpy as np
from const import EnumPitchExtractorTypes
from const import PitchExtractorType
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
class CrepePitchExtractor(PitchExtractor):
pitchExtractorType: EnumPitchExtractorTypes = EnumPitchExtractorTypes.crepe
def __init__(self):
super().__init__()
self.pitchExtractorType: PitchExtractorType = "crepe"
if torch.cuda.is_available():
self.device = torch.device("cuda:" + str(torch.cuda.current_device()))
else:

View File

@@ -1,16 +1,19 @@
import pyworld
import numpy as np
from const import EnumPitchExtractorTypes
from const import PitchExtractorType
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
class DioPitchExtractor(PitchExtractor):
pitchExtractorType: EnumPitchExtractorTypes = EnumPitchExtractorTypes.dio
def __init__(self):
super().__init__()
self.pitchExtractorType: PitchExtractorType = "dio"
def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
audio = audio.detach().cpu().numpy()
n_frames = int(len(audio) // window) + 1
n_frames = int(len(audio) // window) + 1 # NOQA
start_frame = int(silence_front * sr / window)
real_silence_front = start_frame * window / sr
@@ -31,7 +34,7 @@ class DioPitchExtractor(PitchExtractor):
frame_period=10,
)
f0 = pyworld.stonemask(audio.astype(np.double), _f0, t, sr)
# f0 = np.pad(f0.astype("float"), (start_frame, n_frames - len(f0) - start_frame))
# f0 = np.pad(f0.astype("float"), (start_frame, n_frames - len(f0) - start_frame))
f0 *= pow(2, f0_up_key / 12)
pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]]
@@ -45,4 +48,3 @@
pitch_coarse = np.rint(f0_mel).astype(int)
return pitch_coarse, pitchf

View File

@@ -1,17 +1,20 @@
import pyworld
import numpy as np
import scipy.signal as signal
from const import EnumPitchExtractorTypes
from const import PitchExtractorType
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
class HarvestPitchExtractor(PitchExtractor):
pitchExtractorType: EnumPitchExtractorTypes = EnumPitchExtractorTypes.harvest
def __init__(self):
super().__init__()
self.pitchExtractorType: PitchExtractorType = "harvest"
def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
audio = audio.detach().cpu().numpy()
n_frames = int(len(audio) // window) + 1
n_frames = int(len(audio) // window) + 1 # NOQA
start_frame = int(silence_front * sr / window)
real_silence_front = start_frame * window / sr

View File

@@ -1,14 +1,12 @@
from typing import Protocol
from const import EnumPitchExtractorTypes
class PitchExtractor(Protocol):
pitchExtractorType: EnumPitchExtractorTypes = EnumPitchExtractorTypes.harvest
def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
...
def getPitchExtractorInfo(self):
return {
"pitchExtractorType": self.pitchExtractorType.value,
"pitchExtractorType": self.pitchExtractorType,
}

View File

@@ -1,40 +1,42 @@
from typing import Protocol
from const import EnumPitchExtractorTypes
from const import PitchExtractorType
from voice_changer.RVC.pitchExtractor.CrepeOnnxPitchExtractor import CrepeOnnxPitchExtractor
from voice_changer.RVC.pitchExtractor.DioPitchExtractor import DioPitchExtractor
from voice_changer.RVC.pitchExtractor.HarvestPitchExtractor import HarvestPitchExtractor
from voice_changer.RVC.pitchExtractor.CrepePitchExtractor import CrepePitchExtractor
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
class PitchExtractorManager(Protocol):
currentPitchExtractor: PitchExtractor | None = None
params: VoiceChangerParams
@classmethod
def initialize(cls, params: VoiceChangerParams):
cls.params = params
@classmethod
def getPitchExtractor(
cls, pitchExtractorType: EnumPitchExtractorTypes
cls, pitchExtractorType: PitchExtractorType, gpu: int
) -> PitchExtractor:
cls.currentPitchExtractor = cls.loadPitchExtractor(pitchExtractorType)
cls.currentPitchExtractor = cls.loadPitchExtractor(pitchExtractorType, gpu)
return cls.currentPitchExtractor
@classmethod
def loadPitchExtractor(
cls, pitchExtractorType: EnumPitchExtractorTypes
cls, pitchExtractorType: PitchExtractorType, gpu: int
) -> PitchExtractor:
if (
pitchExtractorType == EnumPitchExtractorTypes.harvest
or pitchExtractorType == EnumPitchExtractorTypes.harvest.value
):
if pitchExtractorType == "harvest":
return HarvestPitchExtractor()
elif (
pitchExtractorType == EnumPitchExtractorTypes.dio
or pitchExtractorType == EnumPitchExtractorTypes.dio.value
):
elif pitchExtractorType == "dio":
return DioPitchExtractor()
elif (
pitchExtractorType == EnumPitchExtractorTypes.crepe
or pitchExtractorType == EnumPitchExtractorTypes.crepe.value
):
elif pitchExtractorType == "crepe":
return CrepePitchExtractor()
elif pitchExtractorType == "crepe_tiny":
return CrepeOnnxPitchExtractor(pitchExtractorType, cls.params.crepe_onnx_tiny, gpu)
elif pitchExtractorType == "crepe_full":
return CrepeOnnxPitchExtractor(pitchExtractorType, cls.params.crepe_onnx_full, gpu)
else:
# return hubert as default
raise RuntimeError(
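
A hedged usage sketch of the widened API; params is assumed to be a VoiceChangerParams whose crepe_onnx_tiny path points at a downloaded model:

from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager

# gpu is forwarded to CrepeOnnxPitchExtractor, which maps it to an ONNX
# execution provider through DeviceManager.
PitchExtractorManager.initialize(params)
extractor = PitchExtractorManager.getPitchExtractor("crepe_tiny", gpu=0)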

View File

@@ -0,0 +1,8 @@
from . import decode # NOQA
from .core import * # NOQA
from . import convert # NOQA
from . import filter # NOQA
from . import load # NOQA
from . import loudness # NOQA
from .session import CrepeInferenceSession # NOQA
from . import threshold # NOQA

View File

@@ -0,0 +1,57 @@
import numpy as np
import scipy
from voice_changer.RVC.pitchExtractor import onnxcrepe
###############################################################################
# Pitch unit conversions
###############################################################################
def bins_to_cents(bins, apply_dither=False):
"""Converts pitch bins to cents"""
cents = onnxcrepe.CENTS_PER_BIN * bins + 1997.3794084376191
# Trade quantization error for noise (disabled by default)
return dither(cents) if apply_dither else cents
def bins_to_frequency(bins, apply_dither=False):
"""Converts pitch bins to frequency in Hz"""
return cents_to_frequency(bins_to_cents(bins, apply_dither=apply_dither))
def cents_to_bins(cents, quantize_fn=np.floor):
"""Converts cents to pitch bins"""
bins = (cents - 1997.3794084376191) / onnxcrepe.CENTS_PER_BIN
return quantize_fn(bins).astype(np.int64)
def cents_to_frequency(cents):
"""Converts cents to frequency in Hz"""
return 10 * 2 ** (cents / 1200)
def frequency_to_bins(frequency, quantize_fn=np.floor):
"""Convert frequency in Hz to pitch bins"""
return cents_to_bins(frequency_to_cents(frequency), quantize_fn)
def frequency_to_cents(frequency):
"""Convert frequency in Hz to cents"""
return 1200 * np.log2(frequency / 10.)
###############################################################################
# Utilities
###############################################################################
def dither(cents):
"""Dither the predicted pitch in cents to remove quantization error"""
noise = scipy.stats.triang.rvs(c=0.5,
loc=-onnxcrepe.CENTS_PER_BIN,
scale=2 * onnxcrepe.CENTS_PER_BIN,
size=cents.shape)
return cents + noise.astype(cents.dtype)
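
A worked check of the conversions above (CENTS_PER_BIN mirrors the constant defined in core.py):

import numpy as np

CENTS_PER_BIN = 20
cents = CENTS_PER_BIN * np.arange(3) + 1997.3794084376191  # bins_to_cents
hz = 10 * 2 ** (cents / 1200)  # cents_to_frequency
print(hz)  # ~[31.70 32.07 32.44]: bin 0 is ~31.7 Hz, each bin adds 20 cents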

View File

@@ -0,0 +1,256 @@
import librosa
import numpy as np
from voice_changer.RVC.pitchExtractor import onnxcrepe
__all__ = ['CENTS_PER_BIN',
'MAX_FMAX',
'PITCH_BINS',
'SAMPLE_RATE',
'WINDOW_SIZE',
'UNVOICED',
'predict',
'preprocess',
'infer',
'postprocess',
'resample']
###############################################################################
# Constants
###############################################################################
CENTS_PER_BIN = 20 # cents
MAX_FMAX = 2006. # hz
PITCH_BINS = 360
SAMPLE_RATE = 16000 # hz
WINDOW_SIZE = 1024 # samples
UNVOICED = np.nan
###############################################################################
# Crepe pitch prediction
###############################################################################
def predict(session,
audio,
sample_rate,
precision=None,
fmin=50.,
fmax=MAX_FMAX,
decoder=onnxcrepe.decode.weighted_viterbi,
return_periodicity=False,
batch_size=None,
pad=True):
"""Performs pitch estimation
Arguments
session (onnxcrepe.CrepeInferenceSession)
An onnxruntime.InferenceSession holding the CREPE model
audio (numpy.ndarray [shape=(n_samples,)])
The audio signal
sample_rate (int)
The sampling rate in Hz
precision (float)
The precision in milliseconds, i.e. the length of each frame
fmin (float)
The minimum allowable frequency in Hz
fmax (float)
The maximum allowable frequency in Hz
decoder (function)
The decoder to use. See decode.py for decoders.
return_periodicity (bool)
Whether to also return the network confidence
batch_size (int)
The number of frames per batch
pad (bool)
Whether to zero-pad the audio
Returns
pitch (numpy.ndarray [shape=(1, 1 + int(time // precision))])
(Optional) periodicity (numpy.ndarray
[shape=(1, 1 + int(time // precision))])
"""
results = []
# Preprocess audio
generator = preprocess(audio,
sample_rate,
precision,
batch_size,
pad)
for frames in generator:
# Infer independent probabilities for each pitch bin
probabilities = infer(session, frames) # shape=(batch, 360)
probabilities = probabilities.transpose(1, 0)[None] # shape=(1, 360, batch)
# Convert probabilities to F0 and periodicity
result = postprocess(probabilities,
fmin,
fmax,
decoder,
return_periodicity)
# Collect the (pitch, periodicity) tuple when periodicity is requested
if isinstance(result, tuple):
result = (result[0], result[1])
results.append(result)
# Split pitch and periodicity
if return_periodicity:
pitch, periodicity = zip(*results)
return np.concatenate(pitch, axis=1), np.concatenate(periodicity, axis=1)
# Concatenate
return np.concatenate(results, axis=1)
def preprocess(audio,
sample_rate,
precision=None,
batch_size=None,
pad=True):
"""Convert audio to model input
Arguments
audio (numpy.ndarray [shape=(time,)])
The audio signals
sample_rate (int)
The sampling rate in Hz
precision (float)
The precision in milliseconds, i.e. the length of each frame
batch_size (int)
The number of frames per batch
pad (bool)
Whether to zero-pad the audio
Returns
frames (numpy.ndarray [shape=(1 + int(time // precision), 1024)])
"""
# Resample
if sample_rate != SAMPLE_RATE:
audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=SAMPLE_RATE)
# Default hop length of 10 ms
hop_length = SAMPLE_RATE / 100 if precision is None else SAMPLE_RATE * precision / 1000
# Get total number of frames
# Maybe pad
if pad:
total_frames = 1 + int(audio.shape[0] / hop_length)
audio = np.pad(
audio,
(WINDOW_SIZE // 2, WINDOW_SIZE // 2))
else:
total_frames = 1 + int((audio.shape[0] - WINDOW_SIZE) / hop_length)
# Default to running all frames in a single batch
batch_size = total_frames if batch_size is None else batch_size
# Generate batches
for i in range(0, total_frames, batch_size):
# Batch indices
start = max(0, int(i * hop_length))
end = min(audio.shape[0],
int((i + batch_size - 1) * hop_length) + WINDOW_SIZE)
# Chunk
n_bytes = audio.strides[-1]
frames = np.lib.stride_tricks.as_strided(
audio[start:end],
shape=((end - start - WINDOW_SIZE) // int(hop_length) + 1, WINDOW_SIZE),
strides=(int(hop_length) * n_bytes, n_bytes)) # shape=(batch, 1024)
# Note:
# Z-score standardization operations originally located here
# (https://github.com/maxrmorrison/torchcrepe/blob/master/torchcrepe/core.py#L692)
# are wrapped into the ONNX models for hardware acceleration.
yield frames
def infer(session, frames):
"""Forward pass through the model
Arguments
session (onnxcrepe.CrepeInferenceSession)
An onnxruntime.InferenceSession holding the CREPE model
frames (numpy.ndarray [shape=(time / precision, 1024)])
The network input
Returns
logits (numpy.ndarray [shape=(1 + int(time // precision), 360)])
"""
# Apply model
return session.run(None, {'frames': frames})[0]
def postprocess(probabilities,
fmin=0.,
fmax=MAX_FMAX,
decoder=onnxcrepe.decode.weighted_viterbi,
return_periodicity=False):
"""Convert model output to F0 and periodicity
Arguments
probabilities (numpy.ndarray [shape=(1, 360, time / precision)])
The probabilities for each pitch bin inferred by the network
fmin (float)
The minimum allowable frequency in Hz
fmax (float)
The maximum allowable frequency in Hz
decoder (function)
The decoder to use. See decode.py for decoders.
return_periodicity (bool)
Whether to also return the network confidence
Returns
pitch (numpy.ndarray [shape=(1, 1 + int(time // precision))])
periodicity (numpy.ndarray [shape=(1, 1 + int(time // precision))])
"""
# Convert frequency range to pitch bin range
minidx = onnxcrepe.convert.frequency_to_bins(fmin)
maxidx = onnxcrepe.convert.frequency_to_bins(fmax, np.ceil)
# Remove frequencies outside allowable range
probabilities[:, :minidx] = float('-inf')
probabilities[:, maxidx:] = float('-inf')
# Perform argmax or viterbi sampling
bins, pitch = decoder(probabilities)
if not return_periodicity:
return pitch
# Compute periodicity from probabilities and decoded pitch bins
return pitch, periodicity(probabilities, bins)
###############################################################################
# Utilities
###############################################################################
def periodicity(probabilities, bins):
"""Computes the periodicity from the network output and pitch bins"""
# shape=(time / precision, 360)
probs_stacked = probabilities.transpose(0, 2, 1).reshape(-1, PITCH_BINS)
# shape=(time / precision, 1)
bins_stacked = bins.reshape(-1, 1).astype(np.int64)
# Use maximum logit over pitch bins as periodicity
periodicity = np.take_along_axis(probs_stacked, bins_stacked, axis=1)
# shape=(batch, time / precision)
return periodicity.reshape(probabilities.shape[0], probabilities.shape[2])
def resample(audio, sample_rate):
"""Resample audio"""
return librosa.resample(audio, orig_sr=sample_rate, target_sr=onnxcrepe.SAMPLE_RATE)
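
A hedged end-to-end sketch of the pipeline above, mirroring how CrepeOnnxPitchExtractor drives it; the model path is a hypothetical placeholder for the file fetched via --crepe_onnx_tiny:

import numpy as np
import onnxruntime
from voice_changer.RVC.pitchExtractor import onnxcrepe

session = onnxruntime.InferenceSession(
    "pretrain/crepe_onnx_tiny.onnx", providers=["CPUExecutionProvider"]
)
audio = np.random.randn(16000).astype(np.float32)  # 1 s at 16 kHz
pitch, periodicity = onnxcrepe.predict(
    session,
    audio,
    16000,
    decoder=onnxcrepe.decode.weighted_argmax,
    return_periodicity=True,
)
print(pitch.shape)  # (1, 101): one frame per 10 ms hop, plus one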

View File

@@ -0,0 +1,80 @@
import librosa
import numpy as np
from voice_changer.RVC.pitchExtractor import onnxcrepe
###############################################################################
# Probability sequence decoding methods
###############################################################################
def argmax(logits):
"""Sample observations by taking the argmax"""
bins = logits.argmax(axis=1)
# Convert to frequency in Hz
return bins, onnxcrepe.convert.bins_to_frequency(bins)
def weighted_argmax(logits: np.ndarray):
"""Sample observations using weighted sum near the argmax"""
# Find center of analysis window
bins = logits.argmax(axis=1)
return bins, _apply_weights(logits, bins)
def viterbi(logits):
"""Sample observations using viterbi decoding"""
# Create viterbi transition matrix
if not hasattr(viterbi, 'transition'):
xx, yy = np.meshgrid(range(360), range(360))
transition = np.maximum(12 - abs(xx - yy), 0)
transition = transition / transition.sum(axis=1, keepdims=True)
viterbi.transition = transition
# Normalize logits (softmax)
logits -= logits.max(axis=1)
exp = np.exp(logits)
probs = exp / np.sum(exp, axis=1)
# Perform viterbi decoding
bins = np.array([
librosa.sequence.viterbi(sequence, viterbi.transition).astype(np.int64)
for sequence in probs])
# Convert to frequency in Hz
return bins, onnxcrepe.convert.bins_to_frequency(bins)
def weighted_viterbi(logits):
"""Sample observations combining viterbi decoding and weighted argmax"""
bins, _ = viterbi(logits)
return bins, _apply_weights(logits, bins)
def _apply_weights(logits, bins):
# Find bounds of analysis window
start = np.maximum(0, bins - 4)
end = np.minimum(logits.shape[1], bins + 5)
# Mask out everything outside of window
for batch in range(logits.shape[0]):
for time in range(logits.shape[2]):
logits[batch, :start[batch, time], time] = float('-inf')
logits[batch, end[batch, time]:, time] = float('-inf')
# Construct weights
if not hasattr(_apply_weights, 'weights'):
weights = onnxcrepe.convert.bins_to_cents(np.arange(360))
_apply_weights.weights = weights[None, :, None]
# Convert to probabilities (ReLU)
probs = np.maximum(0, logits)
# Apply weights
cents = (_apply_weights.weights * probs).sum(axis=1) / probs.sum(axis=1)
# Convert to frequency in Hz
return onnxcrepe.convert.cents_to_frequency(cents)
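
A toy check of weighted_argmax (the layout convention is (batch, 360 bins, time)); a single confident bin decodes to that bin's frequency:

import numpy as np
from voice_changer.RVC.pitchExtractor import onnxcrepe

logits = np.full((1, 360, 1), -1.0, dtype=np.float32)
logits[0, 180, 0] = 5.0  # one confident bin
bins, hz = onnxcrepe.decode.weighted_argmax(logits)
print(bins, hz)  # bin 180 -> ~253.6 Hz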

View File

@@ -0,0 +1,125 @@
import numpy as np
###############################################################################
# Sequence filters
###############################################################################
def mean(signals, win_length=9):
"""Averave filtering for signals containing nan values
Arguments
signals (numpy.ndarray (shape=(batch, time)))
The signals to filter
win_length
The size of the analysis window
Returns
filtered (numpy.ndarray (shape=(batch, time)))
"""
return nanfilter(signals, win_length, nanmean)
def median(signals, win_length):
"""Median filtering for signals containing nan values
Arguments
signals (numpy.ndarray (shape=(batch, time)))
The signals to filter
win_length
The size of the analysis window
Returns
filtered (numpy.ndarray (shape=(batch, time)))
"""
return nanfilter(signals, win_length, nanmedian)
###############################################################################
# Utilities
###############################################################################
def nanfilter(signals, win_length, filter_fn):
"""Filters a sequence, ignoring nan values
Arguments
signals (numpy.ndarray (shape=(batch, time)))
The signals to filter
win_length
The size of the analysis window
filter_fn (function)
The function to use for filtering
Returns
filtered (numpy.ndarray (shape=(batch, time)))
"""
# Output buffer
filtered = np.empty_like(signals)
# Loop over frames
for i in range(signals.shape[1]):
# Get analysis window bounds
start = max(0, i - win_length // 2)
end = min(signals.shape[1], i + win_length // 2 + 1)
# Apply filter to window
filtered[:, i] = filter_fn(signals[:, start:end])
return filtered
def nanmean(signals):
"""Computes the mean, ignoring nans
Arguments
signals (numpy.ndarray [shape=(batch, time)])
The signals to filter
Returns
filtered (numpy.ndarray [shape=(batch, time)])
"""
signals = signals.copy()  # numpy arrays have no clone(); copy to avoid mutating the input
# Find nans
nans = np.isnan(signals)
# Set nans to 0.
signals[nans] = 0.
# Compute average
return signals.sum(axis=1) / (~nans).astype(np.float32).sum(axis=1)
def nanmedian(signals):
"""Computes the median, ignoring nans
Arguments
signals (numpy.ndarray [shape=(batch, time)])
The signals to filter
Returns
filtered (numpy.ndarray [shape=(batch, time)])
"""
# Find nans
nans = np.isnan(signals)
# Compute median for each slice
medians = [nanmedian1d(signal[~nan]) for signal, nan in zip(signals, nans)]
# Stack results
return np.array(medians, dtype=signals.dtype)
def nanmedian1d(signal):
"""Computes the median. If signal is empty, returns torch.nan
Arguments
signal (numpy.ndarray [shape=(time,)])
Returns
median (numpy.ndarray [shape=(1,)])
"""
return np.median(signal) if signal.size else np.nan
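
A quick check of median() above (assumes the repo is on sys.path so the package imports): nan values are dropped inside each window:

import numpy as np
from voice_changer.RVC.pitchExtractor import onnxcrepe

signals = np.array([[1.0, np.nan, 3.0, 4.0, 5.0]])
print(onnxcrepe.filter.median(signals, win_length=3))
# [[1.  2.  3.5 4.  4.5]]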

View File

@@ -0,0 +1,12 @@
import librosa
import numpy as np
def audio(filename):
"""Load audio from disk"""
samples, sr = librosa.load(filename, sr=None)
if len(samples.shape) > 1:
# To mono (librosa returns shape (channels, samples) for multi-channel)
samples = np.mean(samples, axis=0)
return samples, sr

View File

@@ -0,0 +1,73 @@
import warnings
import librosa
import numpy as np
from voice_changer.RVC.pitchExtractor import onnxcrepe
###############################################################################
# Constants
###############################################################################
# Minimum decibel level
MIN_DB = -100.
# Reference decibel level
REF_DB = 20.
###############################################################################
# A-weighted loudness
###############################################################################
def a_weighted(audio, sample_rate, hop_length=None, pad=True):
"""Retrieve the per-frame loudness"""
# Default hop length of 10 ms
hop_length = sample_rate // 100 if hop_length is None else hop_length
# Drop the batch dimension
audio = audio.squeeze(0)
# Resample
if sample_rate != onnxcrepe.SAMPLE_RATE:
audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=onnxcrepe.SAMPLE_RATE)
hop_length = int(hop_length * onnxcrepe.SAMPLE_RATE / sample_rate)
# Cache weights
if not hasattr(a_weighted, 'weights'):
a_weighted.weights = perceptual_weights()
# Take stft
stft = librosa.stft(audio,
n_fft=onnxcrepe.WINDOW_SIZE,
hop_length=hop_length,
win_length=onnxcrepe.WINDOW_SIZE,
center=pad,
pad_mode='constant')
# Compute magnitude on db scale
db = librosa.amplitude_to_db(np.abs(stft))
# Apply A-weighting
weighted = db + a_weighted.weights
# Threshold
weighted[weighted < MIN_DB] = MIN_DB
# Average over weighted frequencies
return weighted.mean(axis=0).astype(np.float32)[None]
def perceptual_weights():
"""A-weighted frequency-dependent perceptual loudness weights"""
frequencies = librosa.fft_frequencies(sr=onnxcrepe.SAMPLE_RATE,
n_fft=onnxcrepe.WINDOW_SIZE)
# A warning is raised for nearly inaudible frequencies, but it ends up
# defaulting to -100 db. That default is fine for our purposes.
with warnings.catch_warnings():
warnings.simplefilter('ignore', RuntimeWarning)
return librosa.A_weighting(frequencies)[:, None] - REF_DB
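
A quick check of a_weighted(): silence stays at the MIN_DB floor, one value per 10 ms hop; the input carries the leading batch dimension that squeeze(0) removes:

import numpy as np
from voice_changer.RVC.pitchExtractor import onnxcrepe

audio = np.zeros((1, 16000), dtype=np.float32)  # 1 s of silence at 16 kHz
loudness = onnxcrepe.loudness.a_weighted(audio, 16000)
print(loudness.shape, loudness.min())  # (1, 101) -100.0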

View File

@@ -0,0 +1 @@
Modules in this folder are from https://github.com/yqzhishen/onnxcrepe at commit ca7e5d7f2dfca5cc4d99e8d546b00793ca4e7157.

View File

@@ -0,0 +1,9 @@
import os
import onnxruntime as ort
class CrepeInferenceSession(ort.InferenceSession):
def __init__(self, model='full', sess_options=None, providers=None, provider_options=None, **kwargs):
model_path = os.path.join(os.path.dirname(__file__), 'assets', f'{model}.onnx')
super().__init__(model_path, sess_options, providers, provider_options, **kwargs)
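
A hedged usage note: the session resolves the model name against an assets directory beside the package, so 'tiny' maps to onnxcrepe/assets/tiny.onnx; that file must already exist (this project instead downloads the ONNX weights to the paths given by --crepe_onnx_full/--crepe_onnx_tiny):

from voice_changer.RVC.pitchExtractor.onnxcrepe import CrepeInferenceSession

# providers follows onnxruntime's convention; CPU is the portable default.
session = CrepeInferenceSession(model="tiny", providers=["CPUExecutionProvider"])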

View File

@@ -0,0 +1,129 @@
import numpy as np
from voice_changer.RVC.pitchExtractor import onnxcrepe
###############################################################################
# Pitch thresholding methods
###############################################################################
class At:
"""Simple thresholding at a specified probability value"""
def __init__(self, value):
self.value = value
def __call__(self, pitch, periodicity):
# Make a copy to prevent in-place modification
pitch = pitch.copy()
# Threshold
pitch[periodicity < self.value] = onnxcrepe.UNVOICED
return pitch
class Hysteresis:
"""Hysteresis thresholding"""
def __init__(self,
lower_bound=.19,
upper_bound=.31,
width=.2,
stds=1.7,
return_threshold=False):
self.lower_bound = lower_bound
self.upper_bound = upper_bound
self.width = width
self.stds = stds
self.return_threshold = return_threshold
def __call__(self, pitch, periodicity):
# Perform hysteresis in log-2 space
pitch = np.log2(pitch).flatten()
# Flatten periodicity
periodicity = periodicity.flatten()
# Ignore confidently unvoiced pitch
pitch[periodicity < self.lower_bound] = onnxcrepe.UNVOICED
# Whiten pitch
mean, std = np.nanmean(pitch), np.nanstd(pitch)
pitch = (pitch - mean) / std
# Require high confidence to make predictions far from the mean
parabola = self.width * pitch ** 2 - self.width * self.stds ** 2
threshold = self.lower_bound + np.clip(parabola, 0, 1 - self.lower_bound)
threshold[np.isnan(threshold)] = self.lower_bound
# Apply hysteresis to prevent short, unconfident voiced regions
i = 0
while i < len(periodicity) - 1:
# Detect unvoiced to voiced transition
if periodicity[i] < threshold[i] and periodicity[i + 1] > threshold[i + 1]:
# Grow region until next unvoiced or end of array
start, end, keep = i + 1, i + 1, False
while end < len(periodicity) and periodicity[end] > threshold[end]:
if periodicity[end] > self.upper_bound:
keep = True
end += 1
# Force unvoiced if we didn't pass the confidence required by
# the hysteresis
if not keep:
threshold[start:end] = 1
i = end
else:
i += 1
# Remove pitch with low periodicity
pitch[periodicity < threshold] = onnxcrepe.UNVOICED
# Unwhiten
pitch = pitch * std + mean
# Convert to Hz
pitch = np.array(2 ** pitch)[None, :]
# Optionally return threshold
if self.return_threshold:
return pitch, np.array(threshold)
return pitch
###############################################################################
# Periodicity thresholding methods
###############################################################################
class Silence:
"""Set periodicity to zero in silent regions"""
def __init__(self, value=-60):
self.value = value
def __call__(self,
periodicity,
audio,
sample_rate=onnxcrepe.SAMPLE_RATE,
precision=None,
pad=True):
# Don't modify in-place
periodicity = periodicity.copy()
# Compute loudness (fall back to the 10 ms default hop when precision is None)
hop_length = sample_rate // 100 if precision is None else sample_rate * precision // 1000
loudness = onnxcrepe.loudness.a_weighted(
audio, sample_rate, hop_length, pad)
# Threshold silence
periodicity[loudness < self.value] = 0.
return periodicity
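
A quick check of At(): frames whose periodicity falls below the cutoff are marked UNVOICED (nan):

import numpy as np
from voice_changer.RVC.pitchExtractor import onnxcrepe

pitch = np.array([[100.0, 200.0, 300.0]])
periodicity = np.array([[0.9, 0.05, 0.8]])
print(onnxcrepe.threshold.At(0.1)(pitch, periodicity))  # [[100.  nan 300.]]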

View File

@@ -12,3 +12,5 @@ class VoiceChangerParams:
hubert_soft: str
nsf_hifigan: str
sample_mode: str
crepe_onnx_full: str
crepe_onnx_tiny: str