New Feature:
- Add Crepe Full/Tiny (onnx)
- Remove test connect for local

Refactor:
- RVC: comment out module importer

commit d5561c2212 (parent 099f82cc60)
@@ -21,7 +21,7 @@
 {
     "name": "configArea",
     "options": {
-        "detectors": ["dio", "harvest", "crepe"],
+        "detectors": ["dio", "harvest", "crepe", "crepe_full", "crepe_tiny"],
         "inputChunkNums": [8, 16, 24, 32, 40, 48, 64, 80, 96, 112, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 2048]
     }
 }

client/demo/dist/index.html (vendored, 11 lines changed)
@@ -1 +1,10 @@
-<!doctype html><html style="width:100%;height:100%;overflow:hidden"><head><meta charset="utf-8"/><title>Voice Changer Client Demo</title><script defer="defer" src="index.js"></script></head><body style="width:100%;height:100%;margin:0"><div id="app" style="width:100%;height:100%"></div></body></html>
+<!DOCTYPE html>
+<html style="width: 100%; height: 100%; overflow: hidden">
+    <head>
+        <meta charset="utf-8" />
+        <title>Voice Changer Client Demo</title>
+        <script defer src="index.js"></script></head>
+    <body style="width: 100%; height: 100%; margin: 0px">
+        <div id="app" style="width: 100%; height: 100%"></div>
+    </body>
+</html>

client/demo/dist/index.js (vendored, 1125 lines changed)
File diff suppressed because one or more lines are too long

client/demo/dist/index.js.LICENSE.txt (vendored, 31 lines changed)
@@ -1,31 +0,0 @@
-/*! regenerator-runtime -- Copyright (c) 2014-present, Facebook, Inc. -- license (MIT): https://github.com/facebook/regenerator/blob/main/LICENSE */
-
-/**
- * @license React
- * react-dom.production.min.js
- *
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-/**
- * @license React
- * react.production.min.js
- *
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-/**
- * @license React
- * scheduler.production.min.js
- *
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
@@ -21,7 +21,7 @@
 {
     "name": "configArea",
     "options": {
-        "detectors": ["dio", "harvest", "crepe"],
+        "detectors": ["dio", "harvest", "crepe", "crepe_full", "crepe_tiny"],
         "inputChunkNums": [8, 16, 24, 32, 40, 48, 64, 80, 96, 112, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 2048]
     }
 }
@@ -42,8 +42,9 @@ export type CrossFadeOverlapSize = typeof CrossFadeOverlapSize[keyof typeof CrossFadeOverlapSize]
 export const F0Detector = {
     "dio": "dio",
     "harvest": "harvest",
-    // "parselmouth": "parselmouth",
     "crepe": "crepe",
+    "crepe_full": "crepe_full",
+    "crepe_tiny": "crepe_tiny",
 } as const
 export type F0Detector = typeof F0Detector[keyof typeof F0Detector]
@@ -34,6 +34,7 @@ def setupArgParser():
     parser.add_argument("--logLevel", type=str, default="error", help="Log level info|critical|error. (default: error)")
     parser.add_argument("-p", type=int, default=18888, help="port")
     parser.add_argument("--https", type=strtobool, default=False, help="use https")
+    parser.add_argument("--test_connect", type=str, default="8.8.8.8", help="test connect to detect ip in https mode. default 8.8.8.8")
     parser.add_argument("--httpsKey", type=str, default="ssl.key", help="path for the key of https")
     parser.add_argument("--httpsCert", type=str, default="ssl.cert", help="path for the cert of https")
     parser.add_argument("--httpsSelfSigned", type=strtobool, default=True, help="generate self-signed certificate")
@@ -48,6 +49,8 @@ def setupArgParser():
     parser.add_argument("--hubert_base_jp", type=str, help="path to hubert_base_jp model(pytorch)")
     parser.add_argument("--hubert_soft", type=str, help="path to hubert_soft model(pytorch)")
     parser.add_argument("--nsf_hifigan", type=str, help="path to nsf_hifigan model(pytorch)")
+    parser.add_argument("--crepe_onnx_full", type=str, help="path to crepe_onnx_full")
+    parser.add_argument("--crepe_onnx_tiny", type=str, help="path to crepe_onnx_tiny")

     return parser
@@ -85,6 +88,9 @@ voiceChangerParams = VoiceChangerParams(
     hubert_base_jp=args.hubert_base_jp,
     hubert_soft=args.hubert_soft,
     nsf_hifigan=args.nsf_hifigan,
+    crepe_onnx_full=args.crepe_onnx_full,
+    crepe_onnx_tiny=args.crepe_onnx_tiny,
+
     sample_mode=args.sample_mode,
 )
@@ -120,6 +126,7 @@ if __name__ == "__mp_main__":
 if __name__ == "__main__":
     mp.freeze_support()

+    printMessage(f"PYTHON:{sys.version}", level=2)
     printMessage("Voice Changerを起動しています。", level=2)  # "Starting Voice Changer."
     # Download (weights)
     try:
@@ -195,10 +202,10 @@ if __name__ == "__main__":
         else:
             printMessage(f"http://localhost:{EX_PORT}/", level=1)
     else:  # launched directly with python
-        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
-        s.connect(("8.8.8.8", 80))
-        hostname = s.getsockname()[0]
         if args.https == 1:
+            s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+            s.connect((args.test_connect, 80))
+            hostname = s.getsockname()[0]
             printMessage(f"https://localhost:{PORT}/", level=1)
             printMessage(f"https://{hostname}:{PORT}/", level=1)
         else:
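Note: the hunk above makes the LAN-IP probe run only in https mode and routes it through the new --test_connect flag. Connecting a UDP socket sends no packets; it only asks the OS to choose the outbound interface for the given target, so getsockname() returns the local address a peer on that route would see. A minimal standalone sketch of the same technique (the target address is just a routing hint, 8.8.8.8 by default here):

    import socket

    def detect_local_ip(test_connect: str = "8.8.8.8") -> str:
        # UDP connect() performs no handshake; it only selects a route.
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            s.connect((test_connect, 80))
            return s.getsockname()[0]  # address of the chosen local interface
        finally:
            s.close()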
@@ -73,15 +73,13 @@ class EnumInferenceTypes(Enum):
     onnxRVCNono = "onnxRVCNono"


-class EnumPitchExtractorTypes(Enum):
-    harvest = "harvest"
-    dio = "dio"
-    crepe = "crepe"
-
-
-class EnumFrameworkTypes(Enum):
-    pyTorch = "pyTorch"
-    onnx = "onnx"
+PitchExtractorType: TypeAlias = Literal[
+    "harvest",
+    "dio",
+    "crepe",
+    "crepe_full",
+    "crepe_tiny",
+]


 class ServerAudioDeviceTypes(Enum):
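Note: moving from EnumPitchExtractorTypes to a Literal alias means the plain strings arriving from the client config ("crepe_tiny", etc.) are themselves valid values, so the Enum/raw-string double checks disappear (compare the simplified branches in PitchExtractorManager further down). A small illustration of the narrowing, assuming const.py carries the usual typing imports:

    from typing import Literal, TypeAlias

    PitchExtractorType: TypeAlias = Literal[
        "harvest", "dio", "crepe", "crepe_full", "crepe_tiny"
    ]

    def describe(t: PitchExtractorType) -> str:
        return f"pitch extractor: {t}"

    describe("crepe_full")   # accepted by a type checker
    describe("parselmouth")  # flagged statically; still a plain str at runtime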
@@ -11,6 +11,8 @@ def downloadWeight(voiceChangerParams: VoiceChangerParams):
     hubert_base_jp = voiceChangerParams.hubert_base_jp
     hubert_soft = voiceChangerParams.hubert_soft
     nsf_hifigan = voiceChangerParams.nsf_hifigan
+    crepe_onnx_full = voiceChangerParams.crepe_onnx_full
+    crepe_onnx_tiny = voiceChangerParams.crepe_onnx_tiny

     # file exists check (currently only for rvc)
     downloadParams = []
@@ -57,6 +59,24 @@ def downloadWeight(voiceChangerParams: VoiceChangerParams):
             }
         )

+    if os.path.exists(crepe_onnx_full) is False:
+        downloadParams.append(
+            {
+                "url": "https://huggingface.co/wok000/weights/resolve/main/crepe/onnx/full.onnx",
+                "saveTo": crepe_onnx_full,
+                "position": 5,
+            }
+        )
+
+    if os.path.exists(crepe_onnx_tiny) is False:
+        downloadParams.append(
+            {
+                "url": "https://huggingface.co/wok000/weights/resolve/main/crepe/onnx/tiny.onnx",
+                "saveTo": crepe_onnx_tiny,
+                "position": 6,
+            }
+        )
+
     with ThreadPoolExecutor() as pool:
         pool.map(download, downloadParams)
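Note: the repo's actual download() helper is not part of this diff; only the shape of its input ({"url", "saveTo", "position"}) is visible above. A hypothetical minimal stand-in honoring the same keys ("position" presumably selects a progress-bar slot and is ignored here):

    import os
    import urllib.request

    def download(params: dict) -> None:
        # Fetch params["url"] and store it at params["saveTo"].
        os.makedirs(os.path.dirname(params["saveTo"]), exist_ok=True)
        urllib.request.urlretrieve(params["url"], params["saveTo"])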
@@ -300,5 +300,5 @@ class MMVCv15:
                 if file_path.find(remove_path + os.path.sep) >= 0:
                     # print("remove", key, file_path)
                     sys.modules.pop(key)
-            except:  # type:ignore
+            except:  # NOQA
                 pass
@@ -1,5 +1,5 @@
-import sys
-import os
+# import sys
+# import os
 from dataclasses import asdict
 import numpy as np
 import torch
@@ -7,18 +7,18 @@ import torchaudio
 from data.ModelSlot import RVCModelSlot


-# avoiding parse arg error in RVC
-sys.argv = ["MMVCServerSIO.py"]
+# # avoiding parse arg error in RVC
+# sys.argv = ["MMVCServerSIO.py"]

-if sys.platform.startswith("darwin"):
-    baseDir = [x for x in sys.path if x.endswith("Contents/MacOS")]
-    if len(baseDir) != 1:
-        print("baseDir should be only one ", baseDir)
-        sys.exit()
-    modulePath = os.path.join(baseDir[0], "RVC")
-    sys.path.append(modulePath)
-else:
-    sys.path.append("RVC")
+# if sys.platform.startswith("darwin"):
+#     baseDir = [x for x in sys.path if x.endswith("Contents/MacOS")]
+#     if len(baseDir) != 1:
+#         print("baseDir should be only one ", baseDir)
+#         sys.exit()
+#     modulePath = os.path.join(baseDir[0], "RVC")
+#     sys.path.append(modulePath)
+# else:
+#     sys.path.append("RVC")


 from voice_changer.RVC.RVCSettings import RVCSettings
@@ -39,9 +39,10 @@ class RVC(VoiceChangerModel):
         print("[Voice Changer] [RVC] Creating instance ")
         self.deviceManager = DeviceManager.get_instance()
         EmbedderManager.initialize(params)
+        PitchExtractorManager.initialize(params)
         self.settings = RVCSettings()
         self.params = params
-        self.pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector)
+        self.pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu)

         self.pipeline: Pipeline | None = None
@@ -76,7 +77,7 @@ class RVC(VoiceChangerModel):
         elif key in self.settings.strData:
             setattr(self.settings, key, str(val))
             if key == "f0Detector" and self.pipeline is not None:
-                pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector)
+                pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu)
                 self.pipeline.setPitchExtractor(pitchExtractor)
         else:
             return False
@@ -112,7 +113,7 @@ class RVC(VoiceChangerModel):
             self.audio_buffer = newData
         if self.slotInfo.f0:
             self.pitchf_buffer = np.zeros(new_feature_length)
         self.feature_buffer = np.zeros([new_feature_length, self.slotInfo.embChannels])

         convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
@@ -201,21 +202,21 @@ class RVC(VoiceChangerModel):
     def __del__(self):
         del self.pipeline

-        print("---------- REMOVING ---------------")
+        # print("---------- REMOVING ---------------")

-        remove_path = os.path.join("RVC")
-        sys.path = [x for x in sys.path if x.endswith(remove_path) is False]
+        # remove_path = os.path.join("RVC")
+        # sys.path = [x for x in sys.path if x.endswith(remove_path) is False]

-        for key in list(sys.modules):
-            val = sys.modules.get(key)
-            try:
-                file_path = val.__file__
-                if file_path.find("RVC" + os.path.sep) >= 0:
-                    # print("remove", key, file_path)
-                    sys.modules.pop(key)
-            except Exception:  # type:ignore
-                # print(e)
-                pass
+        # for key in list(sys.modules):
+        #     val = sys.modules.get(key)
+        #     try:
+        #         file_path = val.__file__
+        #         if file_path.find("RVC" + os.path.sep) >= 0:
+        #             # print("remove", key, file_path)
+        #             sys.modules.pop(key)
+        #     except Exception:  # type:ignore
+        #         # print(e)
+        #         pass

     def export2onnx(self):
         modelSlot = self.slotInfo
@@ -51,7 +51,7 @@ class OnnxRVCInferencer(Inferencer):
                 "p_len": pitch_length.cpu().numpy().astype(np.int64),
                 "pitch": pitch.cpu().numpy().astype(np.int64),
                 "pitchf": pitchf.cpu().numpy().astype(np.float32),
                 "sid": sid.cpu().numpy().astype(np.int64)
             },
         )
     else:
@@ -62,7 +62,7 @@ class OnnxRVCInferencer(Inferencer):
                 "p_len": pitch_length.cpu().numpy().astype(np.int64),
                 "pitch": pitch.cpu().numpy().astype(np.int64),
                 "pitchf": pitchf.cpu().numpy().astype(np.float32),
                 "sid": sid.cpu().numpy().astype(np.int64)
             },
         )
@@ -4,6 +4,7 @@ from const import EnumInferenceTypes

 from voice_changer.RVC.inferencer.OnnxRVCInferencer import OnnxRVCInferencer

+
 class OnnxRVCInferencerNono(OnnxRVCInferencer):
     def loadModel(self, file: str, gpu: int):
         super().loadModel(file, gpu)
@@ -73,9 +73,9 @@ class Pipeline(object):
     def exec(
         self,
        sid,
        audio,  # torch.tensor [n]
        pitchf,  # np.array [m]
        feature,  # np.array [m, feat]
        f0_up_key,
        index_rate,
        if_f0,
@@ -208,13 +208,12 @@ class Pipeline(object):
             # apply silent front for inference
             if type(self.inferencer) in [OnnxRVCInferencer, OnnxRVCInferencerNono]:
                 npyOffset = math.floor(silence_front * 16000) // 360
-                feats = feats[:, npyOffset * 2 :, :]
+                feats = feats[:, npyOffset * 2 :, :]  # NOQA
             feats_len = feats.shape[1]
             if pitch is not None and pitchf is not None:
                 pitch = pitch[:, -feats_len:]
                 pitchf = pitchf[:, -feats_len:]
             p_len = torch.tensor([feats_len], device=self.device).long()

-
             # Run inference
             try:
@@ -34,7 +34,7 @@ def createPipeline(modelSlot: RVCModelSlot, gpu: int, f0Detector: str):
         traceback.print_exc()

     # pitchExtractor
-    pitchExtractor = PitchExtractorManager.getPitchExtractor(f0Detector)
+    pitchExtractor = PitchExtractorManager.getPitchExtractor(f0Detector, gpu)

     # index, feature
     index = _loadIndex(modelSlot)
@@ -67,7 +67,7 @@ def _loadIndex(modelSlot: RVCModelSlot):
     try:
         print("Try loading...", modelSlot.indexFile)
         index = faiss.read_index(modelSlot.indexFile)
-    except:
+    except:  # NOQA
         print("[Voice Changer] load index failed. Use no index.")
         traceback.print_exc()
         return None
server/voice_changer/RVC/pitchExtractor/CrepeOnnxPitchExtractor.py (new file)
@@ -0,0 +1,68 @@
+import numpy as np
+from const import PitchExtractorType
+from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
+from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
+import onnxruntime
+from voice_changer.RVC.pitchExtractor import onnxcrepe
+
+
+class CrepeOnnxPitchExtractor(PitchExtractor):
+
+    def __init__(self, pitchExtractorType: PitchExtractorType, file: str, gpu: int):
+        self.pitchExtractorType = pitchExtractorType
+        super().__init__()
+        (
+            onnxProviders,
+            onnxProviderOptions,
+        ) = DeviceManager.get_instance().getOnnxExecutionProvider(gpu)
+
+        self.onnx_session = onnxruntime.InferenceSession(
+            file, providers=onnxProviders, provider_options=onnxProviderOptions
+        )
+
+    def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
+        n_frames = int(len(audio) // window) + 1
+        start_frame = int(silence_front * sr / window)
+        real_silence_front = start_frame * window / sr
+
+        silence_front_offset = int(np.round(real_silence_front * sr))
+        audio = audio[silence_front_offset:]
+
+        f0_min = 50
+        f0_max = 1100
+        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+
+        precision = 10.0
+
+        audio_num = audio.cpu()
+        onnx_f0, onnx_pd = onnxcrepe.predict(
+            self.onnx_session,
+            audio_num,
+            sr,
+            precision=precision,
+            fmin=f0_min,
+            fmax=f0_max,
+            batch_size=256,
+            return_periodicity=True,
+            decoder=onnxcrepe.decode.weighted_argmax,
+        )
+
+        f0 = onnxcrepe.filter.median(onnx_f0, 3)
+        pd = onnxcrepe.filter.median(onnx_pd, 3)
+
+        f0[pd < 0.1] = 0
+        f0 = f0.squeeze()
+
+        f0 = np.pad(f0, (start_frame, n_frames - f0.shape[0] - start_frame), 'constant', constant_values=(0, 0))
+
+        f0 *= pow(2, f0_up_key / 12)
+        pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]]
+        f0bak = pitchf.copy()
+        f0_mel = 1127.0 * np.log(1.0 + f0bak / 700.0)
+        f0_mel = np.clip(
+            (f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0
+        )
+        pitch_coarse = f0_mel.astype(int)
+
+        return pitch_coarse, pitchf
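Note: the tail of extract() maps f0 in Hz onto 255 coarse mel-spaced bins, the pitch representation the RVC pipeline consumes. The mapping is mel = 1127 * ln(1 + f/700), rescaled so f0_min..f0_max spans bins 1..255 (f0 = 0 clips to bin 1, effectively "unvoiced"). A worked check of the same arithmetic:

    import numpy as np

    f0_min, f0_max = 50, 1100
    mel_min = 1127 * np.log(1 + f0_min / 700)   # ~77.75
    mel_max = 1127 * np.log(1 + f0_max / 700)   # ~1064.43

    f0 = 440.0                                  # A4
    mel = 1127 * np.log(1 + f0 / 700)           # ~549.64
    coarse = int(np.clip((mel - mel_min) * 254 / (mel_max - mel_min) + 1, 1, 255))
    assert coarse == 122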
@@ -1,16 +1,16 @@
 import torchcrepe
 import torch
 import numpy as np
-from const import EnumPitchExtractorTypes
+from const import PitchExtractorType

 from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor


 class CrepePitchExtractor(PitchExtractor):
-    pitchExtractorType: EnumPitchExtractorTypes = EnumPitchExtractorTypes.crepe
-
     def __init__(self):
         super().__init__()
+        self.pitchExtractorType: PitchExtractorType = "crepe"
         if torch.cuda.is_available():
             self.device = torch.device("cuda:" + str(torch.cuda.current_device()))
         else:
@@ -1,16 +1,19 @@
 import pyworld
 import numpy as np
-from const import EnumPitchExtractorTypes
+from const import PitchExtractorType

 from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor


 class DioPitchExtractor(PitchExtractor):
-    pitchExtractorType: EnumPitchExtractorTypes = EnumPitchExtractorTypes.dio
+    def __init__(self):
+        super().__init__()
+        self.pitchExtractorType: PitchExtractorType = "dio"

     def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
         audio = audio.detach().cpu().numpy()
-        n_frames = int(len(audio) // window) + 1
+        n_frames = int(len(audio) // window) + 1  # NOQA
         start_frame = int(silence_front * sr / window)
         real_silence_front = start_frame * window / sr
@@ -31,7 +34,7 @@ class DioPitchExtractor(PitchExtractor):
             frame_period=10,
         )
         f0 = pyworld.stonemask(audio.astype(np.double), _f0, t, sr)
         # f0 = np.pad(f0.astype("float"), (start_frame, n_frames - len(f0) - start_frame))

         f0 *= pow(2, f0_up_key / 12)
         pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]]
@@ -45,4 +48,3 @@ class DioPitchExtractor(PitchExtractor):
         pitch_coarse = np.rint(f0_mel).astype(int)

         return pitch_coarse, pitchf
-
@@ -1,17 +1,20 @@
 import pyworld
 import numpy as np
 import scipy.signal as signal
-from const import EnumPitchExtractorTypes
+from const import PitchExtractorType

 from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor


 class HarvestPitchExtractor(PitchExtractor):
-    pitchExtractorType: EnumPitchExtractorTypes = EnumPitchExtractorTypes.harvest
+    def __init__(self):
+        super().__init__()
+        self.pitchExtractorType: PitchExtractorType = "harvest"

     def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
         audio = audio.detach().cpu().numpy()
-        n_frames = int(len(audio) // window) + 1
+        n_frames = int(len(audio) // window) + 1  # NOQA
         start_frame = int(silence_front * sr / window)
         real_silence_front = start_frame * window / sr
@@ -1,14 +1,12 @@
 from typing import Protocol
-from const import EnumPitchExtractorTypes


 class PitchExtractor(Protocol):
-    pitchExtractorType: EnumPitchExtractorTypes = EnumPitchExtractorTypes.harvest
-
     def extract(self, audio, f0_up_key, sr, window, silence_front=0):
         ...

     def getPitchExtractorInfo(self):
         return {
-            "pitchExtractorType": self.pitchExtractorType.value,
+            "pitchExtractorType": self.pitchExtractorType,
         }
@@ -1,40 +1,42 @@
 from typing import Protocol
-from const import EnumPitchExtractorTypes
+from const import PitchExtractorType
+from voice_changer.RVC.pitchExtractor.CrepeOnnxPitchExtractor import CrepeOnnxPitchExtractor
 from voice_changer.RVC.pitchExtractor.DioPitchExtractor import DioPitchExtractor
 from voice_changer.RVC.pitchExtractor.HarvestPitchExtractor import HarvestPitchExtractor
 from voice_changer.RVC.pitchExtractor.CrepePitchExtractor import CrepePitchExtractor
 from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
+from voice_changer.utils.VoiceChangerParams import VoiceChangerParams


 class PitchExtractorManager(Protocol):
     currentPitchExtractor: PitchExtractor | None = None
+    params: VoiceChangerParams
+
+    @classmethod
+    def initialize(cls, params: VoiceChangerParams):
+        cls.params = params

     @classmethod
     def getPitchExtractor(
-        cls, pitchExtractorType: EnumPitchExtractorTypes
+        cls, pitchExtractorType: PitchExtractorType, gpu: int
     ) -> PitchExtractor:
-        cls.currentPitchExtractor = cls.loadPitchExtractor(pitchExtractorType)
+        cls.currentPitchExtractor = cls.loadPitchExtractor(pitchExtractorType, gpu)
         return cls.currentPitchExtractor

     @classmethod
     def loadPitchExtractor(
-        cls, pitchExtractorType: EnumPitchExtractorTypes
+        cls, pitchExtractorType: PitchExtractorType, gpu: int
     ) -> PitchExtractor:
-        if (
-            pitchExtractorType == EnumPitchExtractorTypes.harvest
-            or pitchExtractorType == EnumPitchExtractorTypes.harvest.value
-        ):
+        if pitchExtractorType == "harvest":
             return HarvestPitchExtractor()
-        elif (
-            pitchExtractorType == EnumPitchExtractorTypes.dio
-            or pitchExtractorType == EnumPitchExtractorTypes.dio.value
-        ):
+        elif pitchExtractorType == "dio":
             return DioPitchExtractor()
-        elif (
-            pitchExtractorType == EnumPitchExtractorTypes.crepe
-            or pitchExtractorType == EnumPitchExtractorTypes.crepe.value
-        ):
+        elif pitchExtractorType == "crepe":
             return CrepePitchExtractor()
+        elif pitchExtractorType == "crepe_tiny":
+            return CrepeOnnxPitchExtractor(pitchExtractorType, cls.params.crepe_onnx_tiny, gpu)
+        elif pitchExtractorType == "crepe_full":
+            return CrepeOnnxPitchExtractor(pitchExtractorType, cls.params.crepe_onnx_full, gpu)
         else:
             # return hubert as default
             raise RuntimeError(
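Note: taken together, the manager changes give a two-step API: initialize() stashes the model paths once, and each later getPitchExtractor() call can build the ONNX-backed extractors from them. A usage sketch with an assumed VoiceChangerParams instance:

    # params: a VoiceChangerParams with crepe_onnx_full / crepe_onnx_tiny set
    PitchExtractorManager.initialize(params)
    extractor = PitchExtractorManager.getPitchExtractor("crepe_tiny", gpu=0)
    # "harvest" / "dio" / "crepe" need no model file and ignore the stored params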
server/voice_changer/RVC/pitchExtractor/onnxcrepe/__init__.py (new file)
@@ -0,0 +1,8 @@
+from . import decode  # NOQA
+from .core import *  # NOQA
+from . import convert  # NOQA
+from . import filter  # NOQA
+from . import load  # NOQA
+from . import loudness  # NOQA
+from .session import CrepeInferenceSession  # NOQA
+from . import threshold  # NOQA

server/voice_changer/RVC/pitchExtractor/onnxcrepe/convert.py (new file, 57 lines)
@@ -0,0 +1,57 @@
+import numpy as np
+import scipy
+
+from voice_changer.RVC.pitchExtractor import onnxcrepe
+
+
+###############################################################################
+# Pitch unit conversions
+###############################################################################
+
+
+def bins_to_cents(bins, apply_dither=False):
+    """Converts pitch bins to cents"""
+    cents = onnxcrepe.CENTS_PER_BIN * bins + 1997.3794084376191
+
+    # Trade quantization error for noise (disabled by default)
+    return dither(cents) if apply_dither else cents
+
+
+def bins_to_frequency(bins, apply_dither=False):
+    """Converts pitch bins to frequency in Hz"""
+    return cents_to_frequency(bins_to_cents(bins, apply_dither=apply_dither))
+
+
+def cents_to_bins(cents, quantize_fn=np.floor):
+    """Converts cents to pitch bins"""
+    bins = (cents - 1997.3794084376191) / onnxcrepe.CENTS_PER_BIN
+    return quantize_fn(bins).astype(np.int64)
+
+
+def cents_to_frequency(cents):
+    """Converts cents to frequency in Hz"""
+    return 10 * 2 ** (cents / 1200)
+
+
+def frequency_to_bins(frequency, quantize_fn=np.floor):
+    """Convert frequency in Hz to pitch bins"""
+    return cents_to_bins(frequency_to_cents(frequency), quantize_fn)
+
+
+def frequency_to_cents(frequency):
+    """Convert frequency in Hz to cents"""
+    return 1200 * np.log2(frequency / 10.)
+
+
+###############################################################################
+# Utilities
+###############################################################################
+
+
+def dither(cents):
+    """Dither the predicted pitch in cents to remove quantization error"""
+    noise = scipy.stats.triang.rvs(c=0.5,
+                                   loc=-onnxcrepe.CENTS_PER_BIN,
+                                   scale=2 * onnxcrepe.CENTS_PER_BIN,
+                                   size=cents.shape)
+    return cents + noise.astype(cents.dtype)
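Note: these conversions fix CREPE's pitch grid: cents are measured relative to 10 Hz (f = 10 * 2**(cents/1200)) and bin 0 sits at 1997.3794 cents. A quick check that the grid's endpoints line up with the constants defined in core.py below (CENTS_PER_BIN = 20, PITCH_BINS = 360, MAX_FMAX = 2006 Hz):

    import numpy as np

    def bins_to_hz(bins):
        cents = 20 * bins + 1997.3794084376191
        return 10 * 2 ** (cents / 1200)

    print(bins_to_hz(np.array([0, 359])))  # ~[31.70, 2005.5] Hz; top bin ~= MAX_FMAX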
server/voice_changer/RVC/pitchExtractor/onnxcrepe/core.py (new file, 256 lines)
@@ -0,0 +1,256 @@
+import librosa
+import numpy as np
+
+from voice_changer.RVC.pitchExtractor import onnxcrepe
+
+__all__ = ['CENTS_PER_BIN',
+           'MAX_FMAX',
+           'PITCH_BINS',
+           'SAMPLE_RATE',
+           'WINDOW_SIZE',
+           'UNVOICED',
+           'predict',
+           'preprocess',
+           'infer',
+           'postprocess',
+           'resample']
+
+###############################################################################
+# Constants
+###############################################################################
+
+
+CENTS_PER_BIN = 20  # cents
+MAX_FMAX = 2006.  # hz
+PITCH_BINS = 360
+SAMPLE_RATE = 16000  # hz
+WINDOW_SIZE = 1024  # samples
+UNVOICED = np.nan
+
+
+###############################################################################
+# Crepe pitch prediction
+###############################################################################
+
+
+def predict(session,
+            audio,
+            sample_rate,
+            precision=None,
+            fmin=50.,
+            fmax=MAX_FMAX,
+            decoder=onnxcrepe.decode.weighted_viterbi,
+            return_periodicity=False,
+            batch_size=None,
+            pad=True):
+    """Performs pitch estimation
+
+    Arguments
+        session (onnxcrepe.CrepeInferenceSession)
+            An onnxruntime.InferenceSession holding the CREPE model
+        audio (numpy.ndarray [shape=(n_samples,)])
+            The audio signal
+        sample_rate (int)
+            The sampling rate in Hz
+        precision (float)
+            The precision in milliseconds, i.e. the length of each frame
+        fmin (float)
+            The minimum allowable frequency in Hz
+        fmax (float)
+            The maximum allowable frequency in Hz
+        decoder (function)
+            The decoder to use. See decode.py for decoders.
+        return_periodicity (bool)
+            Whether to also return the network confidence
+        batch_size (int)
+            The number of frames per batch
+        pad (bool)
+            Whether to zero-pad the audio
+
+    Returns
+        pitch (numpy.ndarray [shape=(1, 1 + int(time // precision))])
+        (Optional) periodicity (numpy.ndarray
+                                [shape=(1, 1 + int(time // precision))])
+    """
+
+    results = []
+
+    # Preprocess audio
+    generator = preprocess(audio,
+                           sample_rate,
+                           precision,
+                           batch_size,
+                           pad)
+    for frames in generator:
+
+        # Infer independent probabilities for each pitch bin
+        probabilities = infer(session, frames)  # shape=(batch, 360)
+
+        probabilities = probabilities.transpose(1, 0)[None]  # shape=(1, 360, batch)
+
+        # Convert probabilities to F0 and periodicity
+        result = postprocess(probabilities,
+                             fmin,
+                             fmax,
+                             decoder,
+                             return_periodicity)
+
+        # Place on same device as audio to allow very long inputs
+        if isinstance(result, tuple):
+            result = (result[0], result[1])
+
+        results.append(result)
+
+    # Split pitch and periodicity
+    if return_periodicity:
+        pitch, periodicity = zip(*results)
+        return np.concatenate(pitch, axis=1), np.concatenate(periodicity, axis=1)
+
+    # Concatenate
+    return np.concatenate(results, axis=1)
+
+
+def preprocess(audio,
+               sample_rate,
+               precision=None,
+               batch_size=None,
+               pad=True):
+    """Convert audio to model input
+
+    Arguments
+        audio (numpy.ndarray [shape=(time,)])
+            The audio signals
+        sample_rate (int)
+            The sampling rate in Hz
+        precision (float)
+            The precision in milliseconds, i.e. the length of each frame
+        batch_size (int)
+            The number of frames per batch
+        pad (bool)
+            Whether to zero-pad the audio
+
+    Returns
+        frames (numpy.ndarray [shape=(1 + int(time // precision), 1024)])
+    """
+    # Resample
+    if sample_rate != SAMPLE_RATE:
+        audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=SAMPLE_RATE)
+
+    # Default hop length of 10 ms
+    hop_length = SAMPLE_RATE / 100 if precision is None else SAMPLE_RATE * precision / 1000
+
+    # Get total number of frames
+
+    # Maybe pad
+    if pad:
+        total_frames = 1 + int(audio.shape[0] / hop_length)
+        audio = np.pad(
+            audio,
+            (WINDOW_SIZE // 2, WINDOW_SIZE // 2))
+    else:
+        total_frames = 1 + int((audio.shape[0] - WINDOW_SIZE) / hop_length)
+
+    # Default to running all frames in a single batch
+    batch_size = total_frames if batch_size is None else batch_size
+
+    # Generate batches
+    for i in range(0, total_frames, batch_size):
+        # Batch indices
+        start = max(0, int(i * hop_length))
+        end = min(audio.shape[0],
+                  int((i + batch_size - 1) * hop_length) + WINDOW_SIZE)
+
+        # Chunk
+        n_bytes = audio.strides[-1]
+        frames = np.lib.stride_tricks.as_strided(
+            audio[start:end],
+            shape=((end - start - WINDOW_SIZE) // int(hop_length) + 1, WINDOW_SIZE),
+            strides=(int(hop_length) * n_bytes, n_bytes))  # shape=(batch, 1024)
+
+        # Note:
+        # Z-score standardization operations originally located here
+        # (https://github.com/maxrmorrison/torchcrepe/blob/master/torchcrepe/core.py#L692)
+        # are wrapped into the ONNX models for hardware acceleration.
+
+        yield frames
+
+
+def infer(session, frames):
+    """Forward pass through the model
+
+    Arguments
+        session (onnxcrepe.CrepeInferenceSession)
+            An onnxruntime.InferenceSession holding the CREPE model
+        frames (numpy.ndarray [shape=(time / precision, 1024)])
+            The network input
+
+    Returns
+        logits (numpy.ndarray [shape=(1 + int(time // precision), 360)])
+    """
+    # Apply model
+    return session.run(None, {'frames': frames})[0]
+
+
+def postprocess(probabilities,
+                fmin=0.,
+                fmax=MAX_FMAX,
+                decoder=onnxcrepe.decode.weighted_viterbi,
+                return_periodicity=False):
+    """Convert model output to F0 and periodicity
+
+    Arguments
+        probabilities (numpy.ndarray [shape=(1, 360, time / precision)])
+            The probabilities for each pitch bin inferred by the network
+        fmin (float)
+            The minimum allowable frequency in Hz
+        fmax (float)
+            The maximum allowable frequency in Hz
+        decoder (function)
+            The decoder to use. See decode.py for decoders.
+        return_periodicity (bool)
+            Whether to also return the network confidence
+
+    Returns
+        pitch (numpy.ndarray [shape=(1, 1 + int(time // precision))])
+        periodicity (numpy.ndarray [shape=(1, 1 + int(time // precision))])
+    """
+    # Convert frequency range to pitch bin range
+    minidx = onnxcrepe.convert.frequency_to_bins(fmin)
+    maxidx = onnxcrepe.convert.frequency_to_bins(fmax, np.ceil)
+
+    # Remove frequencies outside allowable range
+    probabilities[:, :minidx] = float('-inf')
+    probabilities[:, maxidx:] = float('-inf')
+
+    # Perform argmax or viterbi sampling
+    bins, pitch = decoder(probabilities)
+
+    if not return_periodicity:
+        return pitch
+
+    # Compute periodicity from probabilities and decoded pitch bins
+    return pitch, periodicity(probabilities, bins)
+
+
+###############################################################################
+# Utilities
+###############################################################################
+
+
+def periodicity(probabilities, bins):
+    """Computes the periodicity from the network output and pitch bins"""
+    # shape=(time / precision, 360)
+    probs_stacked = probabilities.transpose(0, 2, 1).reshape(-1, PITCH_BINS)
+    # shape=(time / precision, 1)
+    bins_stacked = bins.reshape(-1, 1).astype(np.int64)
+
+    # Use maximum logit over pitch bins as periodicity
+    periodicity = np.take_along_axis(probs_stacked, bins_stacked, axis=1)
+
+    # shape=(batch, time / precision)
+    return periodicity.reshape(probabilities.shape[0], probabilities.shape[2])
+
+
+def resample(audio, sample_rate):
+    """Resample audio"""
+    return librosa.resample(audio, orig_sr=sample_rate, target_sr=onnxcrepe.SAMPLE_RATE)
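Note: predict() is the module's entry point: preprocess() slices the (resampled) audio into 1024-sample frames every `precision` ms, infer() runs the ONNX session on each batch, and postprocess() decodes the 360-bin probabilities into Hz. A usage sketch, assuming a CREPE ONNX file such as the full.onnx/tiny.onnx weights downloaded above:

    import numpy as np
    import onnxruntime
    from voice_changer.RVC.pitchExtractor import onnxcrepe

    session = onnxruntime.InferenceSession("full.onnx", providers=["CPUExecutionProvider"])
    audio = np.random.randn(16000 * 2).astype(np.float32)  # 2 s stand-in signal

    f0, pd = onnxcrepe.predict(
        session, audio, 16000,
        precision=10.0,                       # 10 ms hop -> ~201 frames
        return_periodicity=True,
        decoder=onnxcrepe.decode.weighted_argmax,
    )
    # f0.shape == pd.shape == (1, 201)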
server/voice_changer/RVC/pitchExtractor/onnxcrepe/decode.py (new file, 80 lines)
@@ -0,0 +1,80 @@
+import librosa
+import numpy as np
+
+from voice_changer.RVC.pitchExtractor import onnxcrepe
+
+###############################################################################
+# Probability sequence decoding methods
+###############################################################################
+
+
+def argmax(logits):
+    """Sample observations by taking the argmax"""
+    bins = logits.argmax(axis=1)
+
+    # Convert to frequency in Hz
+    return bins, onnxcrepe.convert.bins_to_frequency(bins)
+
+
+def weighted_argmax(logits: np.ndarray):
+    """Sample observations using weighted sum near the argmax"""
+    # Find center of analysis window
+    bins = logits.argmax(axis=1)
+
+    return bins, _apply_weights(logits, bins)
+
+
+def viterbi(logits):
+    """Sample observations using viterbi decoding"""
+    # Create viterbi transition matrix
+    if not hasattr(viterbi, 'transition'):
+        xx, yy = np.meshgrid(range(360), range(360))
+        transition = np.maximum(12 - abs(xx - yy), 0)
+        transition = transition / transition.sum(axis=1, keepdims=True)
+        viterbi.transition = transition
+
+    # Normalize logits (softmax)
+    logits -= logits.max(axis=1)
+    exp = np.exp(logits)
+    probs = exp / np.sum(exp, axis=1)
+
+    # Perform viterbi decoding
+    bins = np.array([
+        librosa.sequence.viterbi(sequence, viterbi.transition).astype(np.int64)
+        for sequence in probs])
+
+    # Convert to frequency in Hz
+    return bins, onnxcrepe.convert.bins_to_frequency(bins)
+
+
+def weighted_viterbi(logits):
+    """Sample observations combining viterbi decoding and weighted argmax"""
+    bins, _ = viterbi(logits)
+
+    return bins, _apply_weights(logits, bins)
+
+
+def _apply_weights(logits, bins):
+    # Find bounds of analysis window
+    start = np.maximum(0, bins - 4)
+    end = np.minimum(logits.shape[1], bins + 5)
+
+    # Mask out everything outside of window
+    for batch in range(logits.shape[0]):
+        for time in range(logits.shape[2]):
+            logits[batch, :start[batch, time], time] = float('-inf')
+            logits[batch, end[batch, time]:, time] = float('-inf')
+
+    # Construct weights
+    if not hasattr(_apply_weights, 'weights'):
+        weights = onnxcrepe.convert.bins_to_cents(np.arange(360))
+        _apply_weights.weights = weights[None, :, None]
+
+    # Convert to probabilities (ReLU)
+    probs = np.maximum(0, logits)
+
+    # Apply weights
+    cents = (_apply_weights.weights * probs).sum(axis=1) / probs.sum(axis=1)
+
+    # Convert to frequency in Hz
+    return onnxcrepe.convert.cents_to_frequency(cents)
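Note: the two "weighted" decoders refine the raw argmax with a local expectation: logits are masked to the 9 bins around the peak (argmax-4 .. argmax+4), rectified, and used as weights over the bins' cent values, giving sub-bin (sub-20-cent) pitch resolution. A toy illustration of the averaging _apply_weights performs on one frame:

    import numpy as np

    cents = 20 * np.arange(360) + 1997.3794084376191
    probs = np.zeros(360)
    probs[100], probs[101] = 1.0, 1.0   # peak mass split across two adjacent bins

    est = (cents * probs).sum() / probs.sum()
    # halfway between the bins, i.e. 10 cents above bin 100's center
    assert abs(est - (cents[100] + 10)) < 1e-6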
server/voice_changer/RVC/pitchExtractor/onnxcrepe/filter.py (new file, 125 lines)
@@ -0,0 +1,125 @@
+import numpy as np
+
+
+###############################################################################
+# Sequence filters
+###############################################################################
+
+
+def mean(signals, win_length=9):
+    """Average filtering for signals containing nan values
+
+    Arguments
+        signals (numpy.ndarray (shape=(batch, time)))
+            The signals to filter
+        win_length
+            The size of the analysis window
+
+    Returns
+        filtered (numpy.ndarray (shape=(batch, time)))
+    """
+    return nanfilter(signals, win_length, nanmean)
+
+
+def median(signals, win_length):
+    """Median filtering for signals containing nan values
+
+    Arguments
+        signals (numpy.ndarray (shape=(batch, time)))
+            The signals to filter
+        win_length
+            The size of the analysis window
+
+    Returns
+        filtered (numpy.ndarray (shape=(batch, time)))
+    """
+    return nanfilter(signals, win_length, nanmedian)
+
+
+###############################################################################
+# Utilities
+###############################################################################
+
+
+def nanfilter(signals, win_length, filter_fn):
+    """Filters a sequence, ignoring nan values
+
+    Arguments
+        signals (numpy.ndarray (shape=(batch, time)))
+            The signals to filter
+        win_length
+            The size of the analysis window
+        filter_fn (function)
+            The function to use for filtering
+
+    Returns
+        filtered (numpy.ndarray (shape=(batch, time)))
+    """
+    # Output buffer
+    filtered = np.empty_like(signals)
+
+    # Loop over frames
+    for i in range(signals.shape[1]):
+
+        # Get analysis window bounds
+        start = max(0, i - win_length // 2)
+        end = min(signals.shape[1], i + win_length // 2 + 1)
+
+        # Apply filter to window
+        filtered[:, i] = filter_fn(signals[:, start:end])
+
+    return filtered
+
+
+def nanmean(signals):
+    """Computes the mean, ignoring nans
+
+    Arguments
+        signals (numpy.ndarray [shape=(batch, time)])
+            The signals to filter
+
+    Returns
+        filtered (numpy.ndarray [shape=(batch, time)])
+    """
+    signals = signals.copy()  # numpy arrays use copy(), not torch's clone()
+
+    # Find nans
+    nans = np.isnan(signals)
+
+    # Set nans to 0.
+    signals[nans] = 0.
+
+    # Compute average
+    return signals.sum(axis=1) / (~nans).astype(np.float32).sum(axis=1)
+
+
+def nanmedian(signals):
+    """Computes the median, ignoring nans
+
+    Arguments
+        signals (numpy.ndarray [shape=(batch, time)])
+            The signals to filter
+
+    Returns
+        filtered (numpy.ndarray [shape=(batch, time)])
+    """
+    # Find nans
+    nans = np.isnan(signals)
+
+    # Compute median for each slice
+    medians = [nanmedian1d(signal[~nan]) for signal, nan in zip(signals, nans)]
+
+    # Stack results
+    return np.array(medians, dtype=signals.dtype)
+
+
+def nanmedian1d(signal):
+    """Computes the median. If signal is empty, returns np.nan
+
+    Arguments
+        signal (numpy.ndarray [shape=(time,)])
+
+    Returns
+        median (numpy.ndarray [shape=(1,)])
+    """
+    return np.median(signal) if signal.size else np.nan
server/voice_changer/RVC/pitchExtractor/onnxcrepe/load.py (new file, 12 lines)
@@ -0,0 +1,12 @@
+import librosa
+import numpy as np
+
+
+def audio(filename):
+    """Load audio from disk"""
+    samples, sr = librosa.load(filename, sr=None)
+    if len(samples.shape) > 1:
+        # To mono
+        samples = np.mean(samples, axis=1)
+
+    return samples, sr
server/voice_changer/RVC/pitchExtractor/onnxcrepe/loudness.py (new file, 73 lines)
@@ -0,0 +1,73 @@
+import warnings
+
+import librosa
+import numpy as np
+from voice_changer.RVC.pitchExtractor import onnxcrepe
+
+
+###############################################################################
+# Constants
+###############################################################################
+
+
+# Minimum decibel level
+MIN_DB = -100.
+
+# Reference decibel level
+REF_DB = 20.
+
+
+###############################################################################
+# A-weighted loudness
+###############################################################################
+
+
+def a_weighted(audio, sample_rate, hop_length=None, pad=True):
+    """Retrieve the per-frame loudness"""
+
+    # Default hop length of 10 ms
+    hop_length = sample_rate // 100 if hop_length is None else hop_length
+
+    # Convert to numpy
+    audio = audio.squeeze(0)
+
+    # Resample
+    if sample_rate != onnxcrepe.SAMPLE_RATE:
+        audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=onnxcrepe.SAMPLE_RATE)
+        hop_length = int(hop_length * onnxcrepe.SAMPLE_RATE / sample_rate)
+
+    # Cache weights
+    if not hasattr(a_weighted, 'weights'):
+        a_weighted.weights = perceptual_weights()
+
+    # Take stft
+    stft = librosa.stft(audio,
+                        n_fft=onnxcrepe.WINDOW_SIZE,
+                        hop_length=hop_length,
+                        win_length=onnxcrepe.WINDOW_SIZE,
+                        center=pad,
+                        pad_mode='constant')
+
+    # Compute magnitude on db scale
+    db = librosa.amplitude_to_db(np.abs(stft))
+
+    # Apply A-weighting
+    weighted = db + a_weighted.weights
+
+    # Threshold
+    weighted[weighted < MIN_DB] = MIN_DB
+
+    # Average over weighted frequencies
+    return weighted.mean(axis=0).astype(np.float32)[None]
+
+
+def perceptual_weights():
+    """A-weighted frequency-dependent perceptual loudness weights"""
+    frequencies = librosa.fft_frequencies(sr=onnxcrepe.SAMPLE_RATE,
+                                          n_fft=onnxcrepe.WINDOW_SIZE)
+
+    # A warning is raised for nearly inaudible frequencies, but it ends up
+    # defaulting to -100 db. That default is fine for our purposes.
+    with warnings.catch_warnings():
+        warnings.simplefilter('ignore', RuntimeWarning)
+        return librosa.A_weighting(frequencies)[:, None] - REF_DB
onnxcrepe folder note (new file)
@@ -0,0 +1 @@
+modules in this folder from https://github.com/yqzhishen/onnxcrepe at ca7e5d7f2dfca5cc4d99e8d546b00793ca4e7157
server/voice_changer/RVC/pitchExtractor/onnxcrepe/session.py (new file, 9 lines)
@@ -0,0 +1,9 @@
+import os
+
+import onnxruntime as ort
+
+
+class CrepeInferenceSession(ort.InferenceSession):
+    def __init__(self, model='full', sess_options=None, providers=None, provider_options=None, **kwargs):
+        model_path = os.path.join(os.path.dirname(__file__), 'assets', f'{model}.onnx')
+        super().__init__(model_path, sess_options, providers, provider_options, **kwargs)
server/voice_changer/RVC/pitchExtractor/onnxcrepe/threshold.py (new file, 129 lines)
@@ -0,0 +1,129 @@
+import numpy as np
+
+from voice_changer.RVC.pitchExtractor import onnxcrepe
+
+
+###############################################################################
+# Pitch thresholding methods
+###############################################################################
+
+
+class At:
+    """Simple thresholding at a specified probability value"""
+
+    def __init__(self, value):
+        self.value = value
+
+    def __call__(self, pitch, periodicity):
+        # Make a copy to prevent in-place modification
+        pitch = pitch.copy()
+
+        # Threshold
+        pitch[periodicity < self.value] = onnxcrepe.UNVOICED
+        return pitch
+
+
+class Hysteresis:
+    """Hysteresis thresholding"""
+
+    def __init__(self,
+                 lower_bound=.19,
+                 upper_bound=.31,
+                 width=.2,
+                 stds=1.7,
+                 return_threshold=False):
+        self.lower_bound = lower_bound
+        self.upper_bound = upper_bound
+        self.width = width
+        self.stds = stds
+        self.return_threshold = return_threshold
+
+    def __call__(self, pitch, periodicity):
+
+        # Perform hysteresis in log-2 space
+        pitch = np.log2(pitch).flatten()
+
+        # Flatten periodicity
+        periodicity = periodicity.flatten()
+
+        # Ignore confidently unvoiced pitch
+        pitch[periodicity < self.lower_bound] = onnxcrepe.UNVOICED
+
+        # Whiten pitch
+        mean, std = np.nanmean(pitch), np.nanstd(pitch)
+        pitch = (pitch - mean) / std
+
+        # Require high confidence to make predictions far from the mean
+        parabola = self.width * pitch ** 2 - self.width * self.stds ** 2
+        threshold = self.lower_bound + np.clip(parabola, 0, 1 - self.lower_bound)
+        threshold[np.isnan(threshold)] = self.lower_bound
+
+        # Apply hysteresis to prevent short, unconfident voiced regions
+        i = 0
+        while i < len(periodicity) - 1:
+
+            # Detect unvoiced to voiced transition
+            if periodicity[i] < threshold[i] and periodicity[i + 1] > threshold[i + 1]:
+
+                # Grow region until next unvoiced or end of array
+                start, end, keep = i + 1, i + 1, False
+                while end < len(periodicity) and periodicity[end] > threshold[end]:
+                    if periodicity[end] > self.upper_bound:
+                        keep = True
+                    end += 1
+
+                # Force unvoiced if we didn't pass the confidence required by
+                # the hysteresis
+                if not keep:
+                    threshold[start:end] = 1
+
+                i = end
+
+            else:
+                i += 1
+
+        # Remove pitch with low periodicity
+        pitch[periodicity < threshold] = onnxcrepe.UNVOICED
+
+        # Unwhiten
+        pitch = pitch * std + mean
+
+        # Convert to Hz
+        pitch = np.array(2 ** pitch)[None, :]
+
+        # Optionally return threshold
+        if self.return_threshold:
+            return pitch, np.array(threshold)
+
+        return pitch
+
+
+###############################################################################
+# Periodicity thresholding methods
+###############################################################################
+
+
+class Silence:
+    """Set periodicity to zero in silent regions"""
+
+    def __init__(self, value=-60):
+        self.value = value
+
+    def __call__(self,
+                 periodicity,
+                 audio,
+                 sample_rate=onnxcrepe.SAMPLE_RATE,
+                 precision=None,
+                 pad=True):
+        # Don't modify in-place
+        periodicity = periodicity.copy()
+
+        # Compute loudness
+        hop_length = sample_rate * precision // 1000
+        loudness = onnxcrepe.loudness.a_weighted(
+            audio, sample_rate, hop_length, pad)
+
+        # Threshold silence
+        periodicity[loudness < self.value] = 0.
+
+        return periodicity
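Note: these thresholds are optional post-processing hooks over predict()'s outputs: At() blanks pitch frames whose periodicity falls below a fixed confidence, Hysteresis() additionally suppresses short low-confidence voiced islands, and Silence() zeroes periodicity in quiet regions using the A-weighted loudness above. CrepeOnnxPitchExtractor applies a fixed 0.1 cutoff inline instead, but the equivalent call here would be:

    f0, pd = onnxcrepe.predict(session, audio, 16000, return_periodicity=True)
    f0_voiced = onnxcrepe.threshold.At(0.1)(f0, pd)  # unvoiced frames become NaN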
@@ -12,3 +12,5 @@ class VoiceChangerParams:
     hubert_soft: str
     nsf_hifigan: str
     sample_mode: str
+    crepe_onnx_full: str
+    crepe_onnx_tiny: str