New Feature:
- Add Crepe Full/Tiny (onnx)
- Remove test connect for local

Refactor:
- RVC: comment out module importer

commit d5561c2212 (parent 099f82cc60)
@@ -21,7 +21,7 @@
 {
     "name": "configArea",
     "options": {
-        "detectors": ["dio", "harvest", "crepe"],
+        "detectors": ["dio", "harvest", "crepe", "crepe_full", "crepe_tiny"],
         "inputChunkNums": [8, 16, 24, 32, 40, 48, 64, 80, 96, 112, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 2048]
     }
 }

client/demo/dist/index.html (vendored, 11 lines changed)
@@ -1 +1,10 @@
-<!doctype html><html style="width:100%;height:100%;overflow:hidden"><head><meta charset="utf-8"/><title>Voice Changer Client Demo</title><script defer="defer" src="index.js"></script></head><body style="width:100%;height:100%;margin:0"><div id="app" style="width:100%;height:100%"></div></body></html>
+<!DOCTYPE html>
+<html style="width: 100%; height: 100%; overflow: hidden">
+    <head>
+        <meta charset="utf-8" />
+        <title>Voice Changer Client Demo</title>
+        <script defer src="index.js"></script></head>
+    <body style="width: 100%; height: 100%; margin: 0px">
+        <div id="app" style="width: 100%; height: 100%"></div>
+    </body>
+</html>

client/demo/dist/index.js (vendored, 1125 lines changed)
File diff suppressed because one or more lines are too long

client/demo/dist/index.js.LICENSE.txt (vendored, 31 lines changed)
@@ -1,31 +0,0 @@
-/*! regenerator-runtime -- Copyright (c) 2014-present, Facebook, Inc. -- license (MIT): https://github.com/facebook/regenerator/blob/main/LICENSE */
-
-/**
- * @license React
- * react-dom.production.min.js
- *
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-/**
- * @license React
- * react.production.min.js
- *
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-/**
- * @license React
- * scheduler.production.min.js
- *
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
@@ -21,7 +21,7 @@
 {
     "name": "configArea",
     "options": {
-        "detectors": ["dio", "harvest", "crepe"],
+        "detectors": ["dio", "harvest", "crepe", "crepe_full", "crepe_tiny"],
         "inputChunkNums": [8, 16, 24, 32, 40, 48, 64, 80, 96, 112, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 2048]
     }
 }
@@ -42,8 +42,9 @@ export type CrossFadeOverlapSize = typeof CrossFadeOverlapSize[keyof typeof CrossFadeOverlapSize]
 export const F0Detector = {
     "dio": "dio",
     "harvest": "harvest",
-    // "parselmouth": "parselmouth",
     "crepe": "crepe",
+    "crepe_full": "crepe_full",
+    "crepe_tiny": "crepe_tiny",
 } as const
 export type F0Detector = typeof F0Detector[keyof typeof F0Detector]
@@ -34,6 +34,7 @@ def setupArgParser():
     parser.add_argument("--logLevel", type=str, default="error", help="Log level info|critical|error. (default: error)")
     parser.add_argument("-p", type=int, default=18888, help="port")
     parser.add_argument("--https", type=strtobool, default=False, help="use https")
+    parser.add_argument("--test_connect", type=str, default="8.8.8.8", help="test connect to detect ip in https mode. default 8.8.8.8")
     parser.add_argument("--httpsKey", type=str, default="ssl.key", help="path for the key of https")
     parser.add_argument("--httpsCert", type=str, default="ssl.cert", help="path for the cert of https")
     parser.add_argument("--httpsSelfSigned", type=strtobool, default=True, help="generate self-signed certificate")
@@ -48,6 +49,8 @@ def setupArgParser():
     parser.add_argument("--hubert_base_jp", type=str, help="path to hubert_base_jp model(pytorch)")
     parser.add_argument("--hubert_soft", type=str, help="path to hubert_soft model(pytorch)")
     parser.add_argument("--nsf_hifigan", type=str, help="path to nsf_hifigan model(pytorch)")
+    parser.add_argument("--crepe_onnx_full", type=str, help="path to crepe_onnx_full")
+    parser.add_argument("--crepe_onnx_tiny", type=str, help="path to crepe_onnx_tiny")

     return parser
@@ -85,6 +88,9 @@ voiceChangerParams = VoiceChangerParams(
     hubert_base_jp=args.hubert_base_jp,
     hubert_soft=args.hubert_soft,
     nsf_hifigan=args.nsf_hifigan,
+    crepe_onnx_full=args.crepe_onnx_full,
+    crepe_onnx_tiny=args.crepe_onnx_tiny,
+
     sample_mode=args.sample_mode,
 )
@@ -120,6 +126,7 @@ if __name__ == "__mp_main__":
 if __name__ == "__main__":
     mp.freeze_support()

+    printMessage(f"PYTHON:{sys.version}", level=2)
     printMessage("Voice Changerを起動しています。", level=2)  # "Starting Voice Changer."
     # Download (weights)
     try:
@@ -195,10 +202,10 @@ if __name__ == "__main__":
         else:
             printMessage(f"http://localhost:{EX_PORT}/", level=1)
     else:  # launched directly with python
-        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
-        s.connect(("8.8.8.8", 80))
-        hostname = s.getsockname()[0]
         if args.https == 1:
+            s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+            s.connect((args.test_connect, 80))
+            hostname = s.getsockname()[0]
             printMessage(f"https://localhost:{PORT}/", level=1)
             printMessage(f"https://{hostname}:{PORT}/", level=1)
         else:
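Note: the hunk above makes the LAN-IP probe run only in https mode and routes it through the new --test_connect flag. Connecting a UDP socket sends no packets; it only asks the OS to choose the outbound interface for the given target, so getsockname() returns the local address a peer on that route would see. A minimal standalone sketch of the same technique (the target address is just a routing hint, 8.8.8.8 by default here):

    import socket

    def detect_local_ip(test_connect: str = "8.8.8.8") -> str:
        # UDP connect() performs no handshake; it only selects a route.
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            s.connect((test_connect, 80))
            return s.getsockname()[0]  # address of the chosen local interface
        finally:
            s.close()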
@@ -73,15 +73,13 @@ class EnumInferenceTypes(Enum):
     onnxRVCNono = "onnxRVCNono"


-class EnumPitchExtractorTypes(Enum):
-    harvest = "harvest"
-    dio = "dio"
-    crepe = "crepe"
-
-
-class EnumFrameworkTypes(Enum):
-    pyTorch = "pyTorch"
-    onnx = "onnx"
+PitchExtractorType: TypeAlias = Literal[
+    "harvest",
+    "dio",
+    "crepe",
+    "crepe_full",
+    "crepe_tiny",
+]


 class ServerAudioDeviceTypes(Enum):
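Note: moving from EnumPitchExtractorTypes to a Literal alias means the plain strings arriving from the client config ("crepe_tiny", etc.) are themselves valid values, so the Enum/raw-string double checks disappear (compare the simplified branches in PitchExtractorManager further down). A small illustration of the narrowing, assuming const.py carries the usual typing imports:

    from typing import Literal, TypeAlias

    PitchExtractorType: TypeAlias = Literal[
        "harvest", "dio", "crepe", "crepe_full", "crepe_tiny"
    ]

    def describe(t: PitchExtractorType) -> str:
        return f"pitch extractor: {t}"

    describe("crepe_full")   # accepted by a type checker
    describe("parselmouth")  # flagged statically; still a plain str at runtime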
@@ -11,6 +11,8 @@ def downloadWeight(voiceChangerParams: VoiceChangerParams):
     hubert_base_jp = voiceChangerParams.hubert_base_jp
     hubert_soft = voiceChangerParams.hubert_soft
     nsf_hifigan = voiceChangerParams.nsf_hifigan
+    crepe_onnx_full = voiceChangerParams.crepe_onnx_full
+    crepe_onnx_tiny = voiceChangerParams.crepe_onnx_tiny

     # file exists check (currently only for rvc)
     downloadParams = []
@@ -57,6 +59,24 @@ def downloadWeight(voiceChangerParams: VoiceChangerParams):
             }
         )

+    if os.path.exists(crepe_onnx_full) is False:
+        downloadParams.append(
+            {
+                "url": "https://huggingface.co/wok000/weights/resolve/main/crepe/onnx/full.onnx",
+                "saveTo": crepe_onnx_full,
+                "position": 5,
+            }
+        )
+
+    if os.path.exists(crepe_onnx_tiny) is False:
+        downloadParams.append(
+            {
+                "url": "https://huggingface.co/wok000/weights/resolve/main/crepe/onnx/tiny.onnx",
+                "saveTo": crepe_onnx_tiny,
+                "position": 6,
+            }
+        )
+
     with ThreadPoolExecutor() as pool:
         pool.map(download, downloadParams)
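Note: the repo's actual download() helper is not part of this diff; only the shape of its input ({"url", "saveTo", "position"}) is visible above. A hypothetical minimal stand-in honoring the same keys ("position" presumably selects a progress-bar slot and is ignored here):

    import os
    import urllib.request

    def download(params: dict) -> None:
        # Fetch params["url"] and store it at params["saveTo"].
        os.makedirs(os.path.dirname(params["saveTo"]), exist_ok=True)
        urllib.request.urlretrieve(params["url"], params["saveTo"])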
@@ -300,5 +300,5 @@ class MMVCv15:
                 if file_path.find(remove_path + os.path.sep) >= 0:
                     # print("remove", key, file_path)
                     sys.modules.pop(key)
-            except:  # type:ignore
+            except:  # NOQA
                 pass
@@ -1,5 +1,5 @@
-import sys
-import os
+# import sys
+# import os
 from dataclasses import asdict
 import numpy as np
 import torch
@@ -7,18 +7,18 @@ import torchaudio
 from data.ModelSlot import RVCModelSlot


-# avoiding parse arg error in RVC
-sys.argv = ["MMVCServerSIO.py"]
+# # avoiding parse arg error in RVC
+# sys.argv = ["MMVCServerSIO.py"]

-if sys.platform.startswith("darwin"):
-    baseDir = [x for x in sys.path if x.endswith("Contents/MacOS")]
-    if len(baseDir) != 1:
-        print("baseDir should be only one ", baseDir)
-        sys.exit()
-    modulePath = os.path.join(baseDir[0], "RVC")
-    sys.path.append(modulePath)
-else:
-    sys.path.append("RVC")
+# if sys.platform.startswith("darwin"):
+#     baseDir = [x for x in sys.path if x.endswith("Contents/MacOS")]
+#     if len(baseDir) != 1:
+#         print("baseDir should be only one ", baseDir)
+#         sys.exit()
+#     modulePath = os.path.join(baseDir[0], "RVC")
+#     sys.path.append(modulePath)
+# else:
+#     sys.path.append("RVC")


 from voice_changer.RVC.RVCSettings import RVCSettings
@@ -39,9 +39,10 @@ class RVC(VoiceChangerModel):
         print("[Voice Changer] [RVC] Creating instance ")
         self.deviceManager = DeviceManager.get_instance()
         EmbedderManager.initialize(params)
+        PitchExtractorManager.initialize(params)
         self.settings = RVCSettings()
         self.params = params
-        self.pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector)
+        self.pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu)

         self.pipeline: Pipeline | None = None
@@ -76,7 +77,7 @@ class RVC(VoiceChangerModel):
         elif key in self.settings.strData:
             setattr(self.settings, key, str(val))
             if key == "f0Detector" and self.pipeline is not None:
-                pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector)
+                pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu)
                 self.pipeline.setPitchExtractor(pitchExtractor)
         else:
             return False
@@ -112,7 +113,7 @@ class RVC(VoiceChangerModel):
             self.audio_buffer = newData
         if self.slotInfo.f0:
             self.pitchf_buffer = np.zeros(new_feature_length)
         self.feature_buffer = np.zeros([new_feature_length, self.slotInfo.embChannels])

         convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
@@ -201,21 +202,21 @@ class RVC(VoiceChangerModel):
     def __del__(self):
         del self.pipeline

-        print("---------- REMOVING ---------------")
+        # print("---------- REMOVING ---------------")

-        remove_path = os.path.join("RVC")
-        sys.path = [x for x in sys.path if x.endswith(remove_path) is False]
+        # remove_path = os.path.join("RVC")
+        # sys.path = [x for x in sys.path if x.endswith(remove_path) is False]

-        for key in list(sys.modules):
-            val = sys.modules.get(key)
-            try:
-                file_path = val.__file__
-                if file_path.find("RVC" + os.path.sep) >= 0:
-                    # print("remove", key, file_path)
-                    sys.modules.pop(key)
-            except Exception:  # type:ignore
-                # print(e)
-                pass
+        # for key in list(sys.modules):
+        #     val = sys.modules.get(key)
+        #     try:
+        #         file_path = val.__file__
+        #         if file_path.find("RVC" + os.path.sep) >= 0:
+        #             # print("remove", key, file_path)
+        #             sys.modules.pop(key)
+        #     except Exception:  # type:ignore
+        #         # print(e)
+        #         pass

     def export2onnx(self):
         modelSlot = self.slotInfo
@@ -51,7 +51,7 @@ class OnnxRVCInferencer(Inferencer):
                 "p_len": pitch_length.cpu().numpy().astype(np.int64),
                 "pitch": pitch.cpu().numpy().astype(np.int64),
                 "pitchf": pitchf.cpu().numpy().astype(np.float32),
                 "sid": sid.cpu().numpy().astype(np.int64)
             },
         )
     else:
@@ -62,7 +62,7 @@ class OnnxRVCInferencer(Inferencer):
                 "p_len": pitch_length.cpu().numpy().astype(np.int64),
                 "pitch": pitch.cpu().numpy().astype(np.int64),
                 "pitchf": pitchf.cpu().numpy().astype(np.float32),
                 "sid": sid.cpu().numpy().astype(np.int64)
             },
         )
@@ -4,6 +4,7 @@ from const import EnumInferenceTypes

 from voice_changer.RVC.inferencer.OnnxRVCInferencer import OnnxRVCInferencer

+
 class OnnxRVCInferencerNono(OnnxRVCInferencer):
     def loadModel(self, file: str, gpu: int):
         super().loadModel(file, gpu)
@@ -73,9 +73,9 @@ class Pipeline(object):
     def exec(
         self,
        sid,
        audio,  # torch.tensor [n]
        pitchf,  # np.array [m]
        feature,  # np.array [m, feat]
        f0_up_key,
        index_rate,
        if_f0,
@@ -208,13 +208,12 @@ class Pipeline(object):
             # apply silent front for inference
             if type(self.inferencer) in [OnnxRVCInferencer, OnnxRVCInferencerNono]:
                 npyOffset = math.floor(silence_front * 16000) // 360
-                feats = feats[:, npyOffset * 2 :, :]
+                feats = feats[:, npyOffset * 2 :, :]  # NOQA
             feats_len = feats.shape[1]
             if pitch is not None and pitchf is not None:
                 pitch = pitch[:, -feats_len:]
                 pitchf = pitchf[:, -feats_len:]
             p_len = torch.tensor([feats_len], device=self.device).long()

-
             # Run inference
             try:
@@ -34,7 +34,7 @@ def createPipeline(modelSlot: RVCModelSlot, gpu: int, f0Detector: str):
         traceback.print_exc()

     # pitchExtractor
-    pitchExtractor = PitchExtractorManager.getPitchExtractor(f0Detector)
+    pitchExtractor = PitchExtractorManager.getPitchExtractor(f0Detector, gpu)

     # index, feature
     index = _loadIndex(modelSlot)
@@ -67,7 +67,7 @@ def _loadIndex(modelSlot: RVCModelSlot):
     try:
         print("Try loading...", modelSlot.indexFile)
         index = faiss.read_index(modelSlot.indexFile)
-    except:
+    except:  # NOQA
         print("[Voice Changer] load index failed. Use no index.")
         traceback.print_exc()
         return None
server/voice_changer/RVC/pitchExtractor/CrepeOnnxPitchExtractor.py (new file)
@@ -0,0 +1,68 @@
+import numpy as np
+from const import PitchExtractorType
+from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
+from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
+import onnxruntime
+from voice_changer.RVC.pitchExtractor import onnxcrepe
+
+
+class CrepeOnnxPitchExtractor(PitchExtractor):
+
+    def __init__(self, pitchExtractorType: PitchExtractorType, file: str, gpu: int):
+        self.pitchExtractorType = pitchExtractorType
+        super().__init__()
+        (
+            onnxProviders,
+            onnxProviderOptions,
+        ) = DeviceManager.get_instance().getOnnxExecutionProvider(gpu)
+
+        self.onnx_session = onnxruntime.InferenceSession(
+            file, providers=onnxProviders, provider_options=onnxProviderOptions
+        )
+
+    def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
+        n_frames = int(len(audio) // window) + 1
+        start_frame = int(silence_front * sr / window)
+        real_silence_front = start_frame * window / sr
+
+        silence_front_offset = int(np.round(real_silence_front * sr))
+        audio = audio[silence_front_offset:]
+
+        f0_min = 50
+        f0_max = 1100
+        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+
+        precision = 10.0
+
+        audio_num = audio.cpu()
+        onnx_f0, onnx_pd = onnxcrepe.predict(
+            self.onnx_session,
+            audio_num,
+            sr,
+            precision=precision,
+            fmin=f0_min,
+            fmax=f0_max,
+            batch_size=256,
+            return_periodicity=True,
+            decoder=onnxcrepe.decode.weighted_argmax,
+        )
+
+        f0 = onnxcrepe.filter.median(onnx_f0, 3)
+        pd = onnxcrepe.filter.median(onnx_pd, 3)
+
+        f0[pd < 0.1] = 0
+        f0 = f0.squeeze()
+
+        f0 = np.pad(f0, (start_frame, n_frames - f0.shape[0] - start_frame), 'constant', constant_values=(0, 0))
+
+        f0 *= pow(2, f0_up_key / 12)
+        pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]]
+        f0bak = pitchf.copy()
+        f0_mel = 1127.0 * np.log(1.0 + f0bak / 700.0)
+        f0_mel = np.clip(
+            (f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0
+        )
+        pitch_coarse = f0_mel.astype(int)
+
+        return pitch_coarse, pitchf
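Note: the tail of extract() maps f0 in Hz onto 255 coarse mel-spaced bins, the pitch representation the RVC pipeline consumes. The mapping is mel = 1127 * ln(1 + f/700), rescaled so f0_min..f0_max spans bins 1..255 (f0 = 0 clips to bin 1, effectively "unvoiced"). A worked check of the same arithmetic:

    import numpy as np

    f0_min, f0_max = 50, 1100
    mel_min = 1127 * np.log(1 + f0_min / 700)   # ~77.75
    mel_max = 1127 * np.log(1 + f0_max / 700)   # ~1064.43

    f0 = 440.0                                  # A4
    mel = 1127 * np.log(1 + f0 / 700)           # ~549.64
    coarse = int(np.clip((mel - mel_min) * 254 / (mel_max - mel_min) + 1, 1, 255))
    assert coarse == 122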
@@ -1,16 +1,16 @@
 import torchcrepe
 import torch
 import numpy as np
-from const import EnumPitchExtractorTypes
+from const import PitchExtractorType

 from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor


 class CrepePitchExtractor(PitchExtractor):
-    pitchExtractorType: EnumPitchExtractorTypes = EnumPitchExtractorTypes.crepe
-
     def __init__(self):
         super().__init__()
+        self.pitchExtractorType: PitchExtractorType = "crepe"
         if torch.cuda.is_available():
             self.device = torch.device("cuda:" + str(torch.cuda.current_device()))
         else:
@@ -1,16 +1,19 @@
 import pyworld
 import numpy as np
-from const import EnumPitchExtractorTypes
+from const import PitchExtractorType

 from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor


 class DioPitchExtractor(PitchExtractor):
-    pitchExtractorType: EnumPitchExtractorTypes = EnumPitchExtractorTypes.dio
+    def __init__(self):
+        super().__init__()
+        self.pitchExtractorType: PitchExtractorType = "dio"

     def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
         audio = audio.detach().cpu().numpy()
-        n_frames = int(len(audio) // window) + 1
+        n_frames = int(len(audio) // window) + 1  # NOQA
         start_frame = int(silence_front * sr / window)
         real_silence_front = start_frame * window / sr
@@ -31,7 +34,7 @@ class DioPitchExtractor(PitchExtractor):
             frame_period=10,
         )
         f0 = pyworld.stonemask(audio.astype(np.double), _f0, t, sr)
         # f0 = np.pad(f0.astype("float"), (start_frame, n_frames - len(f0) - start_frame))

         f0 *= pow(2, f0_up_key / 12)
         pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]]
@@ -45,4 +48,3 @@ class DioPitchExtractor(PitchExtractor):
         pitch_coarse = np.rint(f0_mel).astype(int)

         return pitch_coarse, pitchf
-
@@ -1,17 +1,20 @@
 import pyworld
 import numpy as np
 import scipy.signal as signal
-from const import EnumPitchExtractorTypes
+from const import PitchExtractorType

 from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor


 class HarvestPitchExtractor(PitchExtractor):
-    pitchExtractorType: EnumPitchExtractorTypes = EnumPitchExtractorTypes.harvest
+    def __init__(self):
+        super().__init__()
+        self.pitchExtractorType: PitchExtractorType = "harvest"

     def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
         audio = audio.detach().cpu().numpy()
-        n_frames = int(len(audio) // window) + 1
+        n_frames = int(len(audio) // window) + 1  # NOQA
         start_frame = int(silence_front * sr / window)
         real_silence_front = start_frame * window / sr
@@ -1,14 +1,12 @@
 from typing import Protocol
-from const import EnumPitchExtractorTypes


 class PitchExtractor(Protocol):
-    pitchExtractorType: EnumPitchExtractorTypes = EnumPitchExtractorTypes.harvest
-
     def extract(self, audio, f0_up_key, sr, window, silence_front=0):
         ...

     def getPitchExtractorInfo(self):
         return {
-            "pitchExtractorType": self.pitchExtractorType.value,
+            "pitchExtractorType": self.pitchExtractorType,
         }
@@ -1,40 +1,42 @@
 from typing import Protocol
-from const import EnumPitchExtractorTypes
+from const import PitchExtractorType
+from voice_changer.RVC.pitchExtractor.CrepeOnnxPitchExtractor import CrepeOnnxPitchExtractor
 from voice_changer.RVC.pitchExtractor.DioPitchExtractor import DioPitchExtractor
 from voice_changer.RVC.pitchExtractor.HarvestPitchExtractor import HarvestPitchExtractor
 from voice_changer.RVC.pitchExtractor.CrepePitchExtractor import CrepePitchExtractor
 from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
+from voice_changer.utils.VoiceChangerParams import VoiceChangerParams


 class PitchExtractorManager(Protocol):
     currentPitchExtractor: PitchExtractor | None = None
+    params: VoiceChangerParams
+
+    @classmethod
+    def initialize(cls, params: VoiceChangerParams):
+        cls.params = params

     @classmethod
     def getPitchExtractor(
-        cls, pitchExtractorType: EnumPitchExtractorTypes
+        cls, pitchExtractorType: PitchExtractorType, gpu: int
     ) -> PitchExtractor:
-        cls.currentPitchExtractor = cls.loadPitchExtractor(pitchExtractorType)
+        cls.currentPitchExtractor = cls.loadPitchExtractor(pitchExtractorType, gpu)
         return cls.currentPitchExtractor

     @classmethod
     def loadPitchExtractor(
-        cls, pitchExtractorType: EnumPitchExtractorTypes
+        cls, pitchExtractorType: PitchExtractorType, gpu: int
     ) -> PitchExtractor:
-        if (
-            pitchExtractorType == EnumPitchExtractorTypes.harvest
-            or pitchExtractorType == EnumPitchExtractorTypes.harvest.value
-        ):
+        if pitchExtractorType == "harvest":
             return HarvestPitchExtractor()
-        elif (
-            pitchExtractorType == EnumPitchExtractorTypes.dio
-            or pitchExtractorType == EnumPitchExtractorTypes.dio.value
-        ):
+        elif pitchExtractorType == "dio":
             return DioPitchExtractor()
-        elif (
-            pitchExtractorType == EnumPitchExtractorTypes.crepe
-            or pitchExtractorType == EnumPitchExtractorTypes.crepe.value
-        ):
+        elif pitchExtractorType == "crepe":
             return CrepePitchExtractor()
+        elif pitchExtractorType == "crepe_tiny":
+            return CrepeOnnxPitchExtractor(pitchExtractorType, cls.params.crepe_onnx_tiny, gpu)
+        elif pitchExtractorType == "crepe_full":
+            return CrepeOnnxPitchExtractor(pitchExtractorType, cls.params.crepe_onnx_full, gpu)
         else:
             # return hubert as default
             raise RuntimeError(
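Note: taken together, the manager changes give a two-step API: initialize() stashes the model paths once, and each later getPitchExtractor() call can build the ONNX-backed extractors from them. A usage sketch with an assumed VoiceChangerParams instance:

    # params: a VoiceChangerParams with crepe_onnx_full / crepe_onnx_tiny set
    PitchExtractorManager.initialize(params)
    extractor = PitchExtractorManager.getPitchExtractor("crepe_tiny", gpu=0)
    # "harvest" / "dio" / "crepe" need no model file and ignore the stored params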
server/voice_changer/RVC/pitchExtractor/onnxcrepe/__init__.py (new file)
@@ -0,0 +1,8 @@
+from . import decode  # NOQA
+from .core import *  # NOQA
+from . import convert  # NOQA
+from . import filter  # NOQA
+from . import load  # NOQA
+from . import loudness  # NOQA
+from .session import CrepeInferenceSession  # NOQA
+from . import threshold  # NOQA

server/voice_changer/RVC/pitchExtractor/onnxcrepe/convert.py (new file, 57 lines)
@@ -0,0 +1,57 @@
+import numpy as np
+import scipy
+
+from voice_changer.RVC.pitchExtractor import onnxcrepe
+
+
+###############################################################################
+# Pitch unit conversions
+###############################################################################
+
+
+def bins_to_cents(bins, apply_dither=False):
+    """Converts pitch bins to cents"""
+    cents = onnxcrepe.CENTS_PER_BIN * bins + 1997.3794084376191
+
+    # Trade quantization error for noise (disabled by default)
+    return dither(cents) if apply_dither else cents
+
+
+def bins_to_frequency(bins, apply_dither=False):
+    """Converts pitch bins to frequency in Hz"""
+    return cents_to_frequency(bins_to_cents(bins, apply_dither=apply_dither))
+
+
+def cents_to_bins(cents, quantize_fn=np.floor):
+    """Converts cents to pitch bins"""
+    bins = (cents - 1997.3794084376191) / onnxcrepe.CENTS_PER_BIN
+    return quantize_fn(bins).astype(np.int64)
+
+
+def cents_to_frequency(cents):
+    """Converts cents to frequency in Hz"""
+    return 10 * 2 ** (cents / 1200)
+
+
+def frequency_to_bins(frequency, quantize_fn=np.floor):
+    """Convert frequency in Hz to pitch bins"""
+    return cents_to_bins(frequency_to_cents(frequency), quantize_fn)
+
+
+def frequency_to_cents(frequency):
+    """Convert frequency in Hz to cents"""
+    return 1200 * np.log2(frequency / 10.)
+
+
+###############################################################################
+# Utilities
+###############################################################################
+
+
+def dither(cents):
+    """Dither the predicted pitch in cents to remove quantization error"""
+    noise = scipy.stats.triang.rvs(c=0.5,
+                                   loc=-onnxcrepe.CENTS_PER_BIN,
+                                   scale=2 * onnxcrepe.CENTS_PER_BIN,
+                                   size=cents.shape)
+    return cents + noise.astype(cents.dtype)
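Note: these conversions fix CREPE's pitch grid: cents are measured relative to 10 Hz (f = 10 * 2**(cents/1200)) and bin 0 sits at 1997.3794 cents. A quick check that the grid's endpoints line up with the constants defined in core.py below (CENTS_PER_BIN = 20, PITCH_BINS = 360, MAX_FMAX = 2006 Hz):

    import numpy as np

    def bins_to_hz(bins):
        cents = 20 * bins + 1997.3794084376191
        return 10 * 2 ** (cents / 1200)

    print(bins_to_hz(np.array([0, 359])))  # ~[31.70, 2005.5] Hz; top bin ~= MAX_FMAX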
server/voice_changer/RVC/pitchExtractor/onnxcrepe/core.py (new file, 256 lines)
@@ -0,0 +1,256 @@
+import librosa
+import numpy as np
+
+from voice_changer.RVC.pitchExtractor import onnxcrepe
+
+__all__ = ['CENTS_PER_BIN',
+           'MAX_FMAX',
+           'PITCH_BINS',
+           'SAMPLE_RATE',
+           'WINDOW_SIZE',
+           'UNVOICED',
+           'predict',
+           'preprocess',
+           'infer',
+           'postprocess',
+           'resample']
+
+###############################################################################
+# Constants
+###############################################################################
+
+
+CENTS_PER_BIN = 20  # cents
+MAX_FMAX = 2006.  # hz
+PITCH_BINS = 360
+SAMPLE_RATE = 16000  # hz
+WINDOW_SIZE = 1024  # samples
+UNVOICED = np.nan
+
+
+###############################################################################
+# Crepe pitch prediction
+###############################################################################
+
+
+def predict(session,
+            audio,
+            sample_rate,
+            precision=None,
+            fmin=50.,
+            fmax=MAX_FMAX,
+            decoder=onnxcrepe.decode.weighted_viterbi,
+            return_periodicity=False,
+            batch_size=None,
+            pad=True):
+    """Performs pitch estimation
+
+    Arguments
+        session (onnxcrepe.CrepeInferenceSession)
+            An onnxruntime.InferenceSession holding the CREPE model
+        audio (numpy.ndarray [shape=(n_samples,)])
+            The audio signal
+        sample_rate (int)
+            The sampling rate in Hz
+        precision (float)
+            The precision in milliseconds, i.e. the length of each frame
+        fmin (float)
+            The minimum allowable frequency in Hz
+        fmax (float)
+            The maximum allowable frequency in Hz
+        decoder (function)
+            The decoder to use. See decode.py for decoders.
+        return_periodicity (bool)
+            Whether to also return the network confidence
+        batch_size (int)
+            The number of frames per batch
+        pad (bool)
+            Whether to zero-pad the audio
+
+    Returns
+        pitch (numpy.ndarray [shape=(1, 1 + int(time // precision))])
+        (Optional) periodicity (numpy.ndarray
+                                [shape=(1, 1 + int(time // precision))])
+    """
+
+    results = []
+
+    # Preprocess audio
+    generator = preprocess(audio,
+                           sample_rate,
+                           precision,
+                           batch_size,
+                           pad)
+    for frames in generator:
+
+        # Infer independent probabilities for each pitch bin
+        probabilities = infer(session, frames)  # shape=(batch, 360)
+
+        probabilities = probabilities.transpose(1, 0)[None]  # shape=(1, 360, batch)
+
+        # Convert probabilities to F0 and periodicity
+        result = postprocess(probabilities,
+                             fmin,
+                             fmax,
+                             decoder,
+                             return_periodicity)
+
+        # Place on same device as audio to allow very long inputs
+        if isinstance(result, tuple):
+            result = (result[0], result[1])
+
+        results.append(result)
+
+    # Split pitch and periodicity
+    if return_periodicity:
+        pitch, periodicity = zip(*results)
+        return np.concatenate(pitch, axis=1), np.concatenate(periodicity, axis=1)
+
+    # Concatenate
+    return np.concatenate(results, axis=1)
+
+
+def preprocess(audio,
+               sample_rate,
+               precision=None,
+               batch_size=None,
+               pad=True):
+    """Convert audio to model input
+
+    Arguments
+        audio (numpy.ndarray [shape=(time,)])
+            The audio signals
+        sample_rate (int)
+            The sampling rate in Hz
+        precision (float)
+            The precision in milliseconds, i.e. the length of each frame
+        batch_size (int)
+            The number of frames per batch
+        pad (bool)
+            Whether to zero-pad the audio
+
+    Returns
+        frames (numpy.ndarray [shape=(1 + int(time // precision), 1024)])
+    """
+    # Resample
+    if sample_rate != SAMPLE_RATE:
+        audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=SAMPLE_RATE)
+
+    # Default hop length of 10 ms
+    hop_length = SAMPLE_RATE / 100 if precision is None else SAMPLE_RATE * precision / 1000
+
+    # Get total number of frames
+
+    # Maybe pad
+    if pad:
+        total_frames = 1 + int(audio.shape[0] / hop_length)
+        audio = np.pad(
+            audio,
+            (WINDOW_SIZE // 2, WINDOW_SIZE // 2))
+    else:
+        total_frames = 1 + int((audio.shape[0] - WINDOW_SIZE) / hop_length)
+
+    # Default to running all frames in a single batch
+    batch_size = total_frames if batch_size is None else batch_size
+
+    # Generate batches
+    for i in range(0, total_frames, batch_size):
+        # Batch indices
+        start = max(0, int(i * hop_length))
+        end = min(audio.shape[0],
+                  int((i + batch_size - 1) * hop_length) + WINDOW_SIZE)
+
+        # Chunk
+        n_bytes = audio.strides[-1]
+        frames = np.lib.stride_tricks.as_strided(
+            audio[start:end],
+            shape=((end - start - WINDOW_SIZE) // int(hop_length) + 1, WINDOW_SIZE),
+            strides=(int(hop_length) * n_bytes, n_bytes))  # shape=(batch, 1024)
+
+        # Note:
+        # Z-score standardization operations originally located here
+        # (https://github.com/maxrmorrison/torchcrepe/blob/master/torchcrepe/core.py#L692)
+        # are wrapped into the ONNX models for hardware acceleration.
+
+        yield frames
+
+
+def infer(session, frames):
+    """Forward pass through the model
+
+    Arguments
+        session (onnxcrepe.CrepeInferenceSession)
+            An onnxruntime.InferenceSession holding the CREPE model
+        frames (numpy.ndarray [shape=(time / precision, 1024)])
+            The network input
+
+    Returns
+        logits (numpy.ndarray [shape=(1 + int(time // precision), 360)])
+    """
+    # Apply model
+    return session.run(None, {'frames': frames})[0]
+
+
+def postprocess(probabilities,
+                fmin=0.,
+                fmax=MAX_FMAX,
+                decoder=onnxcrepe.decode.weighted_viterbi,
+                return_periodicity=False):
+    """Convert model output to F0 and periodicity
+
+    Arguments
+        probabilities (numpy.ndarray [shape=(1, 360, time / precision)])
+            The probabilities for each pitch bin inferred by the network
+        fmin (float)
+            The minimum allowable frequency in Hz
+        fmax (float)
+            The maximum allowable frequency in Hz
+        decoder (function)
+            The decoder to use. See decode.py for decoders.
+        return_periodicity (bool)
+            Whether to also return the network confidence
+
+    Returns
+        pitch (numpy.ndarray [shape=(1, 1 + int(time // precision))])
+        periodicity (numpy.ndarray [shape=(1, 1 + int(time // precision))])
+    """
+    # Convert frequency range to pitch bin range
+    minidx = onnxcrepe.convert.frequency_to_bins(fmin)
+    maxidx = onnxcrepe.convert.frequency_to_bins(fmax, np.ceil)
+
+    # Remove frequencies outside allowable range
+    probabilities[:, :minidx] = float('-inf')
+    probabilities[:, maxidx:] = float('-inf')
+
+    # Perform argmax or viterbi sampling
+    bins, pitch = decoder(probabilities)
+
+    if not return_periodicity:
+        return pitch
+
+    # Compute periodicity from probabilities and decoded pitch bins
+    return pitch, periodicity(probabilities, bins)
+
+
+###############################################################################
+# Utilities
+###############################################################################
+
+
+def periodicity(probabilities, bins):
+    """Computes the periodicity from the network output and pitch bins"""
+    # shape=(time / precision, 360)
+    probs_stacked = probabilities.transpose(0, 2, 1).reshape(-1, PITCH_BINS)
+    # shape=(time / precision, 1)
+    bins_stacked = bins.reshape(-1, 1).astype(np.int64)
+
+    # Use maximum logit over pitch bins as periodicity
+    periodicity = np.take_along_axis(probs_stacked, bins_stacked, axis=1)
+
+    # shape=(batch, time / precision)
+    return periodicity.reshape(probabilities.shape[0], probabilities.shape[2])
+
+
+def resample(audio, sample_rate):
+    """Resample audio"""
+    return librosa.resample(audio, orig_sr=sample_rate, target_sr=onnxcrepe.SAMPLE_RATE)
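Note: predict() is the module's entry point: preprocess() slices the (resampled) audio into 1024-sample frames every `precision` ms, infer() runs the ONNX session on each batch, and postprocess() decodes the 360-bin probabilities into Hz. A usage sketch, assuming a CREPE ONNX file such as the full.onnx/tiny.onnx weights downloaded above:

    import numpy as np
    import onnxruntime
    from voice_changer.RVC.pitchExtractor import onnxcrepe

    session = onnxruntime.InferenceSession("full.onnx", providers=["CPUExecutionProvider"])
    audio = np.random.randn(16000 * 2).astype(np.float32)  # 2 s stand-in signal

    f0, pd = onnxcrepe.predict(
        session, audio, 16000,
        precision=10.0,                       # 10 ms hop -> ~201 frames
        return_periodicity=True,
        decoder=onnxcrepe.decode.weighted_argmax,
    )
    # f0.shape == pd.shape == (1, 201)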
server/voice_changer/RVC/pitchExtractor/onnxcrepe/decode.py (new file, 80 lines)
@@ -0,0 +1,80 @@
+import librosa
+import numpy as np
+
+from voice_changer.RVC.pitchExtractor import onnxcrepe
+
+###############################################################################
+# Probability sequence decoding methods
+###############################################################################
+
+
+def argmax(logits):
+    """Sample observations by taking the argmax"""
+    bins = logits.argmax(axis=1)
+
+    # Convert to frequency in Hz
+    return bins, onnxcrepe.convert.bins_to_frequency(bins)
+
+
+def weighted_argmax(logits: np.ndarray):
+    """Sample observations using weighted sum near the argmax"""
+    # Find center of analysis window
+    bins = logits.argmax(axis=1)
+
+    return bins, _apply_weights(logits, bins)
+
+
+def viterbi(logits):
+    """Sample observations using viterbi decoding"""
+    # Create viterbi transition matrix
+    if not hasattr(viterbi, 'transition'):
+        xx, yy = np.meshgrid(range(360), range(360))
+        transition = np.maximum(12 - abs(xx - yy), 0)
+        transition = transition / transition.sum(axis=1, keepdims=True)
+        viterbi.transition = transition
+
+    # Normalize logits (softmax)
+    logits -= logits.max(axis=1)
+    exp = np.exp(logits)
+    probs = exp / np.sum(exp, axis=1)
+
+    # Perform viterbi decoding
+    bins = np.array([
+        librosa.sequence.viterbi(sequence, viterbi.transition).astype(np.int64)
+        for sequence in probs])
+
+    # Convert to frequency in Hz
+    return bins, onnxcrepe.convert.bins_to_frequency(bins)
+
+
+def weighted_viterbi(logits):
+    """Sample observations combining viterbi decoding and weighted argmax"""
+    bins, _ = viterbi(logits)
+
+    return bins, _apply_weights(logits, bins)
+
+
+def _apply_weights(logits, bins):
+    # Find bounds of analysis window
+    start = np.maximum(0, bins - 4)
+    end = np.minimum(logits.shape[1], bins + 5)
+
+    # Mask out everything outside of window
+    for batch in range(logits.shape[0]):
+        for time in range(logits.shape[2]):
+            logits[batch, :start[batch, time], time] = float('-inf')
+            logits[batch, end[batch, time]:, time] = float('-inf')
+
+    # Construct weights
+    if not hasattr(_apply_weights, 'weights'):
+        weights = onnxcrepe.convert.bins_to_cents(np.arange(360))
+        _apply_weights.weights = weights[None, :, None]
+
+    # Convert to probabilities (ReLU)
+    probs = np.maximum(0, logits)
+
+    # Apply weights
+    cents = (_apply_weights.weights * probs).sum(axis=1) / probs.sum(axis=1)
+
+    # Convert to frequency in Hz
+    return onnxcrepe.convert.cents_to_frequency(cents)
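Note: the two "weighted" decoders refine the raw argmax with a local expectation: logits are masked to the 9 bins around the peak (argmax-4 .. argmax+4), rectified, and used as weights over the bins' cent values, giving sub-bin (sub-20-cent) pitch resolution. A toy illustration of the averaging _apply_weights performs on one frame:

    import numpy as np

    cents = 20 * np.arange(360) + 1997.3794084376191
    probs = np.zeros(360)
    probs[100], probs[101] = 1.0, 1.0   # peak mass split across two adjacent bins

    est = (cents * probs).sum() / probs.sum()
    # halfway between the bins, i.e. 10 cents above bin 100's center
    assert abs(est - (cents[100] + 10)) < 1e-6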
server/voice_changer/RVC/pitchExtractor/onnxcrepe/filter.py (new file, 125 lines)
@@ -0,0 +1,125 @@
+import numpy as np
+
+
+###############################################################################
+# Sequence filters
+###############################################################################
+
+
+def mean(signals, win_length=9):
+    """Average filtering for signals containing nan values
+
+    Arguments
+        signals (numpy.ndarray (shape=(batch, time)))
+            The signals to filter
+        win_length
+            The size of the analysis window
+
+    Returns
+        filtered (numpy.ndarray (shape=(batch, time)))
+    """
+    return nanfilter(signals, win_length, nanmean)
+
+
+def median(signals, win_length):
+    """Median filtering for signals containing nan values
+
+    Arguments
+        signals (numpy.ndarray (shape=(batch, time)))
+            The signals to filter
+        win_length
+            The size of the analysis window
+
+    Returns
+        filtered (numpy.ndarray (shape=(batch, time)))
+    """
+    return nanfilter(signals, win_length, nanmedian)
+
+
+###############################################################################
+# Utilities
+###############################################################################
+
+
+def nanfilter(signals, win_length, filter_fn):
+    """Filters a sequence, ignoring nan values
+
+    Arguments
+        signals (numpy.ndarray (shape=(batch, time)))
+            The signals to filter
+        win_length
+            The size of the analysis window
+        filter_fn (function)
+            The function to use for filtering
+
+    Returns
+        filtered (numpy.ndarray (shape=(batch, time)))
+    """
+    # Output buffer
+    filtered = np.empty_like(signals)
+
+    # Loop over frames
+    for i in range(signals.shape[1]):
+
+        # Get analysis window bounds
+        start = max(0, i - win_length // 2)
+        end = min(signals.shape[1], i + win_length // 2 + 1)
+
+        # Apply filter to window
+        filtered[:, i] = filter_fn(signals[:, start:end])
+
+    return filtered
+
+
+def nanmean(signals):
+    """Computes the mean, ignoring nans
+
+    Arguments
+        signals (numpy.ndarray [shape=(batch, time)])
+            The signals to filter
+
+    Returns
+        filtered (numpy.ndarray [shape=(batch, time)])
+    """
+    signals = signals.copy()  # numpy arrays use copy(), not torch's clone()
+
+    # Find nans
+    nans = np.isnan(signals)
+
+    # Set nans to 0.
+    signals[nans] = 0.
+
+    # Compute average
+    return signals.sum(axis=1) / (~nans).astype(np.float32).sum(axis=1)
+
+
+def nanmedian(signals):
+    """Computes the median, ignoring nans
+
+    Arguments
+        signals (numpy.ndarray [shape=(batch, time)])
+            The signals to filter
+
+    Returns
+        filtered (numpy.ndarray [shape=(batch, time)])
+    """
+    # Find nans
+    nans = np.isnan(signals)
+
+    # Compute median for each slice
+    medians = [nanmedian1d(signal[~nan]) for signal, nan in zip(signals, nans)]
+
+    # Stack results
+    return np.array(medians, dtype=signals.dtype)
+
+
+def nanmedian1d(signal):
+    """Computes the median. If signal is empty, returns np.nan
+
+    Arguments
+        signal (numpy.ndarray [shape=(time,)])
+
+    Returns
+        median (numpy.ndarray [shape=(1,)])
+    """
+    return np.median(signal) if signal.size else np.nan
server/voice_changer/RVC/pitchExtractor/onnxcrepe/load.py (new file, 12 lines)
@@ -0,0 +1,12 @@
+import librosa
+import numpy as np
+
+
+def audio(filename):
+    """Load audio from disk"""
+    samples, sr = librosa.load(filename, sr=None)
+    if len(samples.shape) > 1:
+        # To mono
+        samples = np.mean(samples, axis=1)
+
+    return samples, sr
server/voice_changer/RVC/pitchExtractor/onnxcrepe/loudness.py (new file, 73 lines)
@@ -0,0 +1,73 @@
+import warnings
+
+import librosa
+import numpy as np
+from voice_changer.RVC.pitchExtractor import onnxcrepe
+
+
+###############################################################################
+# Constants
+###############################################################################
+
+
+# Minimum decibel level
+MIN_DB = -100.
+
+# Reference decibel level
+REF_DB = 20.
+
+
+###############################################################################
+# A-weighted loudness
+###############################################################################
+
+
+def a_weighted(audio, sample_rate, hop_length=None, pad=True):
+    """Retrieve the per-frame loudness"""
+
+    # Default hop length of 10 ms
+    hop_length = sample_rate // 100 if hop_length is None else hop_length
+
+    # Convert to numpy
+    audio = audio.squeeze(0)
+
+    # Resample
+    if sample_rate != onnxcrepe.SAMPLE_RATE:
+        audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=onnxcrepe.SAMPLE_RATE)
+        hop_length = int(hop_length * onnxcrepe.SAMPLE_RATE / sample_rate)
+
+    # Cache weights
+    if not hasattr(a_weighted, 'weights'):
+        a_weighted.weights = perceptual_weights()
+
+    # Take stft
+    stft = librosa.stft(audio,
+                        n_fft=onnxcrepe.WINDOW_SIZE,
+                        hop_length=hop_length,
+                        win_length=onnxcrepe.WINDOW_SIZE,
+                        center=pad,
+                        pad_mode='constant')
+
+    # Compute magnitude on db scale
+    db = librosa.amplitude_to_db(np.abs(stft))
+
+    # Apply A-weighting
+    weighted = db + a_weighted.weights
+
+    # Threshold
+    weighted[weighted < MIN_DB] = MIN_DB
+
+    # Average over weighted frequencies
+    return weighted.mean(axis=0).astype(np.float32)[None]
+
+
+def perceptual_weights():
+    """A-weighted frequency-dependent perceptual loudness weights"""
+    frequencies = librosa.fft_frequencies(sr=onnxcrepe.SAMPLE_RATE,
+                                          n_fft=onnxcrepe.WINDOW_SIZE)
+
+    # A warning is raised for nearly inaudible frequencies, but it ends up
+    # defaulting to -100 db. That default is fine for our purposes.
+    with warnings.catch_warnings():
+        warnings.simplefilter('ignore', RuntimeWarning)
+        return librosa.A_weighting(frequencies)[:, None] - REF_DB
onnxcrepe folder note (new file)
@@ -0,0 +1 @@
+modules in this folder from https://github.com/yqzhishen/onnxcrepe at ca7e5d7f2dfca5cc4d99e8d546b00793ca4e7157
server/voice_changer/RVC/pitchExtractor/onnxcrepe/session.py (new file, 9 lines)
@@ -0,0 +1,9 @@
+import os
+
+import onnxruntime as ort
+
+
+class CrepeInferenceSession(ort.InferenceSession):
+    def __init__(self, model='full', sess_options=None, providers=None, provider_options=None, **kwargs):
+        model_path = os.path.join(os.path.dirname(__file__), 'assets', f'{model}.onnx')
+        super().__init__(model_path, sess_options, providers, provider_options, **kwargs)
server/voice_changer/RVC/pitchExtractor/onnxcrepe/threshold.py (new file, 129 lines)
@@ -0,0 +1,129 @@
+import numpy as np
+
+from voice_changer.RVC.pitchExtractor import onnxcrepe
+
+
+###############################################################################
+# Pitch thresholding methods
+###############################################################################
+
+
+class At:
+    """Simple thresholding at a specified probability value"""
+
+    def __init__(self, value):
+        self.value = value
+
+    def __call__(self, pitch, periodicity):
+        # Make a copy to prevent in-place modification
+        pitch = pitch.copy()
+
+        # Threshold
+        pitch[periodicity < self.value] = onnxcrepe.UNVOICED
+        return pitch
+
+
+class Hysteresis:
+    """Hysteresis thresholding"""
+
+    def __init__(self,
+                 lower_bound=.19,
+                 upper_bound=.31,
+                 width=.2,
+                 stds=1.7,
+                 return_threshold=False):
+        self.lower_bound = lower_bound
+        self.upper_bound = upper_bound
+        self.width = width
+        self.stds = stds
+        self.return_threshold = return_threshold
+
+    def __call__(self, pitch, periodicity):
+
+        # Perform hysteresis in log-2 space
+        pitch = np.log2(pitch).flatten()
+
+        # Flatten periodicity
+        periodicity = periodicity.flatten()
+
+        # Ignore confidently unvoiced pitch
+        pitch[periodicity < self.lower_bound] = onnxcrepe.UNVOICED
+
+        # Whiten pitch
+        mean, std = np.nanmean(pitch), np.nanstd(pitch)
+        pitch = (pitch - mean) / std
+
+        # Require high confidence to make predictions far from the mean
+        parabola = self.width * pitch ** 2 - self.width * self.stds ** 2
+        threshold = self.lower_bound + np.clip(parabola, 0, 1 - self.lower_bound)
+        threshold[np.isnan(threshold)] = self.lower_bound
+
+        # Apply hysteresis to prevent short, unconfident voiced regions
+        i = 0
+        while i < len(periodicity) - 1:
+
+            # Detect unvoiced to voiced transition
+            if periodicity[i] < threshold[i] and periodicity[i + 1] > threshold[i + 1]:
+
+                # Grow region until next unvoiced or end of array
+                start, end, keep = i + 1, i + 1, False
+                while end < len(periodicity) and periodicity[end] > threshold[end]:
+                    if periodicity[end] > self.upper_bound:
+                        keep = True
+                    end += 1
+
+                # Force unvoiced if we didn't pass the confidence required by
+                # the hysteresis
+                if not keep:
+                    threshold[start:end] = 1
+
+                i = end
+
+            else:
+                i += 1
+
+        # Remove pitch with low periodicity
+        pitch[periodicity < threshold] = onnxcrepe.UNVOICED
+
+        # Unwhiten
+        pitch = pitch * std + mean
+
+        # Convert to Hz
+        pitch = np.array(2 ** pitch)[None, :]
+
+        # Optionally return threshold
+        if self.return_threshold:
+            return pitch, np.array(threshold)
+
+        return pitch
+
+
+###############################################################################
+# Periodicity thresholding methods
+###############################################################################
+
+
+class Silence:
+    """Set periodicity to zero in silent regions"""
+
+    def __init__(self, value=-60):
+        self.value = value
+
+    def __call__(self,
+                 periodicity,
+                 audio,
+                 sample_rate=onnxcrepe.SAMPLE_RATE,
+                 precision=None,
+                 pad=True):
+        # Don't modify in-place
+        periodicity = periodicity.copy()
+
+        # Compute loudness
+        hop_length = sample_rate * precision // 1000
+        loudness = onnxcrepe.loudness.a_weighted(
+            audio, sample_rate, hop_length, pad)
+
+        # Threshold silence
+        periodicity[loudness < self.value] = 0.
+
+        return periodicity
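Note: these thresholds are optional post-processing hooks over predict()'s outputs: At() blanks pitch frames whose periodicity falls below a fixed confidence, Hysteresis() additionally suppresses short low-confidence voiced islands, and Silence() zeroes periodicity in quiet regions using the A-weighted loudness above. CrepeOnnxPitchExtractor applies a fixed 0.1 cutoff inline instead, but the equivalent call here would be:

    f0, pd = onnxcrepe.predict(session, audio, 16000, return_periodicity=True)
    f0_voiced = onnxcrepe.threshold.At(0.1)(f0, pd)  # unvoiced frames become NaN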
@@ -12,3 +12,5 @@ class VoiceChangerParams:
     hubert_soft: str
     nsf_hifigan: str
     sample_mode: str
+    crepe_onnx_full: str
+    crepe_onnx_tiny: str