New Feature:
- Add Crepe Full/Tiny (onnx)
- Remove test connect for local

Refactor:
- RVC: comment out module importer
This commit is contained in:
parent 099f82cc60
commit d5561c2212
@@ -21,7 +21,7 @@
     {
         "name": "configArea",
         "options": {
-            "detectors": ["dio", "harvest", "crepe"],
+            "detectors": ["dio", "harvest", "crepe", "crepe_full", "crepe_tiny"],
            "inputChunkNums": [8, 16, 24, 32, 40, 48, 64, 80, 96, 112, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 2048]
        }
    }
client/demo/dist/index.html (vendored, 11 lines changed)
@@ -1 +1,10 @@
-<!doctype html><html style="width:100%;height:100%;overflow:hidden"><head><meta charset="utf-8"/><title>Voice Changer Client Demo</title><script defer="defer" src="index.js"></script></head><body style="width:100%;height:100%;margin:0"><div id="app" style="width:100%;height:100%"></div></body></html>
+<!DOCTYPE html>
+<html style="width: 100%; height: 100%; overflow: hidden">
+    <head>
+        <meta charset="utf-8" />
+        <title>Voice Changer Client Demo</title>
+        <script defer src="index.js"></script></head>
+    <body style="width: 100%; height: 100%; margin: 0px">
+        <div id="app" style="width: 100%; height: 100%"></div>
+    </body>
+</html>
client/demo/dist/index.js (vendored, 1125 lines changed)
File diff suppressed because one or more lines are too long
client/demo/dist/index.js.LICENSE.txt (vendored, 31 lines changed)
@@ -1,31 +0,0 @@
-/*! regenerator-runtime -- Copyright (c) 2014-present, Facebook, Inc. -- license (MIT): https://github.com/facebook/regenerator/blob/main/LICENSE */
-
-/**
- * @license React
- * react-dom.production.min.js
- *
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-/**
- * @license React
- * react.production.min.js
- *
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-/**
- * @license React
- * scheduler.production.min.js
- *
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
@@ -21,7 +21,7 @@
     {
         "name": "configArea",
         "options": {
-            "detectors": ["dio", "harvest", "crepe"],
+            "detectors": ["dio", "harvest", "crepe", "crepe_full", "crepe_tiny"],
            "inputChunkNums": [8, 16, 24, 32, 40, 48, 64, 80, 96, 112, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 2048]
        }
    }
@@ -42,8 +42,9 @@ export type CrossFadeOverlapSize = typeof CrossFadeOverlapSize[keyof typeof CrossFadeOverlapSize]
 export const F0Detector = {
     "dio": "dio",
     "harvest": "harvest",
-    // "parselmouth": "parselmouth",
     "crepe": "crepe",
+    "crepe_full": "crepe_full",
+    "crepe_tiny": "crepe_tiny",
 } as const
 export type F0Detector = typeof F0Detector[keyof typeof F0Detector]

@@ -34,6 +34,7 @@ def setupArgParser():
     parser.add_argument("--logLevel", type=str, default="error", help="Log level info|critical|error. (default: error)")
     parser.add_argument("-p", type=int, default=18888, help="port")
     parser.add_argument("--https", type=strtobool, default=False, help="use https")
+    parser.add_argument("--test_connect", type=str, default="8.8.8.8", help="test connect to detect ip in https mode. default 8.8.8.8")
     parser.add_argument("--httpsKey", type=str, default="ssl.key", help="path for the key of https")
     parser.add_argument("--httpsCert", type=str, default="ssl.cert", help="path for the cert of https")
     parser.add_argument("--httpsSelfSigned", type=strtobool, default=True, help="generate self-signed certificate")
@@ -48,6 +49,8 @@ def setupArgParser():
     parser.add_argument("--hubert_base_jp", type=str, help="path to hubert_base_jp model(pytorch)")
     parser.add_argument("--hubert_soft", type=str, help="path to hubert_soft model(pytorch)")
     parser.add_argument("--nsf_hifigan", type=str, help="path to nsf_hifigan model(pytorch)")
+    parser.add_argument("--crepe_onnx_full", type=str, help="path to crepe_onnx_full")
+    parser.add_argument("--crepe_onnx_tiny", type=str, help="path to crepe_onnx_tiny")

     return parser
@@ -85,6 +88,9 @@ voiceChangerParams = VoiceChangerParams(
     hubert_base_jp=args.hubert_base_jp,
     hubert_soft=args.hubert_soft,
     nsf_hifigan=args.nsf_hifigan,
+    crepe_onnx_full=args.crepe_onnx_full,
+    crepe_onnx_tiny=args.crepe_onnx_tiny,
+
     sample_mode=args.sample_mode,
 )
@@ -120,6 +126,7 @@ if __name__ == "__mp_main__":

 if __name__ == "__main__":
     mp.freeze_support()

     printMessage(f"PYTHON:{sys.version}", level=2)
+    printMessage("Voice Changerを起動しています。", level=2)
     # ダウンロード(Weight)
     try:
@@ -195,10 +202,10 @@ if __name__ == "__main__":
         else:
             printMessage(f"http://localhost:{EX_PORT}/", level=1)
     else:  # 直接python起動
-        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
-        s.connect(("8.8.8.8", 80))
-        hostname = s.getsockname()[0]
         if args.https == 1:
+            s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+            s.connect((args.test_connect, 80))
+            hostname = s.getsockname()[0]
             printMessage(f"https://localhost:{PORT}/", level=1)
             printMessage(f"https://{hostname}:{PORT}/", level=1)
         else:
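Note on the hunk above: connecting a UDP socket sends no packets; it only asks the OS which local interface would route toward the probe address, which is why a reachable LAN host can replace 8.8.8.8 on networks without internet access. A standalone restatement of the trick (illustrative sketch, not code from this commit):

    import socket

    def detect_local_ip(probe_addr: str = "8.8.8.8") -> str:
        # No traffic is sent: connect() on a UDP socket only selects a route,
        # and getsockname() reports the local address chosen for it.
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        s.connect((probe_addr, 80))
        return s.getsockname()[0]

    # On an offline LAN, probe any reachable host instead:
    # detect_local_ip("192.168.0.1")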
@@ -73,15 +73,13 @@ class EnumInferenceTypes(Enum):
     onnxRVCNono = "onnxRVCNono"


-class EnumPitchExtractorTypes(Enum):
-    harvest = "harvest"
-    dio = "dio"
-    crepe = "crepe"
-
-
-class EnumFrameworkTypes(Enum):
-    pyTorch = "pyTorch"
-    onnx = "onnx"
+PitchExtractorType: TypeAlias = Literal[
+    "harvest",
+    "dio",
+    "crepe",
+    "crepe_full",
+    "crepe_tiny",
+]


 class ServerAudioDeviceTypes(Enum):
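Replacing the Enum pair with a Literal alias lets call sites compare plain strings while keeping static checking; a minimal sketch of the payoff (illustrative, using the alias exactly as defined above):

    from typing import Literal, TypeAlias

    PitchExtractorType: TypeAlias = Literal["harvest", "dio", "crepe", "crepe_full", "crepe_tiny"]

    def pick(detector: PitchExtractorType) -> None:
        ...

    pick("crepe_tiny")  # accepted
    pick("crepe_big")   # rejected by a type checker; no Enum .value unwrapping needed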
@@ -11,6 +11,8 @@ def downloadWeight(voiceChangerParams: VoiceChangerParams):
     hubert_base_jp = voiceChangerParams.hubert_base_jp
     hubert_soft = voiceChangerParams.hubert_soft
     nsf_hifigan = voiceChangerParams.nsf_hifigan
+    crepe_onnx_full = voiceChangerParams.crepe_onnx_full
+    crepe_onnx_tiny = voiceChangerParams.crepe_onnx_tiny

     # file exists check (currently only for rvc)
     downloadParams = []
@@ -57,6 +59,24 @@ def downloadWeight(voiceChangerParams: VoiceChangerParams):
             }
         )

+    if os.path.exists(crepe_onnx_full) is False:
+        downloadParams.append(
+            {
+                "url": "https://huggingface.co/wok000/weights/resolve/main/crepe/onnx/full.onnx",
+                "saveTo": crepe_onnx_full,
+                "position": 5,
+            }
+        )
+
+    if os.path.exists(crepe_onnx_tiny) is False:
+        downloadParams.append(
+            {
+                "url": "https://huggingface.co/wok000/weights/resolve/main/crepe/onnx/tiny.onnx",
+                "saveTo": crepe_onnx_tiny,
+                "position": 6,
+            }
+        )
+
     with ThreadPoolExecutor() as pool:
         pool.map(download, downloadParams)
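The two new entries reuse the existing download contract. A hedged sketch of what the `download` helper is assumed to do with each dict (the real implementation lives elsewhere in downloader.py; `position` is presumably the progress-bar row used when several downloads run concurrently):

    import os
    import urllib.request

    def download(params):
        # Hypothetical stand-in: params is {"url": ..., "saveTo": ..., "position": ...}
        target_dir = os.path.dirname(params["saveTo"])
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        urllib.request.urlretrieve(params["url"], params["saveTo"])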
@@ -300,5 +300,5 @@ class MMVCv15:
                     if file_path.find(remove_path + os.path.sep) >= 0:
                         # print("remove", key, file_path)
                         sys.modules.pop(key)
-            except:  # type:ignore
+            except:  # NOQA
                 pass
@@ -1,5 +1,5 @@
-import sys
-import os
+# import sys
+# import os
 from dataclasses import asdict
 import numpy as np
 import torch
@@ -7,18 +7,18 @@ import torchaudio
 from data.ModelSlot import RVCModelSlot


-# avoiding parse arg error in RVC
-sys.argv = ["MMVCServerSIO.py"]
+# # avoiding parse arg error in RVC
+# sys.argv = ["MMVCServerSIO.py"]

-if sys.platform.startswith("darwin"):
-    baseDir = [x for x in sys.path if x.endswith("Contents/MacOS")]
-    if len(baseDir) != 1:
-        print("baseDir should be only one ", baseDir)
-        sys.exit()
-    modulePath = os.path.join(baseDir[0], "RVC")
-    sys.path.append(modulePath)
-else:
-    sys.path.append("RVC")
+# if sys.platform.startswith("darwin"):
+#     baseDir = [x for x in sys.path if x.endswith("Contents/MacOS")]
+#     if len(baseDir) != 1:
+#         print("baseDir should be only one ", baseDir)
+#         sys.exit()
+#     modulePath = os.path.join(baseDir[0], "RVC")
+#     sys.path.append(modulePath)
+# else:
+#     sys.path.append("RVC")


 from voice_changer.RVC.RVCSettings import RVCSettings
@@ -39,9 +39,10 @@ class RVC(VoiceChangerModel):
         print("[Voice Changer] [RVC] Creating instance ")
         self.deviceManager = DeviceManager.get_instance()
         EmbedderManager.initialize(params)
+        PitchExtractorManager.initialize(params)
         self.settings = RVCSettings()
         self.params = params
-        self.pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector)
+        self.pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu)

         self.pipeline: Pipeline | None = None
@@ -76,7 +77,7 @@ class RVC(VoiceChangerModel):
         elif key in self.settings.strData:
             setattr(self.settings, key, str(val))
             if key == "f0Detector" and self.pipeline is not None:
-                pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector)
+                pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu)
                 self.pipeline.setPitchExtractor(pitchExtractor)
         else:
             return False
@@ -201,21 +202,21 @@ class RVC(VoiceChangerModel):
     def __del__(self):
         del self.pipeline

-        print("---------- REMOVING ---------------")
+        # print("---------- REMOVING ---------------")

-        remove_path = os.path.join("RVC")
-        sys.path = [x for x in sys.path if x.endswith(remove_path) is False]
+        # remove_path = os.path.join("RVC")
+        # sys.path = [x for x in sys.path if x.endswith(remove_path) is False]

-        for key in list(sys.modules):
-            val = sys.modules.get(key)
-            try:
-                file_path = val.__file__
-                if file_path.find("RVC" + os.path.sep) >= 0:
-                    # print("remove", key, file_path)
-                    sys.modules.pop(key)
-            except Exception:  # type:ignore
-                # print(e)
-                pass
+        # for key in list(sys.modules):
+        #     val = sys.modules.get(key)
+        #     try:
+        #         file_path = val.__file__
+        #         if file_path.find("RVC" + os.path.sep) >= 0:
+        #             # print("remove", key, file_path)
+        #             sys.modules.pop(key)
+        #     except Exception:  # type:ignore
+        #         # print(e)
+        #         pass

     def export2onnx(self):
         modelSlot = self.slotInfo
@@ -4,6 +4,7 @@ from const import EnumInferenceTypes

 from voice_changer.RVC.inferencer.OnnxRVCInferencer import OnnxRVCInferencer

+
 class OnnxRVCInferencerNono(OnnxRVCInferencer):
     def loadModel(self, file: str, gpu: int):
         super().loadModel(file, gpu)
@@ -208,14 +208,13 @@ class Pipeline(object):
         # apply silent front for inference
         if type(self.inferencer) in [OnnxRVCInferencer, OnnxRVCInferencerNono]:
             npyOffset = math.floor(silence_front * 16000) // 360
-            feats = feats[:, npyOffset * 2 :, :]
+            feats = feats[:, npyOffset * 2 :, :]  # NOQA
         feats_len = feats.shape[1]
         if pitch is not None and pitchf is not None:
             pitch = pitch[:, -feats_len:]
             pitchf = pitchf[:, -feats_len:]
         p_len = torch.tensor([feats_len], device=self.device).long()
-

         # 推論実行
         try:
             with torch.no_grad():
@@ -34,7 +34,7 @@ def createPipeline(modelSlot: RVCModelSlot, gpu: int, f0Detector: str):
         traceback.print_exc()

     # pitchExtractor
-    pitchExtractor = PitchExtractorManager.getPitchExtractor(f0Detector)
+    pitchExtractor = PitchExtractorManager.getPitchExtractor(f0Detector, gpu)

     # index, feature
     index = _loadIndex(modelSlot)
@@ -67,7 +67,7 @@ def _loadIndex(modelSlot: RVCModelSlot):
     try:
         print("Try loading...", modelSlot.indexFile)
         index = faiss.read_index(modelSlot.indexFile)
-    except:
+    except:  # NOQA
         print("[Voice Changer] load index failed. Use no index.")
         traceback.print_exc()
         return None
@@ -0,0 +1,68 @@
+import numpy as np
+from const import PitchExtractorType
+from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
+from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
+import onnxruntime
+from voice_changer.RVC.pitchExtractor import onnxcrepe
+
+
+class CrepeOnnxPitchExtractor(PitchExtractor):
+
+    def __init__(self, pitchExtractorType: PitchExtractorType, file: str, gpu: int):
+        self.pitchExtractorType = pitchExtractorType
+        super().__init__()
+        (
+            onnxProviders,
+            onnxProviderOptions,
+        ) = DeviceManager.get_instance().getOnnxExecutionProvider(gpu)
+
+        self.onnx_session = onnxruntime.InferenceSession(
+            file, providers=onnxProviders, provider_options=onnxProviderOptions
+        )
+
+    def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
+        n_frames = int(len(audio) // window) + 1
+        start_frame = int(silence_front * sr / window)
+        real_silence_front = start_frame * window / sr
+
+        silence_front_offset = int(np.round(real_silence_front * sr))
+        audio = audio[silence_front_offset:]
+
+        f0_min = 50
+        f0_max = 1100
+        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+
+        precision = 10.0
+
+        audio_num = audio.cpu()
+        onnx_f0, onnx_pd = onnxcrepe.predict(
+            self.onnx_session,
+            audio_num,
+            sr,
+            precision=precision,
+            fmin=f0_min,
+            fmax=f0_max,
+            batch_size=256,
+            return_periodicity=True,
+            decoder=onnxcrepe.decode.weighted_argmax,
+        )
+
+        f0 = onnxcrepe.filter.median(onnx_f0, 3)
+        pd = onnxcrepe.filter.median(onnx_pd, 3)
+
+        f0[pd < 0.1] = 0
+        f0 = f0.squeeze()
+
+        f0 = np.pad(f0, (start_frame, n_frames - f0.shape[0] - start_frame), 'constant', constant_values=(0, 0))
+
+        f0 *= pow(2, f0_up_key / 12)
+        pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]]
+        f0bak = pitchf.copy()
+        f0_mel = 1127.0 * np.log(1.0 + f0bak / 700.0)
+        f0_mel = np.clip(
+            (f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0
+        )
+        pitch_coarse = f0_mel.astype(int)
+
+        return pitch_coarse, pitchf
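The tail of extract() above is the same coarse-pitch quantization the other RVC extractors perform: F0 in Hz is mapped onto the mel scale and squeezed linearly into bins 1..255 (a restatement of the arithmetic, with the bounds fixed by f0_min = 50 Hz and f0_max = 1100 Hz; the final astype(int) truncates, which equals floor on this range):

    f_{\mathrm{mel}} = 1127 \ln\!\left(1 + \frac{f_0}{700}\right), \qquad
    \mathrm{bin} = \left\lfloor \operatorname{clip}\!\left((f_{\mathrm{mel}} - m_{\min}) \cdot \frac{254}{m_{\max} - m_{\min}} + 1,\ 1,\ 255\right) \right\rfloor

    m_{\min} = 1127 \ln(1 + 50/700) \approx 77.8, \qquad
    m_{\max} = 1127 \ln(1 + 1100/700) \approx 1064.4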
@@ -1,16 +1,16 @@
 import torchcrepe
 import torch
 import numpy as np
-from const import EnumPitchExtractorTypes
+from const import PitchExtractorType

 from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor


 class CrepePitchExtractor(PitchExtractor):
-    pitchExtractorType: EnumPitchExtractorTypes = EnumPitchExtractorTypes.crepe

     def __init__(self):
         super().__init__()
+        self.pitchExtractorType: PitchExtractorType = "crepe"
         if torch.cuda.is_available():
             self.device = torch.device("cuda:" + str(torch.cuda.current_device()))
         else:
@@ -1,16 +1,19 @@
 import pyworld
 import numpy as np
-from const import EnumPitchExtractorTypes
+from const import PitchExtractorType

 from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor


 class DioPitchExtractor(PitchExtractor):
-    pitchExtractorType: EnumPitchExtractorTypes = EnumPitchExtractorTypes.dio
+
+    def __init__(self):
+        super().__init__()
+        self.pitchExtractorType: PitchExtractorType = "dio"

     def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
         audio = audio.detach().cpu().numpy()
-        n_frames = int(len(audio) // window) + 1
+        n_frames = int(len(audio) // window) + 1  # NOQA
         start_frame = int(silence_front * sr / window)
         real_silence_front = start_frame * window / sr
@@ -45,4 +48,3 @@ class DioPitchExtractor(PitchExtractor):
         pitch_coarse = np.rint(f0_mel).astype(int)

         return pitch_coarse, pitchf
-
@@ -1,17 +1,20 @@
 import pyworld
 import numpy as np
 import scipy.signal as signal
-from const import EnumPitchExtractorTypes
+from const import PitchExtractorType

 from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor


 class HarvestPitchExtractor(PitchExtractor):
-    pitchExtractorType: EnumPitchExtractorTypes = EnumPitchExtractorTypes.harvest
+
+    def __init__(self):
+        super().__init__()
+        self.pitchExtractorType: PitchExtractorType = "harvest"

     def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
         audio = audio.detach().cpu().numpy()
-        n_frames = int(len(audio) // window) + 1
+        n_frames = int(len(audio) // window) + 1  # NOQA
         start_frame = int(silence_front * sr / window)
         real_silence_front = start_frame * window / sr
@@ -1,14 +1,12 @@
 from typing import Protocol
-from const import EnumPitchExtractorTypes


 class PitchExtractor(Protocol):
-    pitchExtractorType: EnumPitchExtractorTypes = EnumPitchExtractorTypes.harvest

     def extract(self, audio, f0_up_key, sr, window, silence_front=0):
         ...

     def getPitchExtractorInfo(self):
         return {
-            "pitchExtractorType": self.pitchExtractorType.value,
+            "pitchExtractorType": self.pitchExtractorType,
         }
@@ -1,40 +1,42 @@
 from typing import Protocol
-from const import EnumPitchExtractorTypes
+from const import PitchExtractorType
+from voice_changer.RVC.pitchExtractor.CrepeOnnxPitchExtractor import CrepeOnnxPitchExtractor
 from voice_changer.RVC.pitchExtractor.DioPitchExtractor import DioPitchExtractor
 from voice_changer.RVC.pitchExtractor.HarvestPitchExtractor import HarvestPitchExtractor
 from voice_changer.RVC.pitchExtractor.CrepePitchExtractor import CrepePitchExtractor
 from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
+from voice_changer.utils.VoiceChangerParams import VoiceChangerParams


 class PitchExtractorManager(Protocol):
     currentPitchExtractor: PitchExtractor | None = None
+    params: VoiceChangerParams
+
+    @classmethod
+    def initialize(cls, params: VoiceChangerParams):
+        cls.params = params

     @classmethod
     def getPitchExtractor(
-        cls, pitchExtractorType: EnumPitchExtractorTypes
+        cls, pitchExtractorType: PitchExtractorType, gpu: int
     ) -> PitchExtractor:
-        cls.currentPitchExtractor = cls.loadPitchExtractor(pitchExtractorType)
+        cls.currentPitchExtractor = cls.loadPitchExtractor(pitchExtractorType, gpu)
         return cls.currentPitchExtractor

     @classmethod
     def loadPitchExtractor(
-        cls, pitchExtractorType: EnumPitchExtractorTypes
+        cls, pitchExtractorType: PitchExtractorType, gpu: int
     ) -> PitchExtractor:
-        if (
-            pitchExtractorType == EnumPitchExtractorTypes.harvest
-            or pitchExtractorType == EnumPitchExtractorTypes.harvest.value
-        ):
+        if pitchExtractorType == "harvest":
             return HarvestPitchExtractor()
-        elif (
-            pitchExtractorType == EnumPitchExtractorTypes.dio
-            or pitchExtractorType == EnumPitchExtractorTypes.dio.value
-        ):
+        elif pitchExtractorType == "dio":
             return DioPitchExtractor()
-        elif (
-            pitchExtractorType == EnumPitchExtractorTypes.crepe
-            or pitchExtractorType == EnumPitchExtractorTypes.crepe.value
-        ):
+        elif pitchExtractorType == "crepe":
             return CrepePitchExtractor()
+        elif pitchExtractorType == "crepe_tiny":
+            return CrepeOnnxPitchExtractor(pitchExtractorType, cls.params.crepe_onnx_tiny, gpu)
+        elif pitchExtractorType == "crepe_full":
+            return CrepeOnnxPitchExtractor(pitchExtractorType, cls.params.crepe_onnx_full, gpu)
         else:
             # return hubert as default
             raise RuntimeError(
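Taken together with the RVC.py hunks above, the intended call sequence is: initialize once with the params that carry the ONNX weight paths, then request extractors by string key. A minimal sketch (the `voiceChangerParams` binding is hypothetical; in this commit it is the instance built in MMVCServerSIO.py):

    from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager

    PitchExtractorManager.initialize(voiceChangerParams)  # stores crepe_onnx_full/tiny paths
    extractor = PitchExtractorManager.getPitchExtractor("crepe_tiny", gpu=0)
    # Pipeline then calls:
    # extractor.extract(audio, pitchf, f0_up_key, sr, window, silence_front)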
@@ -0,0 +1,8 @@
+from . import decode  # NOQA
+from .core import *  # NOQA
+from . import convert  # NOQA
+from . import filter  # NOQA
+from . import load  # NOQA
+from . import loudness  # NOQA
+from .session import CrepeInferenceSession  # NOQA
+from . import threshold  # NOQA
server/voice_changer/RVC/pitchExtractor/onnxcrepe/convert.py (new file, 57 lines)
@@ -0,0 +1,57 @@
+import numpy as np
+import scipy
+
+from voice_changer.RVC.pitchExtractor import onnxcrepe
+
+
+###############################################################################
+# Pitch unit conversions
+###############################################################################
+
+
+def bins_to_cents(bins, apply_dither=False):
+    """Converts pitch bins to cents"""
+    cents = onnxcrepe.CENTS_PER_BIN * bins + 1997.3794084376191
+
+    # Trade quantization error for noise (disabled by default)
+    return dither(cents) if apply_dither else cents
+
+
+def bins_to_frequency(bins, apply_dither=False):
+    """Converts pitch bins to frequency in Hz"""
+    return cents_to_frequency(bins_to_cents(bins, apply_dither=apply_dither))
+
+
+def cents_to_bins(cents, quantize_fn=np.floor):
+    """Converts cents to pitch bins"""
+    bins = (cents - 1997.3794084376191) / onnxcrepe.CENTS_PER_BIN
+    return quantize_fn(bins).astype(np.int64)
+
+
+def cents_to_frequency(cents):
+    """Converts cents to frequency in Hz"""
+    return 10 * 2 ** (cents / 1200)
+
+
+def frequency_to_bins(frequency, quantize_fn=np.floor):
+    """Convert frequency in Hz to pitch bins"""
+    return cents_to_bins(frequency_to_cents(frequency), quantize_fn)
+
+
+def frequency_to_cents(frequency):
+    """Convert frequency in Hz to cents"""
+    return 1200 * np.log2(frequency / 10.)
+
+
+###############################################################################
+# Utilities
+###############################################################################
+
+
+def dither(cents):
+    """Dither the predicted pitch in cents to remove quantization error"""
+    noise = scipy.stats.triang.rvs(c=0.5,
+                                   loc=-onnxcrepe.CENTS_PER_BIN,
+                                   scale=2 * onnxcrepe.CENTS_PER_BIN,
+                                   size=cents.shape)
+    return cents + noise.astype(cents.dtype)
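A quick round trip through these conversions, useful as a sanity check (values approximate):

    import numpy as np
    from voice_changer.RVC.pitchExtractor import onnxcrepe

    bins = np.array([180])
    cents = onnxcrepe.convert.bins_to_cents(bins)     # 20*180 + 1997.379... ≈ 5597.4 cents
    hz = onnxcrepe.convert.cents_to_frequency(cents)  # 10 * 2**(5597.4/1200) ≈ 253.6 Hz
    back = onnxcrepe.convert.frequency_to_bins(hz)    # floor quantization recovers array([180])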
server/voice_changer/RVC/pitchExtractor/onnxcrepe/core.py (new file, 256 lines)
@@ -0,0 +1,256 @@
+import librosa
+import numpy as np
+
+from voice_changer.RVC.pitchExtractor import onnxcrepe
+
+__all__ = ['CENTS_PER_BIN',
+           'MAX_FMAX',
+           'PITCH_BINS',
+           'SAMPLE_RATE',
+           'WINDOW_SIZE',
+           'UNVOICED',
+           'predict',
+           'preprocess',
+           'infer',
+           'postprocess',
+           'resample']
+
+###############################################################################
+# Constants
+###############################################################################
+
+
+CENTS_PER_BIN = 20  # cents
+MAX_FMAX = 2006.  # hz
+PITCH_BINS = 360
+SAMPLE_RATE = 16000  # hz
+WINDOW_SIZE = 1024  # samples
+UNVOICED = np.nan
+
+
+###############################################################################
+# Crepe pitch prediction
+###############################################################################
+
+
+def predict(session,
+            audio,
+            sample_rate,
+            precision=None,
+            fmin=50.,
+            fmax=MAX_FMAX,
+            decoder=onnxcrepe.decode.weighted_viterbi,
+            return_periodicity=False,
+            batch_size=None,
+            pad=True):
+    """Performs pitch estimation
+
+    Arguments
+        session (onnxcrepe.CrepeInferenceSession)
+            An onnxruntime.InferenceSession holding the CREPE model
+        audio (numpy.ndarray [shape=(n_samples,)])
+            The audio signal
+        sample_rate (int)
+            The sampling rate in Hz
+        precision (float)
+            The precision in milliseconds, i.e. the length of each frame
+        fmin (float)
+            The minimum allowable frequency in Hz
+        fmax (float)
+            The maximum allowable frequency in Hz
+        decoder (function)
+            The decoder to use. See decode.py for decoders.
+        return_periodicity (bool)
+            Whether to also return the network confidence
+        batch_size (int)
+            The number of frames per batch
+        pad (bool)
+            Whether to zero-pad the audio
+
+    Returns
+        pitch (numpy.ndarray [shape=(1, 1 + int(time // precision))])
+        (Optional) periodicity (numpy.ndarray
+                                [shape=(1, 1 + int(time // precision))])
+    """
+
+    results = []
+
+    # Preprocess audio
+    generator = preprocess(audio,
+                           sample_rate,
+                           precision,
+                           batch_size,
+                           pad)
+    for frames in generator:
+
+        # Infer independent probabilities for each pitch bin
+        probabilities = infer(session, frames)  # shape=(batch, 360)
+
+        probabilities = probabilities.transpose(1, 0)[None]  # shape=(1, 360, batch)
+
+        # Convert probabilities to F0 and periodicity
+        result = postprocess(probabilities,
+                             fmin,
+                             fmax,
+                             decoder,
+                             return_periodicity)
+
+        # Place on same device as audio to allow very long inputs
+        if isinstance(result, tuple):
+            result = (result[0], result[1])
+
+        results.append(result)
+
+    # Split pitch and periodicity
+    if return_periodicity:
+        pitch, periodicity = zip(*results)
+        return np.concatenate(pitch, axis=1), np.concatenate(periodicity, axis=1)
+
+    # Concatenate
+    return np.concatenate(results, axis=1)
+
+
+def preprocess(audio,
+               sample_rate,
+               precision=None,
+               batch_size=None,
+               pad=True):
+    """Convert audio to model input
+
+    Arguments
+        audio (numpy.ndarray [shape=(time,)])
+            The audio signals
+        sample_rate (int)
+            The sampling rate in Hz
+        precision (float)
+            The precision in milliseconds, i.e. the length of each frame
+        batch_size (int)
+            The number of frames per batch
+        pad (bool)
+            Whether to zero-pad the audio
+
+    Returns
+        frames (numpy.ndarray [shape=(1 + int(time // precision), 1024)])
+    """
+    # Resample
+    if sample_rate != SAMPLE_RATE:
+        audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=SAMPLE_RATE)
+
+    # Default hop length of 10 ms
+    hop_length = SAMPLE_RATE / 100 if precision is None else SAMPLE_RATE * precision / 1000
+
+    # Get total number of frames
+
+    # Maybe pad
+    if pad:
+        total_frames = 1 + int(audio.shape[0] / hop_length)
+        audio = np.pad(
+            audio,
+            (WINDOW_SIZE // 2, WINDOW_SIZE // 2))
+    else:
+        total_frames = 1 + int((audio.shape[0] - WINDOW_SIZE) / hop_length)
+
+    # Default to running all frames in a single batch
+    batch_size = total_frames if batch_size is None else batch_size
+
+    # Generate batches
+    for i in range(0, total_frames, batch_size):
+        # Batch indices
+        start = max(0, int(i * hop_length))
+        end = min(audio.shape[0],
+                  int((i + batch_size - 1) * hop_length) + WINDOW_SIZE)
+
+        # Chunk
+        n_bytes = audio.strides[-1]
+        frames = np.lib.stride_tricks.as_strided(
+            audio[start:end],
+            shape=((end - start - WINDOW_SIZE) // int(hop_length) + 1, WINDOW_SIZE),
+            strides=(int(hop_length) * n_bytes, n_bytes))  # shape=(batch, 1024)
+
+        # Note:
+        # Z-score standardization operations originally located here
+        # (https://github.com/maxrmorrison/torchcrepe/blob/master/torchcrepe/core.py#L692)
+        # are wrapped into the ONNX models for hardware acceleration.
+
+        yield frames
+
+
+def infer(session, frames):
+    """Forward pass through the model
+
+    Arguments
+        session (onnxcrepe.CrepeInferenceSession)
+            An onnxruntime.InferenceSession holding the CREPE model
+        frames (numpy.ndarray [shape=(time / precision, 1024)])
+            The network input
+
+    Returns
+        logits (numpy.ndarray [shape=(1 + int(time // precision), 360)])
+    """
+    # Apply model
+    return session.run(None, {'frames': frames})[0]
+
+
+def postprocess(probabilities,
+                fmin=0.,
+                fmax=MAX_FMAX,
+                decoder=onnxcrepe.decode.weighted_viterbi,
+                return_periodicity=False):
+    """Convert model output to F0 and periodicity
+
+    Arguments
+        probabilities (numpy.ndarray [shape=(1, 360, time / precision)])
+            The probabilities for each pitch bin inferred by the network
+        fmin (float)
+            The minimum allowable frequency in Hz
+        fmax (float)
+            The maximum allowable frequency in Hz
+        decoder (function)
+            The decoder to use. See decode.py for decoders.
+        return_periodicity (bool)
+            Whether to also return the network confidence
+
+    Returns
+        pitch (numpy.ndarray [shape=(1, 1 + int(time // precision))])
+        periodicity (numpy.ndarray [shape=(1, 1 + int(time // precision))])
+    """
+    # Convert frequency range to pitch bin range
+    minidx = onnxcrepe.convert.frequency_to_bins(fmin)
+    maxidx = onnxcrepe.convert.frequency_to_bins(fmax, np.ceil)
+
+    # Remove frequencies outside allowable range
+    probabilities[:, :minidx] = float('-inf')
+    probabilities[:, maxidx:] = float('-inf')
+
+    # Perform argmax or viterbi sampling
+    bins, pitch = decoder(probabilities)
+
+    if not return_periodicity:
+        return pitch
+
+    # Compute periodicity from probabilities and decoded pitch bins
+    return pitch, periodicity(probabilities, bins)
+
+
+###############################################################################
+# Utilities
+###############################################################################
+
+
+def periodicity(probabilities, bins):
+    """Computes the periodicity from the network output and pitch bins"""
+    # shape=(time / precision, 360)
+    probs_stacked = probabilities.transpose(0, 2, 1).reshape(-1, PITCH_BINS)
+    # shape=(time / precision, 1)
+    bins_stacked = bins.reshape(-1, 1).astype(np.int64)
+
+    # Use maximum logit over pitch bins as periodicity
+    periodicity = np.take_along_axis(probs_stacked, bins_stacked, axis=1)
+
+    # shape=(batch, time / precision)
+    return periodicity.reshape(probabilities.shape[0], probabilities.shape[2])
+
+
+def resample(audio, sample_rate):
+    """Resample audio"""
+    return librosa.resample(audio, orig_sr=sample_rate, target_sr=onnxcrepe.SAMPLE_RATE)
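How predict() is meant to be driven end to end; a hedged sketch with a synthetic signal. Note that the voice changer itself bypasses CrepeInferenceSession and hands predict() a plain onnxruntime session built from the downloaded weight (see CrepeOnnxPitchExtractor above); the session class is only a convenience that expects an assets/<model>.onnx file on disk:

    import numpy as np
    from voice_changer.RVC.pitchExtractor import onnxcrepe

    # One second of a 220 Hz sine at the model's native 16 kHz rate.
    t = np.arange(16000, dtype=np.float32) / 16000
    audio = np.sin(2 * np.pi * 220 * t).astype(np.float32)

    session = onnxcrepe.CrepeInferenceSession(model='tiny')  # requires assets/tiny.onnx
    pitch, periodicity = onnxcrepe.predict(
        session, audio, 16000,
        precision=10.0,              # 10 ms hop -> about 101 frames
        return_periodicity=True,
        decoder=onnxcrepe.decode.weighted_argmax)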
server/voice_changer/RVC/pitchExtractor/onnxcrepe/decode.py (new file, 80 lines)
@@ -0,0 +1,80 @@
+import librosa
+import numpy as np
+
+from voice_changer.RVC.pitchExtractor import onnxcrepe
+
+###############################################################################
+# Probability sequence decoding methods
+###############################################################################
+
+
+def argmax(logits):
+    """Sample observations by taking the argmax"""
+    bins = logits.argmax(axis=1)
+
+    # Convert to frequency in Hz
+    return bins, onnxcrepe.convert.bins_to_frequency(bins)
+
+
+def weighted_argmax(logits: np.ndarray):
+    """Sample observations using weighted sum near the argmax"""
+    # Find center of analysis window
+    bins = logits.argmax(axis=1)
+
+    return bins, _apply_weights(logits, bins)
+
+
+def viterbi(logits):
+    """Sample observations using viterbi decoding"""
+    # Create viterbi transition matrix
+    if not hasattr(viterbi, 'transition'):
+        xx, yy = np.meshgrid(range(360), range(360))
+        transition = np.maximum(12 - abs(xx - yy), 0)
+        transition = transition / transition.sum(axis=1, keepdims=True)
+        viterbi.transition = transition
+
+    # Normalize logits (softmax)
+    logits -= logits.max(axis=1)
+    exp = np.exp(logits)
+    probs = exp / np.sum(exp, axis=1)
+
+    # Perform viterbi decoding
+    bins = np.array([
+        librosa.sequence.viterbi(sequence, viterbi.transition).astype(np.int64)
+        for sequence in probs])
+
+    # Convert to frequency in Hz
+    return bins, onnxcrepe.convert.bins_to_frequency(bins)
+
+
+def weighted_viterbi(logits):
+    """Sample observations combining viterbi decoding and weighted argmax"""
+    bins, _ = viterbi(logits)
+
+    return bins, _apply_weights(logits, bins)
+
+
+def _apply_weights(logits, bins):
+    # Find bounds of analysis window
+    start = np.maximum(0, bins - 4)
+    end = np.minimum(logits.shape[1], bins + 5)
+
+    # Mask out everything outside of window
+    for batch in range(logits.shape[0]):
+        for time in range(logits.shape[2]):
+            logits[batch, :start[batch, time], time] = float('-inf')
+            logits[batch, end[batch, time]:, time] = float('-inf')
+
+    # Construct weights
+    if not hasattr(_apply_weights, 'weights'):
+        weights = onnxcrepe.convert.bins_to_cents(np.arange(360))
+        _apply_weights.weights = weights[None, :, None]
+
+    # Convert to probabilities (ReLU)
+    probs = np.maximum(0, logits)
+
+    # Apply weights
+    cents = (_apply_weights.weights * probs).sum(axis=1) / probs.sum(axis=1)
+
+    # Convert to frequency in Hz
+    return onnxcrepe.convert.cents_to_frequency(cents)
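All decoders take logits of shape (batch, 360, time) and return (bins, frequency). A toy example with energy concentrated on one bin (illustrative only):

    import numpy as np
    from voice_changer.RVC.pitchExtractor import onnxcrepe

    logits = np.full((1, 360, 3), -10.0, dtype=np.float32)
    logits[0, 180, :] = 5.0

    bins, hz = onnxcrepe.decode.argmax(logits)                  # bins == [[180, 180, 180]]
    bins, hz = onnxcrepe.decode.weighted_argmax(logits.copy())  # weighted sum within ±4 bins;
                                                                # copy() because it masks in place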
server/voice_changer/RVC/pitchExtractor/onnxcrepe/filter.py (new file, 125 lines)
@@ -0,0 +1,125 @@
+import numpy as np
+
+
+###############################################################################
+# Sequence filters
+###############################################################################
+
+
+def mean(signals, win_length=9):
+    """Averave filtering for signals containing nan values
+
+    Arguments
+        signals (numpy.ndarray (shape=(batch, time)))
+            The signals to filter
+        win_length
+            The size of the analysis window
+
+    Returns
+        filtered (numpy.ndarray (shape=(batch, time)))
+    """
+    return nanfilter(signals, win_length, nanmean)
+
+
+def median(signals, win_length):
+    """Median filtering for signals containing nan values
+
+    Arguments
+        signals (numpy.ndarray (shape=(batch, time)))
+            The signals to filter
+        win_length
+            The size of the analysis window
+
+    Returns
+        filtered (numpy.ndarray (shape=(batch, time)))
+    """
+    return nanfilter(signals, win_length, nanmedian)
+
+
+###############################################################################
+# Utilities
+###############################################################################
+
+
+def nanfilter(signals, win_length, filter_fn):
+    """Filters a sequence, ignoring nan values
+
+    Arguments
+        signals (numpy.ndarray (shape=(batch, time)))
+            The signals to filter
+        win_length
+            The size of the analysis window
+        filter_fn (function)
+            The function to use for filtering
+
+    Returns
+        filtered (numpy.ndarray (shape=(batch, time)))
+    """
+    # Output buffer
+    filtered = np.empty_like(signals)
+
+    # Loop over frames
+    for i in range(signals.shape[1]):
+
+        # Get analysis window bounds
+        start = max(0, i - win_length // 2)
+        end = min(signals.shape[1], i + win_length // 2 + 1)
+
+        # Apply filter to window
+        filtered[:, i] = filter_fn(signals[:, start:end])
+
+    return filtered
+
+
+def nanmean(signals):
+    """Computes the mean, ignoring nans
+
+    Arguments
+        signals (numpy.ndarray [shape=(batch, time)])
+            The signals to filter
+
+    Returns
+        filtered (numpy.ndarray [shape=(batch, time)])
+    """
+    signals = signals.clone()
+
+    # Find nans
+    nans = np.isnan(signals)
+
+    # Set nans to 0.
+    signals[nans] = 0.
+
+    # Compute average
+    return signals.sum(axis=1) / (~nans).astype(np.float32).sum(axis=1)
+
+
+def nanmedian(signals):
+    """Computes the median, ignoring nans
+
+    Arguments
+        signals (numpy.ndarray [shape=(batch, time)])
+            The signals to filter
+
+    Returns
+        filtered (numpy.ndarray [shape=(batch, time)])
+    """
+    # Find nans
+    nans = np.isnan(signals)
+
+    # Compute median for each slice
+    medians = [nanmedian1d(signal[~nan]) for signal, nan in zip(signals, nans)]
+
+    # Stack results
+    return np.array(medians, dtype=signals.dtype)
+
+
+def nanmedian1d(signal):
+    """Computes the median. If signal is empty, returns torch.nan
+
+    Arguments
+        signal (numpy.ndarray [shape=(time,)])
+
+    Returns
+        median (numpy.ndarray [shape=(1,)])
+    """
+    return np.median(signal) if signal.size else np.nan
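These are the nan-aware filters CrepeOnnxPitchExtractor applies to both F0 and periodicity (median, window 3). A small example; note that mean() ultimately calls .clone() on its input, a torch method numpy arrays do not provide, so the median path is the one exercised by this commit:

    import numpy as np
    from voice_changer.RVC.pitchExtractor import onnxcrepe

    f0 = np.array([[100.0, np.nan, 102.0, 200.0, 101.0]])
    smoothed = onnxcrepe.filter.median(f0, 3)
    # nan frames are filled from their neighbours (frame 1 -> 101.0), and the
    # 200 Hz spike is suppressed: frame 3 -> median(102, 200, 101) = 102.0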
server/voice_changer/RVC/pitchExtractor/onnxcrepe/load.py (new file, 12 lines)
@@ -0,0 +1,12 @@
+import librosa
+import numpy as np
+
+
+def audio(filename):
+    """Load audio from disk"""
+    samples, sr = librosa.load(filename, sr=None)
+    if len(samples.shape) > 1:
+        # To mono
+        samples = np.mean(samples, axis=1)
+
+    return samples, sr
@@ -0,0 +1,73 @@
+import warnings
+
+import librosa
+import numpy as np
+
+from voice_changer.RVC.pitchExtractor import onnxcrepe
+
+
+###############################################################################
+# Constants
+###############################################################################
+
+
+# Minimum decibel level
+MIN_DB = -100.
+
+# Reference decibel level
+REF_DB = 20.
+
+
+###############################################################################
+# A-weighted loudness
+###############################################################################
+
+
+def a_weighted(audio, sample_rate, hop_length=None, pad=True):
+    """Retrieve the per-frame loudness"""
+
+    # Default hop length of 10 ms
+    hop_length = sample_rate // 100 if hop_length is None else hop_length
+
+    # Convert to numpy
+    audio = audio.squeeze(0)
+
+    # Resample
+    if sample_rate != onnxcrepe.SAMPLE_RATE:
+        audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=onnxcrepe.SAMPLE_RATE)
+        hop_length = int(hop_length * onnxcrepe.SAMPLE_RATE / sample_rate)
+
+    # Cache weights
+    if not hasattr(a_weighted, 'weights'):
+        a_weighted.weights = perceptual_weights()
+
+    # Take stft
+    stft = librosa.stft(audio,
+                        n_fft=onnxcrepe.WINDOW_SIZE,
+                        hop_length=hop_length,
+                        win_length=onnxcrepe.WINDOW_SIZE,
+                        center=pad,
+                        pad_mode='constant')
+
+    # Compute magnitude on db scale
+    db = librosa.amplitude_to_db(np.abs(stft))
+
+    # Apply A-weighting
+    weighted = db + a_weighted.weights
+
+    # Threshold
+    weighted[weighted < MIN_DB] = MIN_DB
+
+    # Average over weighted frequencies
+    return weighted.mean(axis=0).astype(np.float32)[None]
+
+
+def perceptual_weights():
+    """A-weighted frequency-dependent perceptual loudness weights"""
+    frequencies = librosa.fft_frequencies(sr=onnxcrepe.SAMPLE_RATE,
+                                          n_fft=onnxcrepe.WINDOW_SIZE)
+
+    # A warning is raised for nearly inaudible frequencies, but it ends up
+    # defaulting to -100 db. That default is fine for our purposes.
+    with warnings.catch_warnings():
+        warnings.simplefilter('ignore', RuntimeWarning)
+        return librosa.A_weighting(frequencies)[:, None] - REF_DB
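a_weighted() expects a (1, n_samples) array (it squeezes axis 0 before the STFT) and returns per-frame A-weighted loudness in dB, floored at MIN_DB; a minimal sketch:

    import numpy as np
    from voice_changer.RVC.pitchExtractor import onnxcrepe

    audio = np.zeros((1, 16000), dtype=np.float32)       # 1 s of silence
    loud = onnxcrepe.loudness.a_weighted(audio, 16000)   # shape (1, ~101), clamped at -100 dB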
@@ -0,0 +1 @@
+modules in this folder from https://github.com/yqzhishen/onnxcrepe at ca7e5d7f2dfca5cc4d99e8d546b00793ca4e7157
@@ -0,0 +1,9 @@
+import os
+
+import onnxruntime as ort
+
+
+class CrepeInferenceSession(ort.InferenceSession):
+    def __init__(self, model='full', sess_options=None, providers=None, provider_options=None, **kwargs):
+        model_path = os.path.join(os.path.dirname(__file__), 'assets', f'{model}.onnx')
+        super().__init__(model_path, sess_options, providers, provider_options, **kwargs)
server/voice_changer/RVC/pitchExtractor/onnxcrepe/threshold.py (new file, 129 lines)
@@ -0,0 +1,129 @@
+import numpy as np
+
+from voice_changer.RVC.pitchExtractor import onnxcrepe
+
+
+###############################################################################
+# Pitch thresholding methods
+###############################################################################
+
+
+class At:
+    """Simple thresholding at a specified probability value"""
+
+    def __init__(self, value):
+        self.value = value
+
+    def __call__(self, pitch, periodicity):
+        # Make a copy to prevent in-place modification
+        pitch = pitch.copy()
+
+        # Threshold
+        pitch[periodicity < self.value] = onnxcrepe.UNVOICED
+        return pitch
+
+
+class Hysteresis:
+    """Hysteresis thresholding"""
+
+    def __init__(self,
+                 lower_bound=.19,
+                 upper_bound=.31,
+                 width=.2,
+                 stds=1.7,
+                 return_threshold=False):
+        self.lower_bound = lower_bound
+        self.upper_bound = upper_bound
+        self.width = width
+        self.stds = stds
+        self.return_threshold = return_threshold
+
+    def __call__(self, pitch, periodicity):
+
+        # Perform hysteresis in log-2 space
+        pitch = np.log2(pitch).flatten()
+
+        # Flatten periodicity
+        periodicity = periodicity.flatten()
+
+        # Ignore confidently unvoiced pitch
+        pitch[periodicity < self.lower_bound] = onnxcrepe.UNVOICED
+
+        # Whiten pitch
+        mean, std = np.nanmean(pitch), np.nanstd(pitch)
+        pitch = (pitch - mean) / std
+
+        # Require high confidence to make predictions far from the mean
+        parabola = self.width * pitch ** 2 - self.width * self.stds ** 2
+        threshold = self.lower_bound + np.clip(parabola, 0, 1 - self.lower_bound)
+        threshold[np.isnan(threshold)] = self.lower_bound
+
+        # Apply hysteresis to prevent short, unconfident voiced regions
+        i = 0
+        while i < len(periodicity) - 1:
+
+            # Detect unvoiced to voiced transition
+            if periodicity[i] < threshold[i] and periodicity[i + 1] > threshold[i + 1]:
+
+                # Grow region until next unvoiced or end of array
+                start, end, keep = i + 1, i + 1, False
+                while end < len(periodicity) and periodicity[end] > threshold[end]:
+                    if periodicity[end] > self.upper_bound:
+                        keep = True
+                    end += 1
+
+                # Force unvoiced if we didn't pass the confidence required by
+                # the hysteresis
+                if not keep:
+                    threshold[start:end] = 1
+
+                i = end
+
+            else:
+                i += 1
+
+        # Remove pitch with low periodicity
+        pitch[periodicity < threshold] = onnxcrepe.UNVOICED
+
+        # Unwhiten
+        pitch = pitch * std + mean
+
+        # Convert to Hz
+        pitch = np.array(2 ** pitch)[None, :]
+
+        # Optionally return threshold
+        if self.return_threshold:
+            return pitch, np.array(threshold)
+
+        return pitch
+
+
+###############################################################################
+# Periodicity thresholding methods
+###############################################################################
+
+
+class Silence:
+    """Set periodicity to zero in silent regions"""
+
+    def __init__(self, value=-60):
+        self.value = value
+
+    def __call__(self,
+                 periodicity,
+                 audio,
+                 sample_rate=onnxcrepe.SAMPLE_RATE,
+                 precision=None,
+                 pad=True):
+        # Don't modify in-place
+        periodicity = periodicity.copy()
+
+        # Compute loudness
+        hop_length = sample_rate * precision // 1000
+        loudness = onnxcrepe.loudness.a_weighted(
+            audio, sample_rate, hop_length, pad)
+
+        # Threshold silence
+        periodicity[loudness < self.value] = 0.
+
+        return periodicity
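None of these classes are wired into CrepeOnnxPitchExtractor, which instead zeroes F0 directly where periodicity < 0.1; they mirror torchcrepe's optional post-processing. The simplest one in action (illustrative):

    import numpy as np
    from voice_changer.RVC.pitchExtractor import onnxcrepe

    pitch = np.array([[220.0, 230.0, 225.0]])
    periodicity = np.array([[0.9, 0.05, 0.8]])
    onnxcrepe.threshold.At(0.1)(pitch, periodicity)
    # -> [[220., nan, 225.]]  (frame 1 marked UNVOICED; input untouched via copy())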
@@ -12,3 +12,5 @@ class VoiceChangerParams:
     hubert_soft: str
     nsf_hifigan: str
     sample_mode: str
+    crepe_onnx_full: str
+    crepe_onnx_tiny: str