New Features:

- Add Crepe Full/Tiny (ONNX)
- Remove test connection for local (non-HTTPS) launch
Refactor:
- RVC: comment out module importer
w-okada 2023-07-07 02:17:29 +09:00
parent 099f82cc60
commit d5561c2212
32 changed files with 2068 additions and 117 deletions

View File

@@ -21,7 +21,7 @@
{
"name": "configArea",
"options": {
"detectors": ["dio", "harvest", "crepe"],
"detectors": ["dio", "harvest", "crepe", "crepe_full", "crepe_tiny"],
"inputChunkNums": [8, 16, 24, 32, 40, 48, 64, 80, 96, 112, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 2048]
}
}

View File

@@ -1 +1,10 @@
<!doctype html><html style="width:100%;height:100%;overflow:hidden"><head><meta charset="utf-8"/><title>Voice Changer Client Demo</title><script defer="defer" src="index.js"></script></head><body style="width:100%;height:100%;margin:0"><div id="app" style="width:100%;height:100%"></div></body></html>
<!DOCTYPE html>
<html style="width: 100%; height: 100%; overflow: hidden">
<head>
<meta charset="utf-8" />
<title>Voice Changer Client Demo</title>
<script defer src="index.js"></script></head>
<body style="width: 100%; height: 100%; margin: 0px">
<div id="app" style="width: 100%; height: 100%"></div>
</body>
</html>

File diff suppressed because one or more lines are too long

View File

@@ -1,31 +0,0 @@
/*! regenerator-runtime -- Copyright (c) 2014-present, Facebook, Inc. -- license (MIT): https://github.com/facebook/regenerator/blob/main/LICENSE */
/**
* @license React
* react-dom.production.min.js
*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
/**
* @license React
* react.production.min.js
*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
/**
* @license React
* scheduler.production.min.js
*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/

View File

@@ -21,7 +21,7 @@
{
"name": "configArea",
"options": {
"detectors": ["dio", "harvest", "crepe"],
"detectors": ["dio", "harvest", "crepe", "crepe_full", "crepe_tiny"],
"inputChunkNums": [8, 16, 24, 32, 40, 48, 64, 80, 96, 112, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 2048]
}
}

View File

@@ -42,8 +42,9 @@ export type CrossFadeOverlapSize = typeof CrossFadeOverlapSize[keyof typeof Cros
export const F0Detector = {
"dio": "dio",
"harvest": "harvest",
// "parselmouth": "parselmouth",
"crepe": "crepe",
"crepe_full": "crepe_full",
"crepe_tiny": "crepe_tiny",
} as const
export type F0Detector = typeof F0Detector[keyof typeof F0Detector]

View File

@@ -34,6 +34,7 @@ def setupArgParser():
parser.add_argument("--logLevel", type=str, default="error", help="Log level info|critical|error. (default: error)")
parser.add_argument("-p", type=int, default=18888, help="port")
parser.add_argument("--https", type=strtobool, default=False, help="use https")
parser.add_argument("--test_connect", type=str, default="8.8.8.8", help="test connect to detect ip in https mode. default 8.8.8.8")
parser.add_argument("--httpsKey", type=str, default="ssl.key", help="path for the key of https")
parser.add_argument("--httpsCert", type=str, default="ssl.cert", help="path for the cert of https")
parser.add_argument("--httpsSelfSigned", type=strtobool, default=True, help="generate self-signed certificate")
@@ -48,6 +49,8 @@ def setupArgParser():
parser.add_argument("--hubert_base_jp", type=str, help="path to hubert_base_jp model(pytorch)")
parser.add_argument("--hubert_soft", type=str, help="path to hubert_soft model(pytorch)")
parser.add_argument("--nsf_hifigan", type=str, help="path to nsf_hifigan model(pytorch)")
parser.add_argument("--crepe_onnx_full", type=str, help="path to crepe_onnx_full")
parser.add_argument("--crepe_onnx_tiny", type=str, help="path to crepe_onnx_tiny")
return parser
@@ -85,6 +88,9 @@ voiceChangerParams = VoiceChangerParams(
hubert_base_jp=args.hubert_base_jp,
hubert_soft=args.hubert_soft,
nsf_hifigan=args.nsf_hifigan,
crepe_onnx_full=args.crepe_onnx_full,
crepe_onnx_tiny=args.crepe_onnx_tiny,
sample_mode=args.sample_mode,
)
@@ -120,6 +126,7 @@ if __name__ == "__mp_main__":
if __name__ == "__main__":
mp.freeze_support()
printMessage(f"PYTHON:{sys.version}", level=2)
printMessage("Voice Changerを起動しています。", level=2)
# ダウンロード(Weight)
try:
@@ -195,10 +202,10 @@ if __name__ == "__main__":
else:
printMessage(f"http://localhost:{EX_PORT}/", level=1)
else: # launched directly with python
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
s.connect(("8.8.8.8", 80))
hostname = s.getsockname()[0]
if args.https == 1:
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
s.connect((args.test_connect, 80))
hostname = s.getsockname()[0]
printMessage(f"https://localhost:{PORT}/", level=1)
printMessage(f"https://{hostname}:{PORT}/", level=1)
else:
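
The hunk above makes two changes: the LAN-IP probe now runs only in HTTPS mode, and the probe host is configurable via --test_connect. A minimal standalone sketch of the trick, using the 8.8.8.8 default from the diff:

import socket

# "Connecting" a UDP socket sends no packets; it only selects a route,
# so getsockname() reveals the local IP the OS would use to reach the host.
test_connect = "8.8.8.8"  # overridable with --test_connect
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
s.connect((test_connect, 80))
hostname = s.getsockname()[0]
s.close()
print(hostname)  # e.g. 192.168.x.x on a typical LAN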

View File

@@ -73,15 +73,13 @@ class EnumInferenceTypes(Enum):
onnxRVCNono = "onnxRVCNono"
class EnumPitchExtractorTypes(Enum):
harvest = "harvest"
dio = "dio"
crepe = "crepe"
class EnumFrameworkTypes(Enum):
pyTorch = "pyTorch"
onnx = "onnx"
PitchExtractorType: TypeAlias = Literal[
"harvest",
"dio",
"crepe",
"crepe_full",
"crepe_tiny",
]
class ServerAudioDeviceTypes(Enum):

View File

@@ -11,6 +11,8 @@ def downloadWeight(voiceChangerParams: VoiceChangerParams):
hubert_base_jp = voiceChangerParams.hubert_base_jp
hubert_soft = voiceChangerParams.hubert_soft
nsf_hifigan = voiceChangerParams.nsf_hifigan
crepe_onnx_full = voiceChangerParams.crepe_onnx_full
crepe_onnx_tiny = voiceChangerParams.crepe_onnx_tiny
# file exists check (currently only for rvc)
downloadParams = []
@@ -57,6 +59,24 @@ def downloadWeight(voiceChangerParams: VoiceChangerParams):
}
)
if os.path.exists(crepe_onnx_full) is False:
downloadParams.append(
{
"url": "https://huggingface.co/wok000/weights/resolve/main/crepe/onnx/full.onnx",
"saveTo": crepe_onnx_full,
"position": 5,
}
)
if os.path.exists(crepe_onnx_tiny) is False:
downloadParams.append(
{
"url": "https://huggingface.co/wok000/weights/resolve/main/crepe/onnx/tiny.onnx",
"saveTo": crepe_onnx_tiny,
"position": 6,
}
)
with ThreadPoolExecutor() as pool:
pool.map(download, downloadParams)

View File

@@ -300,5 +300,5 @@ class MMVCv15:
if file_path.find(remove_path + os.path.sep) >= 0:
# print("remove", key, file_path)
sys.modules.pop(key)
except: # type:ignore
except: # NOQA
pass

View File

@@ -1,5 +1,5 @@
import sys
import os
# import sys
# import os
from dataclasses import asdict
import numpy as np
import torch
@@ -7,18 +7,18 @@ import torchaudio
from data.ModelSlot import RVCModelSlot
# avoiding parse arg error in RVC
sys.argv = ["MMVCServerSIO.py"]
# # avoiding parse arg error in RVC
# sys.argv = ["MMVCServerSIO.py"]
if sys.platform.startswith("darwin"):
baseDir = [x for x in sys.path if x.endswith("Contents/MacOS")]
if len(baseDir) != 1:
print("baseDir should be only one ", baseDir)
sys.exit()
modulePath = os.path.join(baseDir[0], "RVC")
sys.path.append(modulePath)
else:
sys.path.append("RVC")
# if sys.platform.startswith("darwin"):
# baseDir = [x for x in sys.path if x.endswith("Contents/MacOS")]
# if len(baseDir) != 1:
# print("baseDir should be only one ", baseDir)
# sys.exit()
# modulePath = os.path.join(baseDir[0], "RVC")
# sys.path.append(modulePath)
# else:
# sys.path.append("RVC")
from voice_changer.RVC.RVCSettings import RVCSettings
@@ -39,9 +39,10 @@ class RVC(VoiceChangerModel):
print("[Voice Changer] [RVC] Creating instance ")
self.deviceManager = DeviceManager.get_instance()
EmbedderManager.initialize(params)
PitchExtractorManager.initialize(params)
self.settings = RVCSettings()
self.params = params
self.pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector)
self.pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu)
self.pipeline: Pipeline | None = None
@@ -76,7 +77,7 @@
elif key in self.settings.strData:
setattr(self.settings, key, str(val))
if key == "f0Detector" and self.pipeline is not None:
pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector)
pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu)
self.pipeline.setPitchExtractor(pitchExtractor)
else:
return False
@@ -112,7 +113,7 @@
self.audio_buffer = newData
if self.slotInfo.f0:
self.pitchf_buffer = np.zeros(new_feature_length)
self.feature_buffer = np.zeros([new_feature_length, self.slotInfo.embChannels])
self.feature_buffer = np.zeros([new_feature_length, self.slotInfo.embChannels])
convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
@@ -201,21 +202,21 @@
def __del__(self):
del self.pipeline
print("---------- REMOVING ---------------")
# print("---------- REMOVING ---------------")
remove_path = os.path.join("RVC")
sys.path = [x for x in sys.path if x.endswith(remove_path) is False]
# remove_path = os.path.join("RVC")
# sys.path = [x for x in sys.path if x.endswith(remove_path) is False]
for key in list(sys.modules):
val = sys.modules.get(key)
try:
file_path = val.__file__
if file_path.find("RVC" + os.path.sep) >= 0:
# print("remove", key, file_path)
sys.modules.pop(key)
except Exception: # type:ignore
# print(e)
pass
# for key in list(sys.modules):
# val = sys.modules.get(key)
# try:
# file_path = val.__file__
# if file_path.find("RVC" + os.path.sep) >= 0:
# # print("remove", key, file_path)
# sys.modules.pop(key)
# except Exception: # type:ignore
# # print(e)
# pass
def export2onnx(self):
modelSlot = self.slotInfo

View File

@@ -4,6 +4,7 @@ from const import EnumInferenceTypes
from voice_changer.RVC.inferencer.OnnxRVCInferencer import OnnxRVCInferencer
class OnnxRVCInferencerNono(OnnxRVCInferencer):
def loadModel(self, file: str, gpu: int):
super().loadModel(file, gpu)

View File

@@ -73,9 +73,9 @@ class Pipeline(object):
def exec(
self,
sid,
audio, # torch.tensor [n]
pitchf, # np.array [m]
feature, # np.array [m, feat]
audio, # torch.tensor [n]
pitchf, # np.array [m]
feature, # np.array [m, feat]
f0_up_key,
index_rate,
if_f0,
@@ -208,14 +208,13 @@
# apply silent front for inference
if type(self.inferencer) in [OnnxRVCInferencer, OnnxRVCInferencerNono]:
npyOffset = math.floor(silence_front * 16000) // 360
feats = feats[:, npyOffset * 2 :, :]
feats = feats[:, npyOffset * 2 :, :] # NOQA
feats_len = feats.shape[1]
if pitch is not None and pitchf is not None:
pitch = pitch[:, -feats_len:]
pitchf = pitchf[:, -feats_len:]
p_len = torch.tensor([feats_len], device=self.device).long()
# Run inference
try:
with torch.no_grad():

View File

@@ -34,7 +34,7 @@ def createPipeline(modelSlot: RVCModelSlot, gpu: int, f0Detector: str):
traceback.print_exc()
# pitchExtractor
pitchExtractor = PitchExtractorManager.getPitchExtractor(f0Detector)
pitchExtractor = PitchExtractorManager.getPitchExtractor(f0Detector, gpu)
# index, feature
index = _loadIndex(modelSlot)
@@ -67,7 +67,7 @@ def _loadIndex(modelSlot: RVCModelSlot):
try:
print("Try loading...", modelSlot.indexFile)
index = faiss.read_index(modelSlot.indexFile)
except:
except: # NOQA
print("[Voice Changer] load index failed. Use no index.")
traceback.print_exc()
return None

View File

@@ -0,0 +1,68 @@
import numpy as np
from const import PitchExtractorType
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
import onnxruntime
from voice_changer.RVC.pitchExtractor import onnxcrepe
class CrepeOnnxPitchExtractor(PitchExtractor):
def __init__(self, pitchExtractorType: PitchExtractorType, file: str, gpu: int):
self.pitchExtractorType = pitchExtractorType
super().__init__()
(
onnxProviders,
onnxProviderOptions,
) = DeviceManager.get_instance().getOnnxExecutionProvider(gpu)
self.onnx_session = onnxruntime.InferenceSession(
file, providers=onnxProviders, provider_options=onnxProviderOptions
)
def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
n_frames = int(len(audio) // window) + 1
start_frame = int(silence_front * sr / window)
real_silence_front = start_frame * window / sr
silence_front_offset = int(np.round(real_silence_front * sr))
audio = audio[silence_front_offset:]
f0_min = 50
f0_max = 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
precision = 10.0
audio_num = audio.cpu()
onnx_f0, onnx_pd = onnxcrepe.predict(
self.onnx_session,
audio_num,
sr,
precision=precision,
fmin=f0_min,
fmax=f0_max,
batch_size=256,
return_periodicity=True,
decoder=onnxcrepe.decode.weighted_argmax,
)
f0 = onnxcrepe.filter.median(onnx_f0, 3)
pd = onnxcrepe.filter.median(onnx_pd, 3)
f0[pd < 0.1] = 0
f0 = f0.squeeze()
f0 = np.pad(f0, (start_frame, n_frames - f0.shape[0] - start_frame), 'constant', constant_values=(0, 0))
f0 *= pow(2, f0_up_key / 12)
pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]]
f0bak = pitchf.copy()
f0_mel = 1127.0 * np.log(1.0 + f0bak / 700.0)
f0_mel = np.clip(
(f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0
)
pitch_coarse = f0_mel.astype(int)
return pitch_coarse, pitchf
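
The tail of extract() above quantizes f0 into 255 coarse mel-spaced bins. A standalone check of that mapping, with illustrative frequencies:

import numpy as np

f0_min, f0_max = 50, 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)

f0 = np.array([0.0, 110.0, 440.0])  # Hz; 0 marks unvoiced frames
f0_mel = 1127.0 * np.log(1.0 + f0 / 700.0)
coarse = np.clip(
    (f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0
).astype(int)
print(coarse)  # [  1  23 122]: unvoiced clamps to bin 1, 440 Hz lands mid-range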

View File

@@ -1,16 +1,16 @@
import torchcrepe
import torch
import numpy as np
from const import EnumPitchExtractorTypes
from const import PitchExtractorType
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
class CrepePitchExtractor(PitchExtractor):
pitchExtractorType: EnumPitchExtractorTypes = EnumPitchExtractorTypes.crepe
def __init__(self):
super().__init__()
self.pitchExtractorType: PitchExtractorType = "crepe"
if torch.cuda.is_available():
self.device = torch.device("cuda:" + str(torch.cuda.current_device()))
else:

View File

@@ -1,16 +1,19 @@
import pyworld
import numpy as np
from const import EnumPitchExtractorTypes
from const import PitchExtractorType
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
class DioPitchExtractor(PitchExtractor):
pitchExtractorType: EnumPitchExtractorTypes = EnumPitchExtractorTypes.dio
def __init__(self):
super().__init__()
self.pitchExtractorType: PitchExtractorType = "dio"
def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
audio = audio.detach().cpu().numpy()
n_frames = int(len(audio) // window) + 1
n_frames = int(len(audio) // window) + 1 # NOQA
start_frame = int(silence_front * sr / window)
real_silence_front = start_frame * window / sr
@@ -31,7 +34,7 @@ class DioPitchExtractor(PitchExtractor):
frame_period=10,
)
f0 = pyworld.stonemask(audio.astype(np.double), _f0, t, sr)
# f0 = np.pad(f0.astype("float"), (start_frame, n_frames - len(f0) - start_frame))
# f0 = np.pad(f0.astype("float"), (start_frame, n_frames - len(f0) - start_frame))
f0 *= pow(2, f0_up_key / 12)
pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]]
@@ -45,4 +48,3 @@
pitch_coarse = np.rint(f0_mel).astype(int)
return pitch_coarse, pitchf

View File

@@ -1,17 +1,20 @@
import pyworld
import numpy as np
import scipy.signal as signal
from const import EnumPitchExtractorTypes
from const import PitchExtractorType
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
class HarvestPitchExtractor(PitchExtractor):
pitchExtractorType: EnumPitchExtractorTypes = EnumPitchExtractorTypes.harvest
def __init__(self):
super().__init__()
self.pitchExtractorType: PitchExtractorType = "harvest"
def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
audio = audio.detach().cpu().numpy()
n_frames = int(len(audio) // window) + 1
n_frames = int(len(audio) // window) + 1 # NOQA
start_frame = int(silence_front * sr / window)
real_silence_front = start_frame * window / sr

View File

@@ -1,14 +1,12 @@
from typing import Protocol
from const import EnumPitchExtractorTypes
class PitchExtractor(Protocol):
pitchExtractorType: EnumPitchExtractorTypes = EnumPitchExtractorTypes.harvest
def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
...
def getPitchExtractorInfo(self):
return {
"pitchExtractorType": self.pitchExtractorType.value,
"pitchExtractorType": self.pitchExtractorType,
}

View File

@@ -1,40 +1,42 @@
from typing import Protocol
from const import EnumPitchExtractorTypes
from const import PitchExtractorType
from voice_changer.RVC.pitchExtractor.CrepeOnnxPitchExtractor import CrepeOnnxPitchExtractor
from voice_changer.RVC.pitchExtractor.DioPitchExtractor import DioPitchExtractor
from voice_changer.RVC.pitchExtractor.HarvestPitchExtractor import HarvestPitchExtractor
from voice_changer.RVC.pitchExtractor.CrepePitchExtractor import CrepePitchExtractor
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
class PitchExtractorManager(Protocol):
currentPitchExtractor: PitchExtractor | None = None
params: VoiceChangerParams
@classmethod
def initialize(cls, params: VoiceChangerParams):
cls.params = params
@classmethod
def getPitchExtractor(
cls, pitchExtractorType: EnumPitchExtractorTypes
cls, pitchExtractorType: PitchExtractorType, gpu: int
) -> PitchExtractor:
cls.currentPitchExtractor = cls.loadPitchExtractor(pitchExtractorType)
cls.currentPitchExtractor = cls.loadPitchExtractor(pitchExtractorType, gpu)
return cls.currentPitchExtractor
@classmethod
def loadPitchExtractor(
cls, pitchExtractorType: EnumPitchExtractorTypes
cls, pitchExtractorType: PitchExtractorType, gpu: int
) -> PitchExtractor:
if (
pitchExtractorType == EnumPitchExtractorTypes.harvest
or pitchExtractorType == EnumPitchExtractorTypes.harvest.value
):
if pitchExtractorType == "harvest":
return HarvestPitchExtractor()
elif (
pitchExtractorType == EnumPitchExtractorTypes.dio
or pitchExtractorType == EnumPitchExtractorTypes.dio.value
):
elif pitchExtractorType == "dio":
return DioPitchExtractor()
elif (
pitchExtractorType == EnumPitchExtractorTypes.crepe
or pitchExtractorType == EnumPitchExtractorTypes.crepe.value
):
elif pitchExtractorType == "crepe":
return CrepePitchExtractor()
elif pitchExtractorType == "crepe_tiny":
return CrepeOnnxPitchExtractor(pitchExtractorType, cls.params.crepe_onnx_tiny, gpu)
elif pitchExtractorType == "crepe_full":
return CrepeOnnxPitchExtractor(pitchExtractorType, cls.params.crepe_onnx_full, gpu)
else:
# return hubert as default
raise RuntimeError(
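
A hedged usage sketch of the widened API; params is assumed to be a VoiceChangerParams whose crepe_onnx_tiny path points at a downloaded model:

from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager

# gpu is forwarded to CrepeOnnxPitchExtractor, which maps it to an ONNX
# execution provider through DeviceManager.
PitchExtractorManager.initialize(params)
extractor = PitchExtractorManager.getPitchExtractor("crepe_tiny", gpu=0)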

View File

@@ -0,0 +1,8 @@
from . import decode # NOQA
from .core import * # NOQA
from . import convert # NOQA
from . import filter # NOQA
from . import load # NOQA
from . import loudness # NOQA
from .session import CrepeInferenceSession # NOQA
from . import threshold # NOQA

View File

@@ -0,0 +1,57 @@
import numpy as np
import scipy
from voice_changer.RVC.pitchExtractor import onnxcrepe
###############################################################################
# Pitch unit conversions
###############################################################################
def bins_to_cents(bins, apply_dither=False):
"""Converts pitch bins to cents"""
cents = onnxcrepe.CENTS_PER_BIN * bins + 1997.3794084376191
# Trade quantization error for noise (disabled by default)
return dither(cents) if apply_dither else cents
def bins_to_frequency(bins, apply_dither=False):
"""Converts pitch bins to frequency in Hz"""
return cents_to_frequency(bins_to_cents(bins, apply_dither=apply_dither))
def cents_to_bins(cents, quantize_fn=np.floor):
"""Converts cents to pitch bins"""
bins = (cents - 1997.3794084376191) / onnxcrepe.CENTS_PER_BIN
return quantize_fn(bins).astype(np.int64)
def cents_to_frequency(cents):
"""Converts cents to frequency in Hz"""
return 10 * 2 ** (cents / 1200)
def frequency_to_bins(frequency, quantize_fn=np.floor):
"""Convert frequency in Hz to pitch bins"""
return cents_to_bins(frequency_to_cents(frequency), quantize_fn)
def frequency_to_cents(frequency):
"""Convert frequency in Hz to cents"""
return 1200 * np.log2(frequency / 10.)
###############################################################################
# Utilities
###############################################################################
def dither(cents):
"""Dither the predicted pitch in cents to remove quantization error"""
noise = scipy.stats.triang.rvs(c=0.5,
loc=-onnxcrepe.CENTS_PER_BIN,
scale=2 * onnxcrepe.CENTS_PER_BIN,
size=cents.shape)
return cents + noise.astype(cents.dtype)
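
A worked check of the conversions above (CENTS_PER_BIN mirrors the constant defined in core.py):

import numpy as np

CENTS_PER_BIN = 20
cents = CENTS_PER_BIN * np.arange(3) + 1997.3794084376191  # bins_to_cents
hz = 10 * 2 ** (cents / 1200)  # cents_to_frequency
print(hz)  # ~[31.70 32.07 32.44]: bin 0 is ~31.7 Hz, each bin adds 20 cents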

View File

@@ -0,0 +1,256 @@
import librosa
import numpy as np
from voice_changer.RVC.pitchExtractor import onnxcrepe
__all__ = ['CENTS_PER_BIN',
'MAX_FMAX',
'PITCH_BINS',
'SAMPLE_RATE',
'WINDOW_SIZE',
'UNVOICED',
'predict',
'preprocess',
'infer',
'postprocess',
'resample']
###############################################################################
# Constants
###############################################################################
CENTS_PER_BIN = 20 # cents
MAX_FMAX = 2006. # hz
PITCH_BINS = 360
SAMPLE_RATE = 16000 # hz
WINDOW_SIZE = 1024 # samples
UNVOICED = np.nan
###############################################################################
# Crepe pitch prediction
###############################################################################
def predict(session,
audio,
sample_rate,
precision=None,
fmin=50.,
fmax=MAX_FMAX,
decoder=onnxcrepe.decode.weighted_viterbi,
return_periodicity=False,
batch_size=None,
pad=True):
"""Performs pitch estimation
Arguments
session (onnxcrepe.CrepeInferenceSession)
An onnxruntime.InferenceSession holding the CREPE model
audio (numpy.ndarray [shape=(n_samples,)])
The audio signal
sample_rate (int)
The sampling rate in Hz
precision (float)
The precision in milliseconds, i.e. the length of each frame
fmin (float)
The minimum allowable frequency in Hz
fmax (float)
The maximum allowable frequency in Hz
decoder (function)
The decoder to use. See decode.py for decoders.
return_periodicity (bool)
Whether to also return the network confidence
batch_size (int)
The number of frames per batch
pad (bool)
Whether to zero-pad the audio
Returns
pitch (numpy.ndarray [shape=(1, 1 + int(time // precision))])
(Optional) periodicity (numpy.ndarray
[shape=(1, 1 + int(time // precision))])
"""
results = []
# Preprocess audio
generator = preprocess(audio,
sample_rate,
precision,
batch_size,
pad)
for frames in generator:
# Infer independent probabilities for each pitch bin
probabilities = infer(session, frames) # shape=(batch, 360)
probabilities = probabilities.transpose(1, 0)[None] # shape=(1, 360, batch)
# Convert probabilities to F0 and periodicity
result = postprocess(probabilities,
fmin,
fmax,
decoder,
return_periodicity)
# Collect the (pitch, periodicity) tuple when periodicity is requested
if isinstance(result, tuple):
result = (result[0], result[1])
results.append(result)
# Split pitch and periodicity
if return_periodicity:
pitch, periodicity = zip(*results)
return np.concatenate(pitch, axis=1), np.concatenate(periodicity, axis=1)
# Concatenate
return np.concatenate(results, axis=1)
def preprocess(audio,
sample_rate,
precision=None,
batch_size=None,
pad=True):
"""Convert audio to model input
Arguments
audio (numpy.ndarray [shape=(time,)])
The audio signals
sample_rate (int)
The sampling rate in Hz
precision (float)
The precision in milliseconds, i.e. the length of each frame
batch_size (int)
The number of frames per batch
pad (bool)
Whether to zero-pad the audio
Returns
frames (numpy.ndarray [shape=(1 + int(time // precision), 1024)])
"""
# Resample
if sample_rate != SAMPLE_RATE:
audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=SAMPLE_RATE)
# Default hop length of 10 ms
hop_length = SAMPLE_RATE / 100 if precision is None else SAMPLE_RATE * precision / 1000
# Get total number of frames
# Maybe pad
if pad:
total_frames = 1 + int(audio.shape[0] / hop_length)
audio = np.pad(
audio,
(WINDOW_SIZE // 2, WINDOW_SIZE // 2))
else:
total_frames = 1 + int((audio.shape[0] - WINDOW_SIZE) / hop_length)
# Default to running all frames in a single batch
batch_size = total_frames if batch_size is None else batch_size
# Generate batches
for i in range(0, total_frames, batch_size):
# Batch indices
start = max(0, int(i * hop_length))
end = min(audio.shape[0],
int((i + batch_size - 1) * hop_length) + WINDOW_SIZE)
# Chunk
n_bytes = audio.strides[-1]
frames = np.lib.stride_tricks.as_strided(
audio[start:end],
shape=((end - start - WINDOW_SIZE) // int(hop_length) + 1, WINDOW_SIZE),
strides=(int(hop_length) * n_bytes, n_bytes)) # shape=(batch, 1024)
# Note:
# Z-score standardization operations originally located here
# (https://github.com/maxrmorrison/torchcrepe/blob/master/torchcrepe/core.py#L692)
# are wrapped into the ONNX models for hardware acceleration.
yield frames
def infer(session, frames):
"""Forward pass through the model
Arguments
session (onnxcrepe.CrepeInferenceSession)
An onnxruntime.InferenceSession holding the CREPE model
frames (numpy.ndarray [shape=(time / precision, 1024)])
The network input
Returns
logits (numpy.ndarray [shape=(1 + int(time // precision), 360)])
"""
# Apply model
return session.run(None, {'frames': frames})[0]
def postprocess(probabilities,
fmin=0.,
fmax=MAX_FMAX,
decoder=onnxcrepe.decode.weighted_viterbi,
return_periodicity=False):
"""Convert model output to F0 and periodicity
Arguments
probabilities (numpy.ndarray [shape=(1, 360, time / precision)])
The probabilities for each pitch bin inferred by the network
fmin (float)
The minimum allowable frequency in Hz
fmax (float)
The maximum allowable frequency in Hz
decoder (function)
The decoder to use. See decode.py for decoders.
return_periodicity (bool)
Whether to also return the network confidence
Returns
pitch (numpy.ndarray [shape=(1, 1 + int(time // precision))])
periodicity (numpy.ndarray [shape=(1, 1 + int(time // precision))])
"""
# Convert frequency range to pitch bin range
minidx = onnxcrepe.convert.frequency_to_bins(fmin)
maxidx = onnxcrepe.convert.frequency_to_bins(fmax, np.ceil)
# Remove frequencies outside allowable range
probabilities[:, :minidx] = float('-inf')
probabilities[:, maxidx:] = float('-inf')
# Perform argmax or viterbi sampling
bins, pitch = decoder(probabilities)
if not return_periodicity:
return pitch
# Compute periodicity from probabilities and decoded pitch bins
return pitch, periodicity(probabilities, bins)
###############################################################################
# Utilities
###############################################################################
def periodicity(probabilities, bins):
"""Computes the periodicity from the network output and pitch bins"""
# shape=(time / precision, 360)
probs_stacked = probabilities.transpose(0, 2, 1).reshape(-1, PITCH_BINS)
# shape=(time / precision, 1)
bins_stacked = bins.reshape(-1, 1).astype(np.int64)
# Use maximum logit over pitch bins as periodicity
periodicity = np.take_along_axis(probs_stacked, bins_stacked, axis=1)
# shape=(batch, time / precision)
return periodicity.reshape(probabilities.shape[0], probabilities.shape[2])
def resample(audio, sample_rate):
"""Resample audio"""
return librosa.resample(audio, orig_sr=sample_rate, target_sr=onnxcrepe.SAMPLE_RATE)
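
A hedged end-to-end sketch of the pipeline above, mirroring how CrepeOnnxPitchExtractor drives it; the model path is a hypothetical placeholder for the file fetched via --crepe_onnx_tiny:

import numpy as np
import onnxruntime
from voice_changer.RVC.pitchExtractor import onnxcrepe

session = onnxruntime.InferenceSession(
    "pretrain/crepe_onnx_tiny.onnx", providers=["CPUExecutionProvider"]
)
audio = np.random.randn(16000).astype(np.float32)  # 1 s at 16 kHz
pitch, periodicity = onnxcrepe.predict(
    session,
    audio,
    16000,
    decoder=onnxcrepe.decode.weighted_argmax,
    return_periodicity=True,
)
print(pitch.shape)  # (1, 101): one frame per 10 ms hop, plus one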

View File

@@ -0,0 +1,80 @@
import librosa
import numpy as np
from voice_changer.RVC.pitchExtractor import onnxcrepe
###############################################################################
# Probability sequence decoding methods
###############################################################################
def argmax(logits):
"""Sample observations by taking the argmax"""
bins = logits.argmax(axis=1)
# Convert to frequency in Hz
return bins, onnxcrepe.convert.bins_to_frequency(bins)
def weighted_argmax(logits: np.ndarray):
"""Sample observations using weighted sum near the argmax"""
# Find center of analysis window
bins = logits.argmax(axis=1)
return bins, _apply_weights(logits, bins)
def viterbi(logits):
"""Sample observations using viterbi decoding"""
# Create viterbi transition matrix
if not hasattr(viterbi, 'transition'):
xx, yy = np.meshgrid(range(360), range(360))
transition = np.maximum(12 - abs(xx - yy), 0)
transition = transition / transition.sum(axis=1, keepdims=True)
viterbi.transition = transition
# Normalize logits (softmax)
logits -= logits.max(axis=1)
exp = np.exp(logits)
probs = exp / np.sum(exp, axis=1)
# Perform viterbi decoding
bins = np.array([
librosa.sequence.viterbi(sequence, viterbi.transition).astype(np.int64)
for sequence in probs])
# Convert to frequency in Hz
return bins, onnxcrepe.convert.bins_to_frequency(bins)
def weighted_viterbi(logits):
"""Sample observations combining viterbi decoding and weighted argmax"""
bins, _ = viterbi(logits)
return bins, _apply_weights(logits, bins)
def _apply_weights(logits, bins):
# Find bounds of analysis window
start = np.maximum(0, bins - 4)
end = np.minimum(logits.shape[1], bins + 5)
# Mask out everything outside of window
for batch in range(logits.shape[0]):
for time in range(logits.shape[2]):
logits[batch, :start[batch, time], time] = float('-inf')
logits[batch, end[batch, time]:, time] = float('-inf')
# Construct weights
if not hasattr(_apply_weights, 'weights'):
weights = onnxcrepe.convert.bins_to_cents(np.arange(360))
_apply_weights.weights = weights[None, :, None]
# Convert to probabilities (ReLU)
probs = np.maximum(0, logits)
# Apply weights
cents = (_apply_weights.weights * probs).sum(axis=1) / probs.sum(axis=1)
# Convert to frequency in Hz
return onnxcrepe.convert.cents_to_frequency(cents)
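
A toy check of weighted_argmax (the layout convention is (batch, 360 bins, time)); a single confident bin decodes to that bin's frequency:

import numpy as np
from voice_changer.RVC.pitchExtractor import onnxcrepe

logits = np.full((1, 360, 1), -1.0, dtype=np.float32)
logits[0, 180, 0] = 5.0  # one confident bin
bins, hz = onnxcrepe.decode.weighted_argmax(logits)
print(bins, hz)  # bin 180 -> ~253.6 Hz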

View File

@@ -0,0 +1,125 @@
import numpy as np
###############################################################################
# Sequence filters
###############################################################################
def mean(signals, win_length=9):
"""Averave filtering for signals containing nan values
Arguments
signals (numpy.ndarray (shape=(batch, time)))
The signals to filter
win_length
The size of the analysis window
Returns
filtered (numpy.ndarray (shape=(batch, time)))
"""
return nanfilter(signals, win_length, nanmean)
def median(signals, win_length):
"""Median filtering for signals containing nan values
Arguments
signals (numpy.ndarray (shape=(batch, time)))
The signals to filter
win_length
The size of the analysis window
Returns
filtered (numpy.ndarray (shape=(batch, time)))
"""
return nanfilter(signals, win_length, nanmedian)
###############################################################################
# Utilities
###############################################################################
def nanfilter(signals, win_length, filter_fn):
"""Filters a sequence, ignoring nan values
Arguments
signals (numpy.ndarray (shape=(batch, time)))
The signals to filter
win_length
The size of the analysis window
filter_fn (function)
The function to use for filtering
Returns
filtered (numpy.ndarray (shape=(batch, time)))
"""
# Output buffer
filtered = np.empty_like(signals)
# Loop over frames
for i in range(signals.shape[1]):
# Get analysis window bounds
start = max(0, i - win_length // 2)
end = min(signals.shape[1], i + win_length // 2 + 1)
# Apply filter to window
filtered[:, i] = filter_fn(signals[:, start:end])
return filtered
def nanmean(signals):
"""Computes the mean, ignoring nans
Arguments
signals (numpy.ndarray [shape=(batch, time)])
The signals to filter
Returns
filtered (numpy.ndarray [shape=(batch, time)])
"""
signals = signals.copy()  # numpy arrays have no clone(); copy to avoid mutating the input
# Find nans
nans = np.isnan(signals)
# Set nans to 0.
signals[nans] = 0.
# Compute average
return signals.sum(axis=1) / (~nans).astype(np.float32).sum(axis=1)
def nanmedian(signals):
"""Computes the median, ignoring nans
Arguments
signals (numpy.ndarray [shape=(batch, time)])
The signals to filter
Returns
filtered (numpy.ndarray [shape=(batch, time)])
"""
# Find nans
nans = np.isnan(signals)
# Compute median for each slice
medians = [nanmedian1d(signal[~nan]) for signal, nan in zip(signals, nans)]
# Stack results
return np.array(medians, dtype=signals.dtype)
def nanmedian1d(signal):
"""Computes the median. If signal is empty, returns torch.nan
Arguments
signal (numpy.ndarray [shape=(time,)])
Returns
median (numpy.ndarray [shape=(1,)])
"""
return np.median(signal) if signal.size else np.nan
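
A quick check of median() above (assumes the repo is on sys.path so the package imports): nan values are dropped inside each window:

import numpy as np
from voice_changer.RVC.pitchExtractor import onnxcrepe

signals = np.array([[1.0, np.nan, 3.0, 4.0, 5.0]])
print(onnxcrepe.filter.median(signals, win_length=3))
# [[1.  2.  3.5 4.  4.5]]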

View File

@@ -0,0 +1,12 @@
import librosa
import numpy as np
def audio(filename):
"""Load audio from disk"""
samples, sr = librosa.load(filename, sr=None)
if len(samples.shape) > 1:
# To mono (librosa returns shape (channels, samples) for multi-channel)
samples = np.mean(samples, axis=0)
return samples, sr

View File

@@ -0,0 +1,73 @@
import warnings
import librosa
import numpy as np
from voice_changer.RVC.pitchExtractor import onnxcrepe
###############################################################################
# Constants
###############################################################################
# Minimum decibel level
MIN_DB = -100.
# Reference decibel level
REF_DB = 20.
###############################################################################
# A-weighted loudness
###############################################################################
def a_weighted(audio, sample_rate, hop_length=None, pad=True):
"""Retrieve the per-frame loudness"""
# Default hop length of 10 ms
hop_length = sample_rate // 100 if hop_length is None else hop_length
# Drop the batch dimension
audio = audio.squeeze(0)
# Resample
if sample_rate != onnxcrepe.SAMPLE_RATE:
audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=onnxcrepe.SAMPLE_RATE)
hop_length = int(hop_length * onnxcrepe.SAMPLE_RATE / sample_rate)
# Cache weights
if not hasattr(a_weighted, 'weights'):
a_weighted.weights = perceptual_weights()
# Take stft
stft = librosa.stft(audio,
n_fft=onnxcrepe.WINDOW_SIZE,
hop_length=hop_length,
win_length=onnxcrepe.WINDOW_SIZE,
center=pad,
pad_mode='constant')
# Compute magnitude on db scale
db = librosa.amplitude_to_db(np.abs(stft))
# Apply A-weighting
weighted = db + a_weighted.weights
# Threshold
weighted[weighted < MIN_DB] = MIN_DB
# Average over weighted frequencies
return weighted.mean(axis=0).astype(np.float32)[None]
def perceptual_weights():
"""A-weighted frequency-dependent perceptual loudness weights"""
frequencies = librosa.fft_frequencies(sr=onnxcrepe.SAMPLE_RATE,
n_fft=onnxcrepe.WINDOW_SIZE)
# A warning is raised for nearly inaudible frequencies, but it ends up
# defaulting to -100 db. That default is fine for our purposes.
with warnings.catch_warnings():
warnings.simplefilter('ignore', RuntimeWarning)
return librosa.A_weighting(frequencies)[:, None] - REF_DB
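
A quick check of a_weighted(): silence stays at the MIN_DB floor, one value per 10 ms hop; the input carries the leading batch dimension that squeeze(0) removes:

import numpy as np
from voice_changer.RVC.pitchExtractor import onnxcrepe

audio = np.zeros((1, 16000), dtype=np.float32)  # 1 s of silence at 16 kHz
loudness = onnxcrepe.loudness.a_weighted(audio, 16000)
print(loudness.shape, loudness.min())  # (1, 101) -100.0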

View File

@@ -0,0 +1 @@
Modules in this folder are from https://github.com/yqzhishen/onnxcrepe at commit ca7e5d7f2dfca5cc4d99e8d546b00793ca4e7157.

View File

@@ -0,0 +1,9 @@
import os
import onnxruntime as ort
class CrepeInferenceSession(ort.InferenceSession):
def __init__(self, model='full', sess_options=None, providers=None, provider_options=None, **kwargs):
model_path = os.path.join(os.path.dirname(__file__), 'assets', f'{model}.onnx')
super().__init__(model_path, sess_options, providers, provider_options, **kwargs)
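
A hedged usage note: the session resolves the model name against an assets directory beside the package, so 'tiny' maps to onnxcrepe/assets/tiny.onnx; that file must already exist (this project instead downloads the ONNX weights to the paths given by --crepe_onnx_full/--crepe_onnx_tiny):

from voice_changer.RVC.pitchExtractor.onnxcrepe import CrepeInferenceSession

# providers follows onnxruntime's convention; CPU is the portable default.
session = CrepeInferenceSession(model="tiny", providers=["CPUExecutionProvider"])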

View File

@@ -0,0 +1,129 @@
import numpy as np
from voice_changer.RVC.pitchExtractor import onnxcrepe
###############################################################################
# Pitch thresholding methods
###############################################################################
class At:
"""Simple thresholding at a specified probability value"""
def __init__(self, value):
self.value = value
def __call__(self, pitch, periodicity):
# Make a copy to prevent in-place modification
pitch = pitch.copy()
# Threshold
pitch[periodicity < self.value] = onnxcrepe.UNVOICED
return pitch
class Hysteresis:
"""Hysteresis thresholding"""
def __init__(self,
lower_bound=.19,
upper_bound=.31,
width=.2,
stds=1.7,
return_threshold=False):
self.lower_bound = lower_bound
self.upper_bound = upper_bound
self.width = width
self.stds = stds
self.return_threshold = return_threshold
def __call__(self, pitch, periodicity):
# Perform hysteresis in log-2 space
pitch = np.log2(pitch).flatten()
# Flatten periodicity
periodicity = periodicity.flatten()
# Ignore confidently unvoiced pitch
pitch[periodicity < self.lower_bound] = onnxcrepe.UNVOICED
# Whiten pitch
mean, std = np.nanmean(pitch), np.nanstd(pitch)
pitch = (pitch - mean) / std
# Require high confidence to make predictions far from the mean
parabola = self.width * pitch ** 2 - self.width * self.stds ** 2
threshold = self.lower_bound + np.clip(parabola, 0, 1 - self.lower_bound)
threshold[np.isnan(threshold)] = self.lower_bound
# Apply hysteresis to prevent short, unconfident voiced regions
i = 0
while i < len(periodicity) - 1:
# Detect unvoiced to voiced transition
if periodicity[i] < threshold[i] and periodicity[i + 1] > threshold[i + 1]:
# Grow region until next unvoiced or end of array
start, end, keep = i + 1, i + 1, False
while end < len(periodicity) and periodicity[end] > threshold[end]:
if periodicity[end] > self.upper_bound:
keep = True
end += 1
# Force unvoiced if we didn't pass the confidence required by
# the hysteresis
if not keep:
threshold[start:end] = 1
i = end
else:
i += 1
# Remove pitch with low periodicity
pitch[periodicity < threshold] = onnxcrepe.UNVOICED
# Unwhiten
pitch = pitch * std + mean
# Convert to Hz
pitch = np.array(2 ** pitch)[None, :]
# Optionally return threshold
if self.return_threshold:
return pitch, np.array(threshold)
return pitch
###############################################################################
# Periodicity thresholding methods
###############################################################################
class Silence:
"""Set periodicity to zero in silent regions"""
def __init__(self, value=-60):
self.value = value
def __call__(self,
periodicity,
audio,
sample_rate=onnxcrepe.SAMPLE_RATE,
precision=None,
pad=True):
# Don't modify in-place
periodicity = periodicity.copy()
# Compute loudness (fall back to the 10 ms default hop when precision is None)
hop_length = sample_rate // 100 if precision is None else sample_rate * precision // 1000
loudness = onnxcrepe.loudness.a_weighted(
audio, sample_rate, hop_length, pad)
# Threshold silence
periodicity[loudness < self.value] = 0.
return periodicity
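
A quick check of At(): frames whose periodicity falls below the cutoff are marked UNVOICED (nan):

import numpy as np
from voice_changer.RVC.pitchExtractor import onnxcrepe

pitch = np.array([[100.0, 200.0, 300.0]])
periodicity = np.array([[0.9, 0.05, 0.8]])
print(onnxcrepe.threshold.At(0.1)(pitch, periodicity))  # [[100.  nan 300.]]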

View File

@@ -12,3 +12,5 @@ class VoiceChangerParams:
hubert_soft: str
nsf_hifigan: str
sample_mode: str
crepe_onnx_full: str
crepe_onnx_tiny: str