From 6a01467ac8c415c60539e880ed6986784e948598 Mon Sep 17 00:00:00 2001 From: wataru Date: Tue, 2 May 2023 22:29:28 +0900 Subject: [PATCH] WIP: Japanese Hubert --- server/const.py | 5 ++ server/voice_changer/RVC/RVC.py | 38 +++++++--- .../RVC/custom_vc_infer_pipeline.py | 76 ++++--------------- .../RVC/embedder/EmbedderManager.py | 16 +++- .../RVC/embedder/FairseqHubert.py | 2 - .../RVC/inferencer/InferencerManager.py | 34 ++++++--- .../RVC/pitchExtractor/DioPitchExtractor.py | 42 ++++++++++ .../pitchExtractor/HarvestPitchExtractor.py | 43 +++++++++++ .../RVC/pitchExtractor/PitchExtractor.py | 9 +++ .../pitchExtractor/PitchExtractorManager.py | 36 +++++++++ 10 files changed, 214 insertions(+), 87 deletions(-) create mode 100644 server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py create mode 100644 server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py create mode 100644 server/voice_changer/RVC/pitchExtractor/PitchExtractor.py create mode 100644 server/voice_changer/RVC/pitchExtractor/PitchExtractorManager.py diff --git a/server/const.py b/server/const.py index 6ba67521..35de72ea 100644 --- a/server/const.py +++ b/server/const.py @@ -82,6 +82,11 @@ class EnumInferenceTypes(Enum): onnxRVCNono = "onnxRVCNono" +class EnumPitchExtractorTypes(Enum): + harvest = "harvest" + dio = "dio" + + class EnumFrameworkTypes(Enum): pyTorch = "pyTorch" onnx = "onnx" diff --git a/server/voice_changer/RVC/RVC.py b/server/voice_changer/RVC/RVC.py index 7f8816c7..c0b2f90c 100644 --- a/server/voice_changer/RVC/RVC.py +++ b/server/voice_changer/RVC/RVC.py @@ -1,6 +1,9 @@ import sys import os +from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor +from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager + # avoiding parse arg error in RVC sys.argv = ["MMVCServerSIO.py"] @@ -55,10 +58,14 @@ class RVC: audio_buffer: AudioInOut | None = None embedder: Embedder | None = None inferencer: Inferencer | None = None + 
pitchExtractor: PitchExtractor | None = None def __init__(self, params: VoiceChangerParams): self.initialLoad = True self.settings = RVCSettings() + self.pitchExtractor = PitchExtractorManager.getPitchExtractor( + self.settings.f0Detector + ) self.feature_file = None self.index_file = None @@ -102,6 +109,15 @@ class RVC: return self.get_info() + def _getDevice(self): + if self.settings.gpu < 0 or (self.gpu_num == 0 and self.mps_enabled is False): + dev = torch.device("cpu") + elif self.mps_enabled: + dev = torch.device("mps") + else: + dev = torch.device("cuda", index=self.settings.gpu) + return dev + def prepareModel(self, slot: int): if slot < 0: return self.get_info() @@ -110,20 +126,14 @@ class RVC: filename = ( modelSlot.onnxModelFile if modelSlot.isONNX else modelSlot.pyTorchModelFile ) - - if self.settings.gpu < 0 or (self.gpu_num == 0 and self.mps_enabled is False): - dev = torch.device("cpu") - elif self.mps_enabled: - dev = torch.device("mps") - else: - dev = torch.device("cuda", index=self.settings.gpu) + dev = self._getDevice() # Inferencerのロード inferencer = InferencerManager.getInferencer( modelSlot.modelType, filename, self.settings.isHalf, - torch.device("cuda:0"), + dev, ) self.next_inferencer = inferencer @@ -156,8 +166,14 @@ class RVC: def switchModel(self): print("[Voice Changer] Switching model..") - # del self.net_g - # del self.onnx_session + if self.settings.gpu < 0 or (self.gpu_num == 0 and self.mps_enabled is False): + dev = torch.device("cpu") + elif self.mps_enabled: + dev = torch.device("mps") + else: + dev = torch.device("cuda", index=self.settings.gpu) + + # embedderはモデルによらず再利用できる可能性が高いので、Switchのタイミングでこちらで取得 try: self.embedder = EmbedderManager.getEmbedder( self.next_embedder, @@ -330,6 +346,7 @@ class RVC: # self.hubert_model, self.embedder, self.onnx_session, + self.pitchExtractor, sid, audio, f0_up_key, @@ -391,6 +408,7 @@ class RVC: audio_out = vc.pipeline( self.embedder, self.inferencer, + self.pitchExtractor, sid, audio, 
f0_up_key, diff --git a/server/voice_changer/RVC/custom_vc_infer_pipeline.py b/server/voice_changer/RVC/custom_vc_infer_pipeline.py index 91570348..88fdc8e6 100644 --- a/server/voice_changer/RVC/custom_vc_infer_pipeline.py +++ b/server/voice_changer/RVC/custom_vc_infer_pipeline.py @@ -3,10 +3,10 @@ import numpy as np # import parselmouth import torch import torch.nn.functional as F -import scipy.signal as signal -import pyworld from voice_changer.RVC.embedder.Embedder import Embedder +from voice_changer.RVC.inferencer.Inferencer import Inferencer +from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor class VC(object): @@ -18,62 +18,11 @@ class VC(object): self.device = device self.is_half = is_half - def get_f0(self, audio, p_len, f0_up_key, f0_method, silence_front=0): - n_frames = int(len(audio) // self.window) + 1 - start_frame = int(silence_front * self.sr / self.window) - real_silence_front = start_frame * self.window / self.sr - - silence_front_offset = int(np.round(real_silence_front * self.sr)) - audio = audio[silence_front_offset:] - - # time_step = self.window / self.sr * 1000 - f0_min = 50 - f0_max = 1100 - f0_mel_min = 1127 * np.log(1 + f0_min / 700) - f0_mel_max = 1127 * np.log(1 + f0_max / 700) - if f0_method == "dio": - _f0, t = pyworld.dio( - audio.astype(np.double), - self.sr, - f0_floor=f0_min, - f0_ceil=f0_max, - channels_in_octave=2, - frame_period=10, - ) - f0 = pyworld.stonemask(audio.astype(np.double), _f0, t, self.sr) - f0 = np.pad( - f0.astype("float"), (start_frame, n_frames - len(f0) - start_frame) - ) - else: - f0, t = pyworld.harvest( - audio.astype(np.double), - fs=self.sr, - f0_ceil=f0_max, - frame_period=10, - ) - f0 = pyworld.stonemask(audio.astype(np.double), f0, t, self.sr) - f0 = signal.medfilt(f0, 3) - - f0 = np.pad( - f0.astype("float"), (start_frame, n_frames - len(f0) - start_frame) - ) - - f0 *= pow(2, f0_up_key / 12) - f0bak = f0.copy() - f0_mel = 1127 * np.log(1 + f0 / 700) - f0_mel[f0_mel > 0] = 
(f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( - f0_mel_max - f0_mel_min - ) + 1 - f0_mel[f0_mel <= 1] = 1 - f0_mel[f0_mel > 255] = 255 - f0_coarse = np.rint(f0_mel).astype(np.int) - - return f0_coarse, f0bak - def pipeline( self, embedder: Embedder, - model, + inferencer: Inferencer, + pitchExtractor: PitchExtractor, sid, audio, f0_up_key, @@ -92,11 +41,11 @@ class VC(object): # ピッチ検出 pitch, pitchf = None, None if if_f0 == 1: - pitch, pitchf = self.get_f0( + pitch, pitchf = pitchExtractor.extract( audio_pad, - p_len, f0_up_key, - f0_method, + self.sr, + self.window, silence_front=silence_front, ) pitch = pitch[:p_len] @@ -156,16 +105,19 @@ class VC(object): with torch.no_grad(): if pitch is not None: audio1 = ( - (model.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768) + ( + inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] + * 32768 + ) .data.cpu() .float() .numpy() .astype(np.int16) ) else: - if hasattr(model, "infer_pitchless"): + if hasattr(inferencer, "infer_pitchless"): audio1 = ( - (model.infer_pitchless(feats, p_len, sid)[0][0, 0] * 32768) + (inferencer.infer_pitchless(feats, p_len, sid)[0][0, 0] * 32768) .data.cpu() .float() .numpy() @@ -173,7 +125,7 @@ class VC(object): ) else: audio1 = ( - (model.infer(feats, p_len, sid)[0][0, 0] * 32768) + (inferencer.infer(feats, p_len, sid)[0][0, 0] * 32768) .data.cpu() .float() .numpy() diff --git a/server/voice_changer/RVC/embedder/EmbedderManager.py b/server/voice_changer/RVC/embedder/EmbedderManager.py index a85b067d..729b84cb 100644 --- a/server/voice_changer/RVC/embedder/EmbedderManager.py +++ b/server/voice_changer/RVC/embedder/EmbedderManager.py @@ -29,12 +29,20 @@ class EmbedderManager: def loadEmbedder( cls, embederType: EnumEmbedderTypes, file: str, isHalf: bool, dev: device ) -> Embedder: - if embederType == EnumEmbedderTypes.hubert: + if ( + embederType == EnumEmbedderTypes.hubert + or embederType == EnumEmbedderTypes.hubert.value + ): return FairseqHubert().loadModel(file, dev, isHalf) - 
elif embederType == EnumEmbedderTypes.hubert_jp: # same as hubert + elif ( + embederType == EnumEmbedderTypes.hubert_jp + or embederType == EnumEmbedderTypes.hubert_jp.value + ): return FairseqHubertJp().loadModel(file, dev, isHalf) - elif embederType == EnumEmbedderTypes.contentvec: # same as hubert + elif ( + embederType == EnumEmbedderTypes.contentvec + or embederType == EnumEmbedderTypes.contentvec.value + ): return FairseqContentvec().loadModel(file, dev, isHalf) else: - # return hubert as default return FairseqHubert().loadModel(file, dev, isHalf) diff --git a/server/voice_changer/RVC/embedder/FairseqHubert.py b/server/voice_changer/RVC/embedder/FairseqHubert.py index ea4fd3ff..2a981502 100644 --- a/server/voice_changer/RVC/embedder/FairseqHubert.py +++ b/server/voice_changer/RVC/embedder/FairseqHubert.py @@ -37,8 +37,6 @@ class FairseqHubert(Embedder): "padding_mask": padding_mask, } - print("feat dev", self.dev) - with torch.no_grad(): logits = self.model.extract_features(**inputs) if embChannels == 256: diff --git a/server/voice_changer/RVC/inferencer/InferencerManager.py b/server/voice_changer/RVC/inferencer/InferencerManager.py index 7f2203cc..12daabe6 100644 --- a/server/voice_changer/RVC/inferencer/InferencerManager.py +++ b/server/voice_changer/RVC/inferencer/InferencerManager.py @@ -1,7 +1,6 @@ from torch import device from const import EnumInferenceTypes -from voice_changer.RVC.embedder.Embedder import Embedder from voice_changer.RVC.inferencer.Inferencer import Inferencer from voice_changer.RVC.inferencer.OnnxRVCInferencer import OnnxRVCInference from voice_changer.RVC.inferencer.OnnxRVCInferencerNono import OnnxRVCInferenceNono @@ -24,19 +23,36 @@ class InferencerManager: @classmethod def loadInferencer( cls, inferencerType: EnumInferenceTypes, file: str, isHalf: bool, dev: device - ) -> Embedder: - if inferencerType == EnumInferenceTypes.pyTorchRVC: + ) -> Inferencer: + if ( + inferencerType == EnumInferenceTypes.pyTorchRVC + or inferencerType == 
EnumInferenceTypes.pyTorchRVC.value + ): return RVCInferencer().loadModel(file, dev, isHalf) - elif inferencerType == EnumInferenceTypes.pyTorchRVCNono: + elif ( + inferencerType == EnumInferenceTypes.pyTorchRVCNono + or inferencerType == EnumInferenceTypes.pyTorchRVCNono.value + ): return RVCInferencerNono().loadModel(file, dev, isHalf) - elif inferencerType == EnumInferenceTypes.pyTorchWebUI: + elif ( + inferencerType == EnumInferenceTypes.pyTorchWebUI + or inferencerType == EnumInferenceTypes.pyTorchWebUI.value + ): return WebUIInferencer().loadModel(file, dev, isHalf) - elif inferencerType == EnumInferenceTypes.pyTorchWebUINono: + elif ( + inferencerType == EnumInferenceTypes.pyTorchWebUINono + or inferencerType == EnumInferenceTypes.pyTorchWebUINono.value + ): return WebUIInferencerNono().loadModel(file, dev, isHalf) - elif inferencerType == EnumInferenceTypes.onnxRVC: + elif ( + inferencerType == EnumInferenceTypes.onnxRVC + or inferencerType == EnumInferenceTypes.onnxRVC.value + ): return OnnxRVCInference().loadModel(file, dev, isHalf) - elif inferencerType == EnumInferenceTypes.onnxRVCNono: + elif ( + inferencerType == EnumInferenceTypes.onnxRVCNono + or inferencerType == EnumInferenceTypes.onnxRVCNono.value + ): return OnnxRVCInferenceNono().loadModel(file, dev, isHalf) else: - # return hubert as default raise RuntimeError("[Voice Changer] Inferencer not found", inferencerType) diff --git a/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py b/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py new file mode 100644 index 00000000..eafc72be --- /dev/null +++ b/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py @@ -0,0 +1,42 @@ +import pyworld +import numpy as np + +from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor + + +class DioPitchExtractor(PitchExtractor): + def extract(self, audio, f0_up_key, sr, window, silence_front=0): + n_frames = int(len(audio) // window) + 1 + start_frame = int(silence_front * 
sr / window) + real_silence_front = start_frame * window / sr + + silence_front_offset = int(np.round(real_silence_front * sr)) + audio = audio[silence_front_offset:] + + f0_min = 50 + f0_max = 1100 + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + + _f0, t = pyworld.dio( + audio.astype(np.double), + sr, + f0_floor=f0_min, + f0_ceil=f0_max, + channels_in_octave=2, + frame_period=10, + ) + f0 = pyworld.stonemask(audio.astype(np.double), _f0, t, sr) + f0 = np.pad(f0.astype("float"), (start_frame, n_frames - len(f0) - start_frame)) + + f0 *= pow(2, f0_up_key / 12) + f0bak = f0.copy() + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( + f0_mel_max - f0_mel_min + ) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + f0_coarse = np.rint(f0_mel).astype(np.int) + + return f0_coarse, f0bak diff --git a/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py b/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py new file mode 100644 index 00000000..4043092f --- /dev/null +++ b/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py @@ -0,0 +1,43 @@ +import pyworld +import numpy as np +import scipy.signal as signal + +from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor + + +class HarvestPitchExtractor(PitchExtractor): + def extract(self, audio, f0_up_key, sr, window, silence_front=0): + n_frames = int(len(audio) // window) + 1 + start_frame = int(silence_front * sr / window) + real_silence_front = start_frame * window / sr + + silence_front_offset = int(np.round(real_silence_front * sr)) + audio = audio[silence_front_offset:] + + f0_min = 50 + f0_max = 1100 + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + + f0, t = pyworld.harvest( + audio.astype(np.double), + fs=sr, + f0_ceil=f0_max, + frame_period=10, + ) + f0 = pyworld.stonemask(audio.astype(np.double), f0, t, sr) + 
f0 = signal.medfilt(f0, 3) + + f0 = np.pad(f0.astype("float"), (start_frame, n_frames - len(f0) - start_frame)) + + f0 *= pow(2, f0_up_key / 12) + f0bak = f0.copy() + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( + f0_mel_max - f0_mel_min + ) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + f0_coarse = np.rint(f0_mel).astype(np.int) + + return f0_coarse, f0bak diff --git a/server/voice_changer/RVC/pitchExtractor/PitchExtractor.py b/server/voice_changer/RVC/pitchExtractor/PitchExtractor.py new file mode 100644 index 00000000..9f06a682 --- /dev/null +++ b/server/voice_changer/RVC/pitchExtractor/PitchExtractor.py @@ -0,0 +1,9 @@ +from typing import Protocol +from const import EnumPitchExtractorTypes + + +class PitchExtractor(Protocol): + pitchExtractorType: EnumPitchExtractorTypes = EnumPitchExtractorTypes.harvest + + def extract(self, audio, f0_up_key, sr, window, silence_front=0): + ... diff --git a/server/voice_changer/RVC/pitchExtractor/PitchExtractorManager.py b/server/voice_changer/RVC/pitchExtractor/PitchExtractorManager.py new file mode 100644 index 00000000..3c4ab1e2 --- /dev/null +++ b/server/voice_changer/RVC/pitchExtractor/PitchExtractorManager.py @@ -0,0 +1,36 @@ +from typing import Protocol +from const import EnumPitchExtractorTypes +from voice_changer.RVC.pitchExtractor.DioPitchExtractor import DioPitchExtractor +from voice_changer.RVC.pitchExtractor.HarvestPitchExtractor import HarvestPitchExtractor +from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor + + +class PitchExtractorManager(Protocol): + currentPitchExtractor: PitchExtractor | None = None + + @classmethod + def getPitchExtractor( + cls, pitchExtractorType: EnumPitchExtractorTypes + ) -> PitchExtractor: + cls.currentPitchExtractor = cls.loadPitchExtractor(pitchExtractorType) + return cls.currentPitchExtractor + + @classmethod + def loadPitchExtractor( + cls, pitchExtractorType: EnumPitchExtractorTypes + 
) -> PitchExtractor: + if ( + pitchExtractorType == EnumPitchExtractorTypes.harvest + or pitchExtractorType == EnumPitchExtractorTypes.harvest.value + ): + return HarvestPitchExtractor() + elif ( + pitchExtractorType == EnumPitchExtractorTypes.dio + or pitchExtractorType == EnumPitchExtractorTypes.dio.value + ): + return DioPitchExtractor() + else: + # unknown pitch extractor type: there is no default fallback, so fail explicitly + raise RuntimeError( + "[Voice Changer] PitchExctractor not found", pitchExtractorType + )