diff --git a/server/MMVCServerSIO.py b/server/MMVCServerSIO.py index 9a86f488..fff2a31d 100755 --- a/server/MMVCServerSIO.py +++ b/server/MMVCServerSIO.py @@ -239,12 +239,12 @@ if __name__ == "__main__": if sys.platform.startswith("win"): process = subprocess.Popen([NATIVE_CLIENT_FILE_WIN, "--disable-gpu", "-u", f"http://localhost:{PORT}/"]) return_code = process.wait() - logger.info("client closed.") + logger.info("client closed.") p.terminate() elif sys.platform.startswith("darwin"): process = subprocess.Popen([NATIVE_CLIENT_FILE_MAC, "--disable-gpu", "-u", f"http://localhost:{PORT}/"]) return_code = process.wait() - logger.info("client closed.") + logger.info("client closed.") p.terminate() except Exception as e: diff --git a/server/voice_changer/DiffusionSVC/DiffusionSVC.py b/server/voice_changer/DiffusionSVC/DiffusionSVC.py index 8efe10ab..2b617ac7 100644 --- a/server/voice_changer/DiffusionSVC/DiffusionSVC.py +++ b/server/voice_changer/DiffusionSVC/DiffusionSVC.py @@ -14,7 +14,7 @@ from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager # from voice_changer.RVC.onnxExporter.export2onnx import export2onnx from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager -from Exceptions import DeviceCannotSupportHalfPrecisionException, PipelineCreateException +from Exceptions import DeviceCannotSupportHalfPrecisionException, PipelineCreateException, PipelineNotInitializedException logger = VoiceChangaerLogger.get_instance().getLogger() @@ -28,7 +28,6 @@ class DiffusionSVC(VoiceChangerModel): InferencerManager.initialize(params) self.settings = DiffusionSVCSettings() self.params = params - self.pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu) self.pipeline: Pipeline | None = None @@ -84,6 +83,8 @@ class DiffusionSVC(VoiceChangerModel): if self.pipeline is not None: pipelineInfo = self.pipeline.getPipelineInfo() data["pipelineInfo"] = pipelineInfo + else: + data["pipelineInfo"] = "None" return data def get_processing_sampling_rate(self): @@ -137,6 +138,10 @@ class DiffusionSVC(VoiceChangerModel): return (self.audio_buffer, self.pitchf_buffer, self.feature_buffer, convertSize, vol) def inference(self, receivedData: AudioInOut, crossfade_frame: int, sola_search_frame: int): + if self.pipeline is None: + logger.info("[Voice Changer] Pipeline is not initialized.") + raise PipelineNotInitializedException() + data = self.generate_input(receivedData, crossfade_frame, sola_search_frame) audio: AudioInOut = data[0] pitchf: PitchfInOut = data[1] diff --git a/server/voice_changer/RVC/RVCr2.py b/server/voice_changer/RVC/RVCr2.py new file mode 100644 index 00000000..10e64335 --- /dev/null +++ b/server/voice_changer/RVC/RVCr2.py @@ -0,0 +1,287 @@ +''' +VoiceChangerV2向け +''' +from dataclasses import asdict +import numpy as np +import torch +from data.ModelSlot import RVCModelSlot +from mods.log_control import VoiceChangaerLogger + +from voice_changer.RVC.RVCSettings import RVCSettings +from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager +from voice_changer.utils.VoiceChangerModel import AudioInOut, PitchfInOut, FeatureInOut, VoiceChangerModel +from voice_changer.utils.VoiceChangerParams import VoiceChangerParams +from voice_changer.RVC.onnxExporter.export2onnx import export2onnx +from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager +from voice_changer.RVC.pipeline.PipelineGenerator import createPipeline +from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager 
+from voice_changer.RVC.pipeline.Pipeline import Pipeline + +from Exceptions import DeviceCannotSupportHalfPrecisionException, PipelineCreateException, PipelineNotInitializedException +import resampy +from typing import cast + +logger = VoiceChangaerLogger.get_instance().getLogger() + + +class RVCr2(VoiceChangerModel): + def __init__(self, params: VoiceChangerParams, slotInfo: RVCModelSlot): + logger.info("[Voice Changer] [RVCr2] Creating instance ") + self.deviceManager = DeviceManager.get_instance() + EmbedderManager.initialize(params) + PitchExtractorManager.initialize(params) + self.settings = RVCSettings() + self.params = params + # self.pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu) + + self.pipeline: Pipeline | None = None + + self.audio_buffer: AudioInOut | None = None + self.pitchf_buffer: PitchfInOut | None = None + self.feature_buffer: FeatureInOut | None = None + self.prevVol = 0.0 + self.slotInfo = slotInfo + # self.initialize() + + def initialize(self): + logger.info("[Voice Changer][RVCr2] Initializing... ") + + # pipelineの生成 + try: + self.pipeline = createPipeline(self.slotInfo, self.settings.gpu, self.settings.f0Detector) + except PipelineCreateException as e: # NOQA + logger.error("[Voice Changer] pipeline create failed. check your model is valid.") + return + + # その他の設定 + self.settings.tran = self.slotInfo.defaultTune + self.settings.indexRatio = self.slotInfo.defaultIndexRatio + self.settings.protect = self.slotInfo.defaultProtect + logger.info("[Voice Changer] [RVC] Initializing... done") + + def setSamplingRate(self, inputSampleRate, outputSampleRate): + self.inputSampleRate = inputSampleRate + self.outputSampleRate = outputSampleRate + self.initialize() + + def update_settings(self, key: str, val: int | float | str): + logger.info(f"[Voice Changer][RVC]: update_settings {key}:{val}") + if key in self.settings.intData: + setattr(self.settings, key, int(val)) + if key == "gpu": + self.deviceManager.setForceTensor(False) + self.initialize() + elif key in self.settings.floatData: + setattr(self.settings, key, float(val)) + elif key in self.settings.strData: + setattr(self.settings, key, str(val)) + if key == "f0Detector" and self.pipeline is not None: + pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu) + self.pipeline.setPitchExtractor(pitchExtractor) + else: + return False + return True + + def get_info(self): + data = asdict(self.settings) + if self.pipeline is not None: + pipelineInfo = self.pipeline.getPipelineInfo() + data["pipelineInfo"] = pipelineInfo + else: + data["pipelineInfo"] = "None" + return data + + def get_processing_sampling_rate(self): + return self.slotInfo.samplingRate + + def generate_input( + self, + newData: AudioInOut, + crossfadeSize: int, + solaSearchFrame: int, + extra_frame: int + ): + # 16k で入ってくる。 + inputSize = newData.shape[0] + newData = newData.astype(np.float32) / 32768.0 + newFeatureLength = inputSize // 160 # hopsize:=160 + + if self.audio_buffer is not None: + # 過去のデータに連結 + self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0) + if self.slotInfo.f0: + self.pitchf_buffer = np.concatenate([self.pitchf_buffer, np.zeros(newFeatureLength)], 0) + self.feature_buffer = np.concatenate([self.feature_buffer, np.zeros([newFeatureLength, self.slotInfo.embChannels])], 0) + else: + self.audio_buffer = newData + if self.slotInfo.f0: + self.pitchf_buffer = np.zeros(newFeatureLength) + self.feature_buffer = 
np.zeros([newFeatureLength, self.slotInfo.embChannels]) + + convertSize = inputSize + crossfadeSize + solaSearchFrame + extra_frame + + if convertSize % 160 != 0: # モデルの出力のホップサイズで切り捨てが発生するので補う。 + convertSize = convertSize + (160 - (convertSize % 160)) + outSize = convertSize - extra_frame + + # バッファがたまっていない場合はzeroで補う + if self.audio_buffer.shape[0] < convertSize: + self.audio_buffer = np.concatenate([np.zeros([convertSize]), self.audio_buffer]) + if self.slotInfo.f0: + self.pitchf_buffer = np.concatenate([np.zeros([convertSize // 160]), self.pitchf_buffer]) + self.feature_buffer = np.concatenate([np.zeros([convertSize // 160, self.slotInfo.embChannels]), self.feature_buffer]) + + # 不要部分をトリミング + convertOffset = -1 * convertSize + featureOffset = convertOffset // 160 + self.audio_buffer = self.audio_buffer[convertOffset:] # 変換対象の部分だけ抽出 + if self.slotInfo.f0: + self.pitchf_buffer = self.pitchf_buffer[featureOffset:] + self.feature_buffer = self.feature_buffer[featureOffset:] + + # 出力部分だけ切り出して音量を確認。(TODO:段階的消音にする) + cropOffset = -1 * (inputSize + crossfadeSize) + cropEnd = -1 * (crossfadeSize) + crop = self.audio_buffer[cropOffset:cropEnd] + vol = np.sqrt(np.square(crop).mean()) + vol = max(vol, self.prevVol * 0.0) + self.prevVol = vol + + return (self.audio_buffer, self.pitchf_buffer, self.feature_buffer, convertSize, vol, outSize) + + def inference(self, receivedData: AudioInOut, crossfade_frame: int, sola_search_frame: int): + if self.pipeline is None: + logger.info("[Voice Changer] Pipeline is not initialized.") + raise PipelineNotInitializedException() + + # 処理は16Kで実施(Pitch, embed, (infer)) + receivedData = cast( + AudioInOut, + resampy.resample( + receivedData, + self.inputSampleRate, + 16000, + ), + ) + crossfade_frame = int((crossfade_frame / self.inputSampleRate) * 16000) + sola_search_frame = int((sola_search_frame / self.inputSampleRate) * 16000) + extra_frame = int((self.settings.extraConvertSize / self.inputSampleRate) * 16000) + + # 入力データ生成 + data = self.generate_input(receivedData, crossfade_frame, sola_search_frame, extra_frame) + + audio = data[0] + pitchf = data[1] + feature = data[2] + convertSize = data[3] + vol = data[4] + outSize = data[5] + + if vol < self.settings.silentThreshold: + return np.zeros(convertSize).astype(np.int16) * np.sqrt(vol) + + device = self.pipeline.device + + audio = torch.from_numpy(audio).to(device=device, dtype=torch.float32) + repeat = 1 if self.settings.rvcQuality else 0 + sid = self.settings.dstId + f0_up_key = self.settings.tran + index_rate = self.settings.indexRatio + protect = self.settings.protect + + if_f0 = 1 if self.slotInfo.f0 else 0 + embOutputLayer = self.slotInfo.embOutputLayer + useFinalProj = self.slotInfo.useFinalProj + + try: + audio_out, self.pitchf_buffer, self.feature_buffer = self.pipeline.exec( + sid, + audio, + pitchf, + feature, + f0_up_key, + index_rate, + if_f0, + self.settings.extraConvertSize / self.slotInfo.samplingRate if self.settings.silenceFront else 0., # extaraDataSizeの秒数。RVCのモデルのサンプリングレートで処理(★1)。 + embOutputLayer, + useFinalProj, + repeat, + protect + ) + outSize = outSize // 16000 * self.slotInfo.samplingRate + result = audio_out[-outSize:].detach().cpu().numpy() * np.sqrt(vol) + + result = cast( + AudioInOut, + resampy.resample( + result, + self.slotInfo.samplingRate, + self.outputSampleRate, + ), + ) + + return result + except DeviceCannotSupportHalfPrecisionException as e: # NOQA + logger.warn("[Device Manager] Device cannot support half precision. 
Fallback to float....") + self.deviceManager.setForceTensor(True) + self.initialize() + # raise e + + return + + def __del__(self): + del self.pipeline + + # print("---------- REMOVING ---------------") + + # remove_path = os.path.join("RVC") + # sys.path = [x for x in sys.path if x.endswith(remove_path) is False] + + # for key in list(sys.modules): + # val = sys.modules.get(key) + # try: + # file_path = val.__file__ + # if file_path.find("RVC" + os.path.sep) >= 0: + # # print("remove", key, file_path) + # sys.modules.pop(key) + # except Exception: # type:ignore + # # print(e) + # pass + + def export2onnx(self): + modelSlot = self.slotInfo + + if modelSlot.isONNX: + logger.warn("[Voice Changer] export2onnx, No pyTorch filepath.") + return {"status": "ng", "path": ""} + + if self.pipeline is not None: + del self.pipeline + self.pipeline = None + + torch.cuda.empty_cache() + self.initialize() + + output_file_simple = export2onnx(self.settings.gpu, modelSlot) + + return { + "status": "ok", + "path": f"/tmp/{output_file_simple}", + "filename": output_file_simple, + } + + def get_model_current(self): + return [ + { + "key": "defaultTune", + "val": self.settings.tran, + }, + { + "key": "defaultIndexRatio", + "val": self.settings.indexRatio, + }, + { + "key": "defaultProtect", + "val": self.settings.protect, + }, + ] diff --git a/server/voice_changer/RVC/pipeline/Pipeline.py b/server/voice_changer/RVC/pipeline/Pipeline.py index 72187fdc..fac98260 100644 --- a/server/voice_changer/RVC/pipeline/Pipeline.py +++ b/server/voice_changer/RVC/pipeline/Pipeline.py @@ -18,6 +18,7 @@ from voice_changer.RVC.inferencer.OnnxRVCInferencer import OnnxRVCInferencer from voice_changer.RVC.inferencer.OnnxRVCInferencerNono import OnnxRVCInferencerNono from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor +from voice_changer.utils.Timer import Timer logger = VoiceChangaerLogger.get_instance().getLogger() @@ -89,174 +90,174 @@ class Pipeline(object): protect=0.5, out_size=None, ): - # 16000のサンプリングレートで入ってきている。以降この世界は16000で処理。 + with Timer("main-process") as t: + # 16000のサンプリングレートで入ってきている。以降この世界は16000で処理。 + search_index = self.index is not None and self.big_npy is not None and index_rate != 0 + # self.t_pad = self.sr * repeat # 1秒 + # self.t_pad_tgt = self.targetSR * repeat # 1秒 出力時のトリミング(モデルのサンプリングで出力される) + audio = audio.unsqueeze(0) - search_index = self.index is not None and self.big_npy is not None and index_rate != 0 - # self.t_pad = self.sr * repeat # 1秒 - # self.t_pad_tgt = self.targetSR * repeat # 1秒 出力時のトリミング(モデルのサンプリングで出力される) - audio = audio.unsqueeze(0) + quality_padding_sec = (repeat * (audio.shape[1] - 1)) / self.sr # padding(reflect)のサイズは元のサイズより小さい必要がある。 - quality_padding_sec = (repeat * (audio.shape[1] - 1)) / self.sr # padding(reflect)のサイズは元のサイズより小さい必要がある。 + self.t_pad = round(self.sr * quality_padding_sec) # 前後に音声を追加 + self.t_pad_tgt = round(self.targetSR * quality_padding_sec) # 前後に音声を追加 出力時のトリミング(モデルのサンプリングで出力される) + audio_pad = F.pad(audio, (self.t_pad, self.t_pad), mode="reflect").squeeze(0) + p_len = audio_pad.shape[0] // self.window + sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() - self.t_pad = round(self.sr * quality_padding_sec) # 前後に音声を追加 - self.t_pad_tgt = round(self.targetSR * quality_padding_sec) # 前後に音声を追加 出力時のトリミング(モデルのサンプリングで出力される) - audio_pad = F.pad(audio, (self.t_pad, self.t_pad), mode="reflect").squeeze(0) - p_len = audio_pad.shape[0] // self.window - sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() + # RVC 
QualityがOnのときにはsilence_frontをオフに。 + silence_front = silence_front if repeat == 0 else 0 + pitchf = pitchf if repeat == 0 else np.zeros(p_len) + out_size = out_size if repeat == 0 else None - # RVC QualityがOnのときにはsilence_frontをオフに。 - silence_front = silence_front if repeat == 0 else 0 - pitchf = pitchf if repeat == 0 else np.zeros(p_len) - out_size = out_size if repeat == 0 else None - - # ピッチ検出 - try: - if if_f0 == 1: - pitch, pitchf = self.pitchExtractor.extract( - audio_pad, - pitchf, - f0_up_key, - self.sr, - self.window, - silence_front=silence_front, - ) - # pitch = pitch[:p_len] - # pitchf = pitchf[:p_len] - pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() - pitchf = torch.tensor(pitchf, device=self.device, dtype=torch.float).unsqueeze(0) - else: - pitch = None - pitchf = None - except IndexError as e: # NOQA - # print(e) - # import traceback - # traceback.print_exc() - raise NotEnoughDataExtimateF0() - - # tensor型調整 - feats = audio_pad - if feats.dim() == 2: # double channels - feats = feats.mean(-1) - assert feats.dim() == 1, feats.dim() - feats = feats.view(1, -1) - - # embedding - with autocast(enabled=self.isHalf): + # ピッチ検出 try: - feats = self.embedder.extractFeatures(feats, embOutputLayer, useFinalProj) - if torch.isnan(feats).all(): - raise DeviceCannotSupportHalfPrecisionException() + if if_f0 == 1: + pitch, pitchf = self.pitchExtractor.extract( + audio_pad, + pitchf, + f0_up_key, + self.sr, + self.window, + silence_front=silence_front, + ) + # pitch = pitch[:p_len] + # pitchf = pitchf[:p_len] + pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() + pitchf = torch.tensor(pitchf, device=self.device, dtype=torch.float).unsqueeze(0) + else: + pitch = None + pitchf = None + except IndexError as e: # NOQA + # print(e) + # import traceback + # traceback.print_exc() + raise NotEnoughDataExtimateF0() + + # tensor型調整 + feats = audio_pad + if feats.dim() == 2: # double channels + feats = feats.mean(-1) + assert feats.dim() == 1, feats.dim() + feats = feats.view(1, -1) + + # embedding + with autocast(enabled=self.isHalf): + try: + feats = self.embedder.extractFeatures(feats, embOutputLayer, useFinalProj) + if torch.isnan(feats).all(): + raise DeviceCannotSupportHalfPrecisionException() + except RuntimeError as e: + if "HALF" in e.__str__().upper(): + raise HalfPrecisionChangingException() + elif "same device" in e.__str__(): + raise DeviceChangingException() + else: + raise e + + # Index - feature抽出 + # if self.index is not None and self.feature is not None and index_rate != 0: + if search_index: + npy = feats[0].cpu().numpy() + # apply silent front for indexsearch + npyOffset = math.floor(silence_front * 16000) // 360 + npy = npy[npyOffset:] + + if self.isHalf is True: + npy = npy.astype("float32") + + # TODO: kは調整できるようにする + k = 1 + if k == 1: + _, ix = self.index.search(npy, 1) + npy = self.big_npy[ix.squeeze()] + else: + score, ix = self.index.search(npy, k=8) + weight = np.square(1 / score) + weight /= weight.sum(axis=1, keepdims=True) + npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) + + # recover silient font + npy = np.concatenate([np.zeros([npyOffset, npy.shape[1]], dtype=np.float32), feature[:npyOffset:2].astype("float32"), npy])[-feats.shape[1]:] + feats = torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats + feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) + if protect < 0.5 and search_index: + feats0 = feats.clone() + + # ピッチサイズ調整 + p_len = 
audio_pad.shape[0] // self.window + if feats.shape[1] < p_len: + p_len = feats.shape[1] + if pitch is not None and pitchf is not None: + pitch = pitch[:, :p_len] + pitchf = pitchf[:, :p_len] + + feats_len = feats.shape[1] + if pitch is not None and pitchf is not None: + pitch = pitch[:, -feats_len:] + pitchf = pitchf[:, -feats_len:] + p_len = torch.tensor([feats_len], device=self.device).long() + + # pitchの推定が上手くいかない(pitchf=0)場合、検索前の特徴を混ぜる + # pitchffの作り方の疑問はあるが、本家通りなので、このまま使うことにする。 + # https://github.com/w-okada/voice-changer/pull/276#issuecomment-1571336929 + if protect < 0.5 and search_index: + pitchff = pitchf.clone() + pitchff[pitchf > 0] = 1 + pitchff[pitchf < 1] = protect + pitchff = pitchff.unsqueeze(-1) + feats = feats * pitchff + feats0 * (1 - pitchff) + feats = feats.to(feats0.dtype) + p_len = torch.tensor([p_len], device=self.device).long() + + # apply silent front for inference + if type(self.inferencer) in [OnnxRVCInferencer, OnnxRVCInferencerNono]: + npyOffset = math.floor(silence_front * 16000) // 360 + feats = feats[:, npyOffset * 2 :, :] # NOQA + + feats_len = feats.shape[1] + if pitch is not None and pitchf is not None: + pitch = pitch[:, -feats_len:] + pitchf = pitchf[:, -feats_len:] + p_len = torch.tensor([feats_len], device=self.device).long() + + # 推論実行 + try: + with torch.no_grad(): + with autocast(enabled=self.isHalf): + audio1 = ( + torch.clip( + self.inferencer.infer(feats, p_len, pitch, pitchf, sid, out_size)[0][0, 0].to(dtype=torch.float32), + -1.0, + 1.0, + ) + * 32767.5 + ).data.to(dtype=torch.int16) except RuntimeError as e: if "HALF" in e.__str__().upper(): + print("11", e) raise HalfPrecisionChangingException() - elif "same device" in e.__str__(): - raise DeviceChangingException() else: raise e - # Index - feature抽出 - # if self.index is not None and self.feature is not None and index_rate != 0: - if search_index: - npy = feats[0].cpu().numpy() - # apply silent front for indexsearch - npyOffset = math.floor(silence_front * 16000) // 360 - npy = npy[npyOffset:] - - if self.isHalf is True: - npy = npy.astype("float32") - - # TODO: kは調整できるようにする - k = 1 - if k == 1: - _, ix = self.index.search(npy, 1) - npy = self.big_npy[ix.squeeze()] + feats_buffer = feats.squeeze(0).detach().cpu() + if pitchf is not None: + pitchf_buffer = pitchf.squeeze(0).detach().cpu() else: - score, ix = self.index.search(npy, k=8) - weight = np.square(1 / score) - weight /= weight.sum(axis=1, keepdims=True) - npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) + pitchf_buffer = None - # recover silient font - npy = np.concatenate([np.zeros([npyOffset, npy.shape[1]], dtype=np.float32), feature[:npyOffset:2].astype("float32"), npy])[-feats.shape[1]:] - feats = torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats - feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) - if protect < 0.5 and search_index: - feats0 = feats.clone() + del p_len, pitch, pitchf, feats + # torch.cuda.empty_cache() - # ピッチサイズ調整 - p_len = audio_pad.shape[0] // self.window - if feats.shape[1] < p_len: - p_len = feats.shape[1] - if pitch is not None and pitchf is not None: - pitch = pitch[:, :p_len] - pitchf = pitchf[:, :p_len] + # inferで出力されるサンプリングレートはモデルのサンプリングレートになる。 + # pipelineに(入力されるときはhubertように16k) + if self.t_pad_tgt != 0: + offset = self.t_pad_tgt + end = -1 * self.t_pad_tgt + audio1 = audio1[offset:end] - feats_len = feats.shape[1] - if pitch is not None and pitchf is not None: - pitch = pitch[:, -feats_len:] - pitchf = 
pitchf[:, -feats_len:] - p_len = torch.tensor([feats_len], device=self.device).long() - - # pitchの推定が上手くいかない(pitchf=0)場合、検索前の特徴を混ぜる - # pitchffの作り方の疑問はあるが、本家通りなので、このまま使うことにする。 - # https://github.com/w-okada/voice-changer/pull/276#issuecomment-1571336929 - if protect < 0.5 and search_index: - pitchff = pitchf.clone() - pitchff[pitchf > 0] = 1 - pitchff[pitchf < 1] = protect - pitchff = pitchff.unsqueeze(-1) - feats = feats * pitchff + feats0 * (1 - pitchff) - feats = feats.to(feats0.dtype) - p_len = torch.tensor([p_len], device=self.device).long() - - # apply silent front for inference - if type(self.inferencer) in [OnnxRVCInferencer, OnnxRVCInferencerNono]: - npyOffset = math.floor(silence_front * 16000) // 360 - feats = feats[:, npyOffset * 2 :, :] # NOQA - - feats_len = feats.shape[1] - if pitch is not None and pitchf is not None: - pitch = pitch[:, -feats_len:] - pitchf = pitchf[:, -feats_len:] - p_len = torch.tensor([feats_len], device=self.device).long() - - # 推論実行 - try: - with torch.no_grad(): - with autocast(enabled=self.isHalf): - audio1 = ( - torch.clip( - self.inferencer.infer(feats, p_len, pitch, pitchf, sid, out_size)[0][0, 0].to(dtype=torch.float32), - -1.0, - 1.0, - ) - * 32767.5 - ).data.to(dtype=torch.int16) - except RuntimeError as e: - if "HALF" in e.__str__().upper(): - print("11", e) - raise HalfPrecisionChangingException() - else: - raise e - - feats_buffer = feats.squeeze(0).detach().cpu() - if pitchf is not None: - pitchf_buffer = pitchf.squeeze(0).detach().cpu() - else: - pitchf_buffer = None - - del p_len, pitch, pitchf, feats - # torch.cuda.empty_cache() - - # inferで出力されるサンプリングレートはモデルのサンプリングレートになる。 - # pipelineに(入力されるときはhubertように16k) - if self.t_pad_tgt != 0: - offset = self.t_pad_tgt - end = -1 * self.t_pad_tgt - audio1 = audio1[offset:end] - - del sid - # torch.cuda.empty_cache() + del sid + # torch.cuda.empty_cache() return audio1, pitchf_buffer, feats_buffer def __del__(self): diff --git a/server/voice_changer/VoiceChangerManager.py b/server/voice_changer/VoiceChangerManager.py index ad0f2f8f..5aa25d3c 100644 --- a/server/voice_changer/VoiceChangerManager.py +++ b/server/voice_changer/VoiceChangerManager.py @@ -214,11 +214,18 @@ class VoiceChangerManager(ServerDeviceCallbacks): return elif slotInfo.voiceChangerType == "RVC": logger.info("................RVC") - from voice_changer.RVC.RVC import RVC + # from voice_changer.RVC.RVC import RVC - self.voiceChangerModel = RVC(self.params, slotInfo) - self.voiceChanger = VoiceChanger(self.params) + # self.voiceChangerModel = RVC(self.params, slotInfo) + # self.voiceChanger = VoiceChanger(self.params) + # self.voiceChanger.setModel(self.voiceChangerModel) + + from voice_changer.RVC.RVCr2 import RVCr2 + + self.voiceChangerModel = RVCr2(self.params, slotInfo) + self.voiceChanger = VoiceChangerV2(self.params) self.voiceChanger.setModel(self.voiceChangerModel) + elif slotInfo.voiceChangerType == "MMVCv13": logger.info("................MMVCv13") from voice_changer.MMVCv13.MMVCv13 import MMVCv13 diff --git a/server/voice_changer/VoiceChangerV2.py b/server/voice_changer/VoiceChangerV2.py index 68ea872e..ba151bbe 100644 --- a/server/voice_changer/VoiceChangerV2.py +++ b/server/voice_changer/VoiceChangerV2.py @@ -6,7 +6,7 @@ - 適用VoiceChangerModel ・DiffusionSVC - +・RVC ''' from typing import Any, Union
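Both DiffusionSVC.py and the new RVCr2.py gain the same guard at the top of inference(): if createPipeline() has not run yet (or failed and left self.pipeline as None), the model raises the newly imported PipelineNotInitializedException instead of dereferencing a None pipeline. A minimal, self-contained sketch of that pattern follows; the exception class, logger and ModelSketch names here are local stand-ins, not the repository's classes.

    import logging

    logger = logging.getLogger("voice-changer-sketch")

    class PipelineNotInitializedException(Exception):
        """Stand-in for the exception exported from server/Exceptions.py."""

    class ModelSketch:
        def __init__(self):
            self.pipeline = None          # built lazily by initialize()

        def initialize(self):
            # in the real code this is createPipeline(slotInfo, gpu, f0Detector),
            # which may raise and leave self.pipeline as None
            self.pipeline = object()

        def inference(self, received_data):
            if self.pipeline is None:
                logger.info("[Voice Changer] Pipeline is not initialized.")
                raise PipelineNotInitializedException()
            # ... actual conversion would happen here ...
            return received_data

Raising a typed exception lets the calling layer distinguish "model not ready yet" from genuine conversion failures and, presumably, return silence or a status message instead of a stack trace.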
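RVCr2.generate_input keeps a rolling 16 kHz buffer: it appends each new block, rounds the conversion window up to a multiple of the 160-sample hop, prepends silence while the history is still short, keeps only the last convertSize samples, and measures loudness as the RMS of the newest (input + crossfade) region. A rough numpy rendering of that arithmetic, with invented block and frame sizes rather than the repository's values:

    import numpy as np

    HOP = 160  # feature hop size at 16 kHz

    def roll_buffer(buffer, new_block, crossfade, sola_search, extra):
        # append the freshly received block to the history
        buffer = new_block if buffer is None else np.concatenate([buffer, new_block])

        # window to convert = new audio + crossfade + SOLA search + extra context,
        # rounded up to a whole number of hops
        convert_size = new_block.shape[0] + crossfade + sola_search + extra
        if convert_size % HOP != 0:
            convert_size += HOP - (convert_size % HOP)

        # while the history is short, the patch prepends a whole window of
        # silence and then trims, so the oldest frames are simply zero
        if buffer.shape[0] < convert_size:
            buffer = np.concatenate([np.zeros(convert_size), buffer])
        buffer = buffer[-convert_size:]

        # RMS of the newest audible region (assumes crossfade > 0)
        crop = buffer[-(new_block.shape[0] + crossfade):-crossfade]
        vol = float(np.sqrt(np.square(crop).mean()))
        return buffer, convert_size, vol

    buf = None
    for _ in range(3):
        block = (np.random.randn(4096) * 0.1).astype(np.float32)
        buf, size, vol = roll_buffer(buf, block, crossfade=1024, sola_search=512, extra=8192)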
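Because VoiceChangerV2 hands RVCr2 audio at the device sample rate, inference() first resamples the block to 16 kHz (where pitch extraction and the embedder operate) and rescales the crossfade, SOLA-search and extra sizes into 16 kHz frames; the converted audio then comes back at the model's own sampling rate and is resampled once more to the output device rate. A compressed sketch of that bookkeeping, using resampy as the patch does but with invented sample rates and placeholder signals:

    import numpy as np
    import resampy

    input_sr, model_sr, output_sr = 48000, 40000, 48000   # invented device/model rates

    received = (np.random.randn(8192) * 0.1).astype(np.float32)   # one capture block
    crossfade_in = 4096                                           # frames at input_sr

    # pitch extraction and the embedder run at 16 kHz, so both the audio and the
    # frame counts are rescaled before generate_input()
    audio_16k = resampy.resample(received, input_sr, 16000)
    crossfade_16k = int(crossfade_in / input_sr * 16000)

    # the inferencer emits audio at the model's own sampling rate; the result is
    # resampled a second time to the output device rate before being returned
    model_out = (np.random.randn(16000) * 0.1).astype(np.float32)  # pretend model output
    result = resampy.resample(model_out, model_sr, output_sr)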
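Most of the Pipeline.py hunk is an indentation change: the body of exec() is now wrapped in a with Timer("main-process") block, and the Timer import is added at the top. The Timer implementation itself is not part of this patch, so the following is only an assumption about its interface, a minimal context manager with the same usage shape:

    import time

    class Timer:
        """Hypothetical stand-in for voice_changer.utils.Timer."""
        def __init__(self, title: str):
            self.title = title
            self.secs = 0.0

        def __enter__(self):
            self._start = time.perf_counter()
            return self

        def __exit__(self, exc_type, exc_value, traceback):
            self.secs = time.perf_counter() - self._start
            return False   # never swallow exceptions raised inside the timed block

    with Timer("main-process") as t:
        time.sleep(0.01)   # stands in for pitch extraction, embedding and inference
    print(f"{t.title}: {t.secs * 1000:.1f} ms")

Wrapping the whole conversion in one timer keeps per-chunk latency measurable without sprinkling timestamps through the pitch, embedding and inference stages.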
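Inside the timed block, when an index and its feature matrix (big_npy) are loaded and index_rate is non-zero, Pipeline.exec replaces each embedder frame with its nearest neighbour from the training features and mixes the result back with the original frame by index_rate (a k=8 weighted variant exists in the code but k is fixed to 1). A dependency-free sketch of that blend, with a brute-force numpy nearest-neighbour search standing in for the faiss index the patch actually queries:

    import numpy as np

    def retrieval_blend(feats, big_npy, index_rate):
        """feats: (T, C) query features; big_npy: (N, C) training features."""
        # brute-force nearest neighbour; the real pipeline asks a faiss index instead
        d2 = ((feats[:, None, :] - big_npy[None, :, :]) ** 2).sum(-1)   # (T, N)
        nearest = big_npy[d2.argmin(axis=1)]                            # (T, C)
        # mix retrieved frames back with the original ones
        return index_rate * nearest + (1.0 - index_rate) * feats

    T, N, C = 50, 200, 256
    feats = np.random.randn(T, C).astype(np.float32)
    big_npy = np.random.randn(N, C).astype(np.float32)
    mixed = retrieval_blend(feats, big_npy, index_rate=0.75)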
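The protect parameter guards frames where pitch estimation fails: when protect < 0.5 and index retrieval is active, frames whose pitchf is zero are pulled back towards the pre-retrieval features by the protect weight, while voiced frames keep the blended features unchanged. A small torch sketch of that masking, with invented tensor shapes and a synthetic voiced/unvoiced pitch track:

    import torch

    def apply_protect(feats, feats0, pitchf, protect=0.33):
        """feats: blended features, feats0: features before retrieval, pitchf: (1, T)."""
        pitchff = pitchf.clone()
        pitchff[pitchf > 0] = 1.0        # voiced frames: keep the blended features
        pitchff[pitchf < 1] = protect    # unvoiced frames: lean back towards feats0
        pitchff = pitchff.unsqueeze(-1)  # (1, T, 1) so it broadcasts over channels
        return feats * pitchff + feats0 * (1.0 - pitchff)

    T, C = 50, 512
    feats0 = torch.randn(1, T, C)
    feats = torch.randn(1, T, C)
    pitchf = torch.randint(0, 2, (1, T)).float() * 150.0   # 0 = unvoiced, 150 Hz = voiced
    protected = apply_protect(feats, feats0, pitchf)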