From 3a556ebc5fa7185db4094ff3d7204f4348fdc014 Mon Sep 17 00:00:00 2001
From: wataru
Date: Fri, 2 Jun 2023 23:48:36 +0900
Subject: [PATCH] optimize starting RVC

---
 server/voice_changer/RVC/RVC.py               | 11 ++--
 server/voice_changer/RVC/pipeline/Pipeline.py | 53 +++-----------------
 server/voice_changer/VoiceChanger.py          |  2 -
 3 files changed, 14 insertions(+), 52 deletions(-)

diff --git a/server/voice_changer/RVC/RVC.py b/server/voice_changer/RVC/RVC.py
index 985682df..5e5f70f4 100644
--- a/server/voice_changer/RVC/RVC.py
+++ b/server/voice_changer/RVC/RVC.py
@@ -284,7 +284,6 @@ class RVC:
             newData.astype(np.float32) / 32768.0
         )  # Audio arrives at the RVC model's sampling rate. (extraDataLength, crossfade, etc. are handled at the same SR) (★1)
 
-        print("newData", newData.shape, crossfadeSize, solaSearchFrame)
         if self.audio_buffer is not None:
             # concatenate with the previous data
             self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)
@@ -295,10 +294,14 @@
             inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
         )
-        print("convertSize1", convertSize)
         if convertSize % 128 != 0:  # compensate for truncation caused by the model's output hop size.
             convertSize = convertSize + (128 - (convertSize % 128))
-        print("convertSize2", convertSize)
+
+        # if the buffer has not filled up yet, pad it with zeros
+        if self.audio_buffer.shape[0] < convertSize:
+            self.audio_buffer = np.concatenate(
+                [np.zeros([convertSize]), self.audio_buffer]
+            )
 
         convertOffset = -1 * convertSize
         self.audio_buffer = self.audio_buffer[convertOffset:]  # extract only the part to be converted
 
@@ -319,7 +322,6 @@
         vol = torch.sqrt(torch.square(crop).mean()).detach().cpu().numpy()
         vol = max(vol, self.prevVol * 0.0)
         self.prevVol = vol
-        print("inf0 : ", audio_buffer.shape, convertSize)
 
         return (audio_buffer, convertSize, vol)
 
@@ -347,7 +349,6 @@
         if vol < self.settings.silentThreshold:
             return np.zeros(convertSize).astype(np.int16)
 
-        print("inf1 : ", audio.shape)
         audio = torchaudio.functional.resample(
             audio, self.settings.modelSamplingRate, 16000, rolloff=0.99
         )
diff --git a/server/voice_changer/RVC/pipeline/Pipeline.py b/server/voice_changer/RVC/pipeline/Pipeline.py
index 34a9aa10..f7948cf0 100644
--- a/server/voice_changer/RVC/pipeline/Pipeline.py
+++ b/server/voice_changer/RVC/pipeline/Pipeline.py
@@ -14,10 +14,6 @@
 from voice_changer.RVC.inferencer.Inferencer import Inferencer
 from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
 
-# if isHalf changes, rebuild the Pipeline
-# device changes (GPU, when isHalf stays the same) and pitchExtractor changes are handled by swapping them in
-
-
 class Pipeline(object):
     embedder: Embedder
     inferencer: Inferencer
@@ -85,12 +81,13 @@
         repeat,
         protect=0.5,
     ):
+        # Audio arrives at a 16000 Hz sampling rate; everything from here on is processed at 16000 Hz.
+
         search_index = (
             self.index is not None and self.big_npy is not None and index_rate != 0
         )
         self.t_pad = self.sr * repeat
         self.t_pad_tgt = self.targetSR * repeat
-        print("Audio Feature1", audio.shape)  # Audio arrives at a 16000 Hz sampling rate; everything from here on is processed at 16000 Hz.
         audio_pad = F.pad(
             audio.unsqueeze(0), (self.t_pad, self.t_pad), mode="reflect"
         ).squeeze(0)
@@ -130,21 +127,9 @@
         feats = feats.view(1, -1)
 
         # embedding
-        print("audio feature", feats.shape)
         padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
         try:
-            # testFeat = feats.clone()
-            # while True:
-            #     print("embedding audio;", testFeat.shape)
-            #     testFeatOut = self.embedder.extractFeatures(
-            #         testFeat, embOutputLayer, useFinalProj
-            #     )
-            #     testFeat = testFeat[:, 1:]
-            #     print("embedding vector;", testFeatOut.shape)
-
-            print("embedding audio;", feats.shape)
             feats = self.embedder.extractFeatures(feats, embOutputLayer, useFinalProj)
-            print("embedding vector;", feats.shape)
         except RuntimeError as e:
             if "HALF" in e.__str__().upper():
                 raise HalfPrecisionChangingException()
@@ -159,30 +144,17 @@
         # if self.index is not None and self.feature is not None and index_rate != 0:
         if search_index:
             npy = feats[0].cpu().numpy()
-            print("npy shape", npy.shape, npy.shape[0] * 16000)
+            # apply silent front for index search
            npyOffset = math.floor(silence_front * 16000) // 360
-            print(
-                "npyOffset",
-                silence_front,
-                self.targetSR,
-                (silence_front * self.targetSR),
-                npyOffset,
-            )
             npy = npy[npyOffset:]
-            print(
-                "npy trimmed shape",
-                npy.shape,
-            )
+
             if self.isHalf is True:
                 npy = npy.astype("float32")
 
-            # D, I = self.index.search(npy, 1)
-            # npy = self.feature[I.squeeze()]
             # TODO: make k adjustable
             k = 1
             if k == 1:
                 _, ix = self.index.search(npy, 1)
-                print("ix shape", ix.shape)
                 npy = self.big_npy[ix.squeeze()]
             else:
                 score, ix = self.index.search(npy, k=8)
@@ -193,11 +165,9 @@
             if self.isHalf is True:
                 npy = npy.astype("float16")
+            # recover silent front
             npy = np.concatenate([np.zeros([npyOffset, npy.shape[1]]), npy])
-            print(
-                "npy last shape",
-                npy.shape,
-            )
+
             feats = (
                 torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
                 + (1 - index_rate) * feats
             )
@@ -207,6 +177,7 @@
             feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
                 0, 2, 1
             )
+
         # adjust pitch size
         p_len = audio_pad.shape[0] // self.window
         if feats.shape[1] < p_len:
@@ -227,22 +198,14 @@
             feats = feats.to(feats0.dtype)
         p_len = torch.tensor([p_len], device=self.device).long()
 
+        # apply silent front for inference
         npyOffset = math.floor(silence_front * 16000) // 360
-        print(
-            "npy last shape2",
-            feats.shape,
-        )
         feats = feats[:, npyOffset * 2 :, :]
         feats_len = feats.shape[1]
         pitch = pitch[:, -feats_len:]
         pitchf = pitchf[:, -feats_len:]
         p_len = torch.tensor([feats_len], device=self.device).long()
-        print(
-            "npy last shape3",
-            feats.shape,
-            feats_len,
-        )
 
         # run inference
         try:
             with torch.no_grad():
diff --git a/server/voice_changer/VoiceChanger.py b/server/voice_changer/VoiceChanger.py
index 50951fbb..1753268f 100755
--- a/server/voice_changer/VoiceChanger.py
+++ b/server/voice_changer/VoiceChanger.py
@@ -435,7 +435,6 @@ class VoiceChanger:
             raise RuntimeError("Voice Changer is not selected.")
 
         processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()
-        print("original frame", receivedData.shape[0])
         # pre-processing
         with Timer("pre-process") as t:
             if self.settings.inputSampleRate != processing_sampling_rate:
@@ -453,7 +452,6 @@
 
             sola_search_frame = int(0.012 * processing_sampling_rate)
             # sola_search_frame = 0
             block_frame = newData.shape[0]
-            print("block frame", newData.shape[0])
             crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame)
             self._generate_strength(crossfade_frame)