diff --git a/server/voice_changer/DiffusionSVC/DiffusionSVC.py b/server/voice_changer/DiffusionSVC/DiffusionSVC.py
index 7f371eb2..d202c000 100644
--- a/server/voice_changer/DiffusionSVC/DiffusionSVC.py
+++ b/server/voice_changer/DiffusionSVC/DiffusionSVC.py
@@ -104,7 +104,7 @@ class DiffusionSVC(VoiceChangerModel):
             convertSize = convertSize + (128 - (convertSize % 128))

        # バッファがたまっていない場合はzeroで補う
-        generateFeatureLength = int(((convertSize / self.inputSampleRate) * self.slotInfo.samplingRate) / 512) + 1
+        generateFeatureLength = int(((convertSize / self.inputSampleRate) * self.slotInfo.samplingRate) / 512) + 1
        if self.audio_buffer.shape[0] < convertSize:
            self.audio_buffer = np.concatenate([np.zeros([convertSize]), self.audio_buffer])
            self.pitchf_buffer = np.concatenate([np.zeros(generateFeatureLength), self.pitchf_buffer])
@@ -151,7 +151,7 @@ class DiffusionSVC(VoiceChangerModel):
        speedUp = self.settings.speedUp
        embOutputLayer = 12
        useFinalProj = False
-        silenceFrontSec = self.settings.extraConvertSize / self.slotInfo.samplingRate if self.settings.silenceFront else 0.  # extaraConvertSize(既にモデルのサンプリングレートにリサンプリング済み)の秒数。モデルのサンプリングレートで処理(★1)。
+        silenceFrontSec = self.settings.extraConvertSize / self.inputSampleRate if self.settings.silenceFront else 0.  # extaraConvertSize(既にモデルのサンプリングレートにリサンプリング済み)の秒数。モデルのサンプリングレートで処理(★1)。

        try:
            audio_out, self.pitchf_buffer, self.feature_buffer = self.pipeline.exec(
@@ -169,7 +169,6 @@ class DiffusionSVC(VoiceChangerModel):
                protect
            )
            result = audio_out.detach().cpu().numpy()
-
            return result
        except DeviceCannotSupportHalfPrecisionException as e:  # NOQA
            print("[Device Manager] Device cannot support half precision. Fallback to float....")
diff --git a/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py b/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py
index eea196c4..f54f024c 100644
--- a/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py
+++ b/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py
@@ -125,7 +125,6 @@ class DiffusionSVCInferencer(Inferencer):
        with Timer("pre-process") as t:  # NOQA
            start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
            out_wav = self.mel2wav(out_mel, pitch, start_frame=start_frame)
-            out_wav *= mask

        # print("[ ----Timer::3: ]", t.secs, start_frame, out_mel.shape)
diff --git a/server/voice_changer/Local/ServerDevice.py b/server/voice_changer/Local/ServerDevice.py
index 2d486956..c6a8dda1 100644
--- a/server/voice_changer/Local/ServerDevice.py
+++ b/server/voice_changer/Local/ServerDevice.py
@@ -271,7 +271,9 @@ class ServerDevice:

        # Blockサイズを計算
        currentInputChunkNum = self.settings.serverReadChunkSize
-        block_frame = currentInputChunkNum * 128
+        # block_frame = currentInputChunkNum * 128
+        block_frame = int(currentInputChunkNum * 128 * (self.settings.serverInputAudioSampleRate / 48000))
+
        sd.default.blocksize = block_frame

        # main loop
diff --git a/server/voice_changer/VoiceChangerV2.py b/server/voice_changer/VoiceChangerV2.py
index 1f3c7fe2..5ed0a418 100644
--- a/server/voice_changer/VoiceChangerV2.py
+++ b/server/voice_changer/VoiceChangerV2.py
@@ -260,7 +260,7 @@ class VoiceChangerV2(VoiceChangerIF):

            print_convert_processing(f" Output data size of {result.shape[0]}/{processing_sampling_rate}hz {result .shape[0]}/{self.settings.outputSampleRate}hz")

-            if receivedData.shape[0] != result .shape[0]:
+            if receivedData.shape[0] != result.shape[0]:
                outputData = pad_array(result, receivedData.shape[0])
                pass
            else:
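
For reference, a minimal sketch of the sample-rate arithmetic the hunks above rely on, assuming the 128-sample chunk unit, 512-sample feature hop, and 48 kHz reference rate that appear in the code; the helper names below are hypothetical and not part of the patch.

# Sketch only (hypothetical helpers): the conversions used in the hunks above.

def block_frame_for(server_read_chunk_num: int, input_sample_rate: int) -> int:
    # ServerDevice.py: one read chunk is 128 samples at the 48 kHz reference
    # rate, so the sounddevice block size is rescaled to the actual input rate.
    return int(server_read_chunk_num * 128 * (input_sample_rate / 48000))

def feature_length_for(convert_size: int, input_sample_rate: int, model_sample_rate: int) -> int:
    # DiffusionSVC.py: convertSize is counted in input-rate samples, while the
    # pitchf/feature buffers are counted in 512-sample frames at the model rate.
    return int((convert_size / input_sample_rate) * model_sample_rate / 512) + 1

if __name__ == "__main__":
    print(block_frame_for(192, 44100))               # 22579
    print(feature_length_for(24576, 44100, 44100))   # 49 (= 24576 / 512 + 1)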