diff --git a/server/voice_changer/DDSP_SVC/DDSP_SVC.py b/server/voice_changer/DDSP_SVC/DDSP_SVC.py
index d36aa092..54d78318 100644
--- a/server/voice_changer/DDSP_SVC/DDSP_SVC.py
+++ b/server/voice_changer/DDSP_SVC/DDSP_SVC.py
@@ -24,7 +24,7 @@
 from slicer import Slicer
 import librosa
 providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]
-
+import resampy
 from scipy.io import wavfile
 
 SAMPLING_RATE = 44100
@@ -234,7 +234,7 @@ class DDSP_SVC:
             return np.zeros(convertSize).astype(np.int16)
 
         with torch.no_grad():
-            spk_id = torch.LongTensor(np.array([[int(2)]]))
+            spk_id = torch.LongTensor(np.array([[int(1)]]))
             seg_output, _, (s_h, s_n) = self.model(c, f0, volume, spk_id=spk_id, spk_mix_dict=None)
             seg_output *= mask
 
@@ -245,7 +245,6 @@ class DDSP_SVC:
                                                self.args.data.block_size,
                                                adaptive_key=float(0))
             result = seg_output.squeeze().cpu().numpy() * 32768.0
-
             return np.array(result).astype(np.int16)
 
     def inference(self, data):
diff --git a/server/voice_changer/VoiceChanger.py b/server/voice_changer/VoiceChanger.py
index 97bb327a..3441c56b 100755
--- a/server/voice_changer/VoiceChanger.py
+++ b/server/voice_changer/VoiceChanger.py
@@ -244,12 +244,10 @@ class VoiceChanger():
                 self.ioRecorder.writeOutput(outputData.tobytes())
 
             # if receivedData.shape[0] != outputData.shape[0]:
+            #     print(f"Padding, in:{receivedData.shape[0]} out:{outputData.shape[0]}")
             #     outputData = pad_array(outputData, receivedData.shape[0])
             #     # print_convert_processing(
             #     #     f"    Padded!, Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")
-            print(
-                f"    Padded!, Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")
-
         postprocess_time = t.secs
 
         print_convert_processing(f"    [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}")
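Note on the new `import resampy` in DDSP_SVC.py: resampy is a high-quality sample-rate converter, and the import is presumably there so the pipeline can move audio between the client's rate and the module's fixed `SAMPLING_RATE = 44100`. A minimal sketch of the usual call pattern follows; the helper name and its arguments are illustrative assumptions, not taken from the diff:

```python
import numpy as np
import resampy

SAMPLING_RATE = 44100  # fixed model rate, as defined in DDSP_SVC.py


def to_model_rate(audio: np.ndarray, input_rate: int) -> np.ndarray:
    """Hypothetical helper: resample a mono float buffer to 44100 Hz.

    resampy.resample(x, sr_orig, sr_new) is the real API; everything
    else here is an assumption for illustration.
    """
    if input_rate == SAMPLING_RATE:
        return audio  # already at the model rate, nothing to do
    return resampy.resample(audio, input_rate, SAMPLING_RATE)
```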
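On the `spk_id` hunk: `spk_id` is a 1x1 LongTensor naming the target speaker in a multi-speaker DDSP-SVC checkpoint, so this change switches the hard-coded conversion target from speaker 2 to speaker 1. A self-contained sketch of the tensor being built (the `target_speaker` variable is illustrative; the diff hard-codes the value):

```python
import numpy as np
import torch

target_speaker = 1  # the id the diff now hard-codes

# Shape [1, 1] int64 tensor, matching the call site in DDSP_SVC.py:
# self.model(c, f0, volume, spk_id=spk_id, spk_mix_dict=None)
spk_id = torch.LongTensor(np.array([[int(target_speaker)]]))
assert spk_id.shape == (1, 1) and spk_id.dtype == torch.int64
```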