diff --git a/server/voice_changer/DiffusionSVC/DiffusionSVC.py b/server/voice_changer/DiffusionSVC/DiffusionSVC.py index df74a556..6dfbbe74 100644 --- a/server/voice_changer/DiffusionSVC/DiffusionSVC.py +++ b/server/voice_changer/DiffusionSVC/DiffusionSVC.py @@ -1,9 +1,5 @@ -# import sys -# import os from dataclasses import asdict import numpy as np -import torch -import torchaudio from data.ModelSlot import DiffusionSVCModelSlot from voice_changer.DiffusionSVC.DiffusionSVCSettings import DiffusionSVCSettings from voice_changer.DiffusionSVC.pipeline.Pipeline import Pipeline @@ -13,7 +9,7 @@ from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractorManager import Pitc from voice_changer.utils.VoiceChangerModel import AudioInOut, PitchfInOut, FeatureInOut, VoiceChangerModel from voice_changer.utils.VoiceChangerParams import VoiceChangerParams from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager -from voice_changer.RVC.onnxExporter.export2onnx import export2onnx +# from voice_changer.RVC.onnxExporter.export2onnx import export2onnx from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager from Exceptions import DeviceCannotSupportHalfPrecisionException @@ -36,13 +32,12 @@ class DiffusionSVC(VoiceChangerModel): self.feature_buffer: FeatureInOut | None = None self.prevVol = 0.0 self.slotInfo = slotInfo - self.initialize() - + def initialize(self): print("[Voice Changer] [DiffusionSVC] Initializing... ") # pipelineの生成 - self.pipeline = createPipeline(self.slotInfo, self.settings.gpu, self.settings.f0Detector) + self.pipeline = createPipeline(self.slotInfo, self.settings.gpu, self.settings.f0Detector, self.inputSampleRate, self.outputSampleRate) # その他の設定 self.settings.tran = self.slotInfo.defaultTune @@ -51,6 +46,11 @@ class DiffusionSVC(VoiceChangerModel): print("[Voice Changer] [DiffusionSVC] Initializing... 
done") + def setSamplingRate(self, inputSampleRate, outputSampleRate): + self.inputSampleRate = inputSampleRate + self.outputSampleRate = outputSampleRate + self.initialize() + def update_settings(self, key: str, val: int | float | str): print("[Voice Changer][DiffusionSVC]: update_settings", key, val) if key in self.settings.intData: @@ -82,7 +82,6 @@ class DiffusionSVC(VoiceChangerModel): def generate_input( self, newData: AudioInOut, - inputSize: int, crossfadeSize: int, solaSearchFrame: int = 0, ): @@ -99,11 +98,10 @@ class DiffusionSVC(VoiceChangerModel): self.pitchf_buffer = np.zeros(new_feature_length) self.feature_buffer = np.zeros([new_feature_length, self.slotInfo.embChannels]) - convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize + convertSize = newData.shape[0] + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize if convertSize % 128 != 0: # モデルの出力のホップサイズで切り捨てが発生するので補う。 convertSize = convertSize + (128 - (convertSize % 128)) - outSize = convertSize - self.settings.extraConvertSize # バッファがたまっていない場合はzeroで補う if self.audio_buffer.shape[0] < convertSize: @@ -118,37 +116,39 @@ class DiffusionSVC(VoiceChangerModel): self.feature_buffer = self.feature_buffer[featureOffset:] # 出力部分だけ切り出して音量を確認。(TODO:段階的消音にする) - cropOffset = -1 * (inputSize + crossfadeSize) + cropOffset = -1 * (newData.shape[0] + crossfadeSize) cropEnd = -1 * (crossfadeSize) crop = self.audio_buffer[cropOffset:cropEnd] vol = np.sqrt(np.square(crop).mean()) - vol = max(vol, self.prevVol * 0.0) + vol = float(max(vol, self.prevVol * 0.0)) self.prevVol = vol - return (self.audio_buffer, self.pitchf_buffer, self.feature_buffer, convertSize, vol, outSize) + return (self.audio_buffer, self.pitchf_buffer, self.feature_buffer, convertSize, vol) - def inference(self, data): - audio = data[0] - pitchf = data[1] - feature = data[2] - convertSize = data[3] - vol = data[4] + def inference(self, receivedData: AudioInOut, crossfade_frame: int, sola_search_frame: int): + data = self.generate_input(receivedData, crossfade_frame, sola_search_frame) + audio: AudioInOut = data[0] + pitchf: PitchfInOut = data[1] + feature: FeatureInOut = data[2] + convertSize: int = data[3] + vol: float = data[4] if vol < self.settings.silentThreshold: return np.zeros(convertSize).astype(np.int16) * np.sqrt(vol) - if self.pipeline is not None: - device = self.pipeline.device - else: - device = torch.device("cpu") # TODO:pipelineが存在しない場合はzeroを返してもいいかも(要確認)。 - audio = torch.from_numpy(audio).to(device=device, dtype=torch.float32) - audio = torchaudio.functional.resample(audio, self.slotInfo.samplingRate, 16000, rolloff=0.99) + if self.pipeline is None: + return np.zeros(convertSize).astype(np.int16) * np.sqrt(vol) + + # device = self.pipeline.device + # audio = torch.from_numpy(audio).to(device=device, dtype=torch.float32) + # audio = self.resampler16K(audio) sid = self.settings.dstId f0_up_key = self.settings.tran protect = 0 embOutputLayer = 12 useFinalProj = False + silenceFrontSec = self.settings.extraConvertSize / self.slotInfo.samplingRate if self.settings.silenceFront else 0. 
# extaraConvertSize(既にモデルのサンプリングレートにリサンプリング済み)の秒数。モデルのサンプリングレートで処理(★1)。 try: audio_out, self.pitchf_buffer, self.feature_buffer = self.pipeline.exec( @@ -157,12 +157,11 @@ class DiffusionSVC(VoiceChangerModel): pitchf, feature, f0_up_key, - self.settings.extraConvertSize / self.slotInfo.samplingRate if self.settings.silenceFront else 0., # extaraConvertSize(既にモデルのサンプリングレートにリサンプリング済み)の秒数。モデルのサンプリングレートで処理(★1)。 + silenceFrontSec, embOutputLayer, useFinalProj, protect ) - # result = audio_out.detach().cpu().numpy() * np.sqrt(vol) result = audio_out.detach().cpu().numpy() return result @@ -173,36 +172,36 @@ class DiffusionSVC(VoiceChangerModel): # raise e return - + def __del__(self): del self.pipeline - def export2onnx(self): - modelSlot = self.slotInfo + # def export2onnx(self): + # modelSlot = self.slotInfo - if modelSlot.isONNX: - print("[Voice Changer] export2onnx, No pyTorch filepath.") - return {"status": "ng", "path": ""} + # if modelSlot.isONNX: + # print("[Voice Changer] export2onnx, No pyTorch filepath.") + # return {"status": "ng", "path": ""} - output_file_simple = export2onnx(self.settings.gpu, modelSlot) - return { - "status": "ok", - "path": f"/tmp/{output_file_simple}", - "filename": output_file_simple, - } + # output_file_simple = export2onnx(self.settings.gpu, modelSlot) + # return { + # "status": "ok", + # "path": f"/tmp/{output_file_simple}", + # "filename": output_file_simple, + # } - def get_model_current(self): - return [ - { - "key": "defaultTune", - "val": self.settings.tran, - }, - { - "key": "defaultIndexRatio", - "val": self.settings.indexRatio, - }, - { - "key": "defaultProtect", - "val": self.settings.protect, - }, - ] + # def get_model_current(self): + # return [ + # { + # "key": "defaultTune", + # "val": self.settings.tran, + # }, + # { + # "key": "defaultIndexRatio", + # "val": self.settings.indexRatio, + # }, + # { + # "key": "defaultProtect", + # "val": self.settings.protect, + # }, + # ] diff --git a/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py b/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py index c126f575..b67e5746 100644 --- a/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py +++ b/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py @@ -15,6 +15,9 @@ from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtrac from voice_changer.RVC.embedder.Embedder import Embedder from voice_changer.common.VolumeExtractor import VolumeExtractor +from torchaudio.transforms import Resample + +from voice_changer.utils.Timer import Timer class Pipeline(object): @@ -39,16 +42,22 @@ class Pipeline(object): targetSR, device, isHalf, + resamplerIn: Resample, + resamplerOut: Resample ): self.inferencer = inferencer inferencer_block_size, inferencer_sampling_rate = inferencer.getConfig() - self.hop_size = inferencer_block_size * 16000 / inferencer_sampling_rate # 16000はオーディオのサンプルレート。この時点で16Kになっている。 + self.hop_size = inferencer_block_size * 16000 / inferencer_sampling_rate # 16000はオーディオのサンプルレート。16Kで処理 self.inferencer_block_size = inferencer_block_size self.inferencer_sampling_rate = inferencer_sampling_rate self.volumeExtractor = VolumeExtractor(self.hop_size) self.embedder = embedder self.pitchExtractor = pitchExtractor + + self.resamplerIn = resamplerIn + self.resamplerOut = resamplerOut + # self.f0ex = self.load_f0_extractor(f0_model="harvest", f0_min=50, f0_max=1100) print("VOLUME EXTRACTOR", self.volumeExtractor) @@ -83,10 +92,28 @@ class Pipeline(object): self.pitchExtractor = pitchExtractor @torch.no_grad() - def 
extract_volume_and_mask(self, audio, threhold): - volume = self.volumeExtractor.extract(audio) - mask = self.volumeExtractor.get_mask_from_volume(volume, self.inferencer_block_size, threhold=threhold, device=self.device) - volume = torch.from_numpy(volume).float().to(self.device).unsqueeze(-1).unsqueeze(0) + def extract_volume_and_mask(self, audio: torch.Tensor, threshold: float): + ''' + with Timer("[VolumeExt np]") as t: + for i in range(100): + volume = self.volumeExtractor.extract(audio) + time_np = t.secs + with Timer("[VolumeExt pt]") as t: + for i in range(100): + volume_t = self.volumeExtractor.extract_t(audio) + time_pt = t.secs + + print("[Volume np]:", volume) + print("[Volume pt]:", volume_t) + print("[Perform]:", time_np, time_pt) + # -> [Perform]: 0.030178070068359375 0.005780220031738281 (RTX4090) + # -> [Perform]: 0.029046058654785156 0.0025115013122558594 (CPU i9 13900KF) + # ---> これくらいの処理ならCPU上のTorchでやった方が早い? + ''' + # volume_t = self.volumeExtractor.extract_t(audio) + volume_t = self.volumeExtractor.extract_t(audio) + mask = self.volumeExtractor.get_mask_from_volume_t(volume_t, self.inferencer_block_size, threshold=threshold) + volume = volume_t.unsqueeze(-1).unsqueeze(0) return volume, mask def exec( @@ -101,24 +128,20 @@ class Pipeline(object): useFinalProj, protect=0.5 ): - # 16000のサンプリングレートで入ってきている。以降この世界は16000で処理。 - audio = audio.unsqueeze(0) - self.t_pad = 0 - audio_pad = F.pad(audio, (self.t_pad, self.t_pad), mode="reflect").squeeze(0) + audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device) + audio16k = self.resamplerIn(audio_t) + volume, mask = self.extract_volume_and_mask(audio16k, threshold=-60.0) sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() - - n_frames = int(audio_pad.size(-1) // self.hop_size + 1) - volume, mask = self.extract_volume_and_mask(audio, threhold=-60.0) + n_frames = int(audio16k.size(-1) // self.hop_size + 1) # ピッチ検出 try: # print("[SRC AUDIO----]", audio_pad) pitch, pitchf = self.pitchExtractor.extract( - audio_pad, + audio16k.squeeze(), pitchf, f0_up_key, 16000, # 音声のサンプリングレート(既に16000) - # int(self.hop_size), # 処理のwindowサイズ (44100における512) int(self.hop_size), # 処理のwindowサイズ (44100における512) silence_front=silence_front, ) @@ -128,15 +151,19 @@ class Pipeline(object): except IndexError as e: # NOQA # print(e) raise NotEnoughDataExtimateF0() + print("[EMBEDDER EXTRACT:audio:4:]", audio_t.shape) # f0 = self.f0ex.extract_f0(audio_pad, key=4, sr=44100) # print("[Pitch_f0]", f0) # tensor型調整 - feats = audio_pad + feats = audio16k.squeeze() if feats.dim() == 2: # double channels feats = feats.mean(-1) feats = feats.view(1, -1) + print("[EMBEDDER EXTRACT:audio:5:]", audio_t.shape) + + print("[EMBEDDER EXTRACT:::]", feats.shape) # embedding with autocast(enabled=self.isHalf): @@ -190,6 +217,7 @@ class Pipeline(object): try: with torch.no_grad(): with autocast(enabled=self.isHalf): + print("[EMBEDDER EXTRACT:::]", feats.shape, pitchf.unsqueeze(-1).shape, volume.shape, mask.shape) audio1 = ( torch.clip( self.inferencer.infer( @@ -222,5 +250,5 @@ class Pipeline(object): del pitch, pitchf, feats, sid torch.cuda.empty_cache() - + audio1 = self.resamplerOut(audio1.float()) return audio1, pitchf_buffer, feats_buffer diff --git a/server/voice_changer/DiffusionSVC/pipeline/PipelineGenerator.py b/server/voice_changer/DiffusionSVC/pipeline/PipelineGenerator.py index 59712004..b4ed7c3a 100644 --- a/server/voice_changer/DiffusionSVC/pipeline/PipelineGenerator.py +++ b/server/voice_changer/DiffusionSVC/pipeline/PipelineGenerator.py @@ 
-7,8 +7,11 @@ from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractorManager import Pitc from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager +import torch +from torchaudio.transforms import Resample -def createPipeline(modelSlot: DiffusionSVCModelSlot, gpu: int, f0Detector: str): + +def createPipeline(modelSlot: DiffusionSVCModelSlot, gpu: int, f0Detector: str, inputSampleRate: int, outputSampleRate: int): dev = DeviceManager.get_instance().getDevice(gpu) # half = DeviceManager.get_instance().halfPrecisionAvailable(gpu) half = False @@ -35,6 +38,9 @@ def createPipeline(modelSlot: DiffusionSVCModelSlot, gpu: int, f0Detector: str): # pitchExtractor pitchExtractor = PitchExtractorManager.getPitchExtractor(f0Detector, gpu) + resamplerIn = Resample(inputSampleRate, 16000, dtype=torch.int16).to(dev) + resamplerOut = Resample(modelSlot.samplingRate, outputSampleRate, dtype=torch.int16).to(dev) + pipeline = Pipeline( embedder, inferencer, @@ -42,6 +48,8 @@ def createPipeline(modelSlot: DiffusionSVCModelSlot, gpu: int, f0Detector: str): modelSlot.samplingRate, dev, half, + resamplerIn, + resamplerOut ) return pipeline diff --git a/server/voice_changer/VoiceChanger.py b/server/voice_changer/VoiceChanger.py index 3b0bc7b0..3915338a 100755 --- a/server/voice_changer/VoiceChanger.py +++ b/server/voice_changer/VoiceChanger.py @@ -45,6 +45,7 @@ class VoiceChangerSettings: intData: list[str] = field( default_factory=lambda: [ "inputSampleRate", + "outputSampleRate", "crossFadeOverlapSize", "recordIO", ] diff --git a/server/voice_changer/VoiceChangerManager.py b/server/voice_changer/VoiceChangerManager.py index 75c5d4ec..7ce2eebd 100644 --- a/server/voice_changer/VoiceChangerManager.py +++ b/server/voice_changer/VoiceChangerManager.py @@ -10,6 +10,7 @@ from voice_changer.ModelSlotManager import ModelSlotManager from voice_changer.RVC.RVCModelMerger import RVCModelMerger from voice_changer.VoiceChanger import VoiceChanger from const import STORED_SETTING_FILE, UPLOAD_DIR +from voice_changer.VoiceChangerV2 import VoiceChangerV2 from voice_changer.utils.LoadModelParams import LoadModelParamFile, LoadModelParams from voice_changer.utils.ModelMerger import MergeElement, ModelMergerRequest from voice_changer.utils.VoiceChangerModel import AudioInOut @@ -242,7 +243,7 @@ class VoiceChangerManager(ServerDeviceCallbacks): from voice_changer.DiffusionSVC.DiffusionSVC import DiffusionSVC self.voiceChangerModel = DiffusionSVC(self.params, slotInfo) - self.voiceChanger = VoiceChanger(self.params) + self.voiceChanger = VoiceChangerV2(self.params) self.voiceChanger.setModel(self.voiceChangerModel) else: print(f"[Voice Changer] unknown voice changer model: {slotInfo.voiceChangerType}") diff --git a/server/voice_changer/VoiceChangerV2.py b/server/voice_changer/VoiceChangerV2.py new file mode 100644 index 00000000..b5e5be4b --- /dev/null +++ b/server/voice_changer/VoiceChangerV2.py @@ -0,0 +1,331 @@ +''' +■ VoiceChangerV2 +- VoiceChangerとの差分 +・リサンプル処理の無駄を省くため、VoiceChangerModelにリサンプル処理を移譲 +・前処理、メイン処理の分割を廃止(VoiceChangeModelでの無駄な型変換などを回避するため) + +- 適用VoiceChangerModel +・DiffusionSVC + +''' + +from typing import Any, Union + +from const import TMP_DIR +import torch +import os +import traceback +import numpy as np +from dataclasses import dataclass, asdict, field +import onnxruntime + +from voice_changer.IORecorder import IORecorder + +from voice_changer.utils.Timer import Timer +from voice_changer.utils.VoiceChangerModel import 
AudioInOut, VoiceChangerModel +from Exceptions import ( + DeviceCannotSupportHalfPrecisionException, + DeviceChangingException, + HalfPrecisionChangingException, + NoModeLoadedException, + NotEnoughDataExtimateF0, + ONNXInputArgumentException, + VoiceChangerIsNotSelectedException, +) +from voice_changer.utils.VoiceChangerParams import VoiceChangerParams + +STREAM_INPUT_FILE = os.path.join(TMP_DIR, "in.wav") +STREAM_OUTPUT_FILE = os.path.join(TMP_DIR, "out.wav") + + +@dataclass +class VoiceChangerV2Settings: + inputSampleRate: int = 48000 # 48000 or 24000 + outputSampleRate: int = 48000 # 48000 or 24000 + + crossFadeOffsetRate: float = 0.1 + crossFadeEndRate: float = 0.9 + crossFadeOverlapSize: int = 4096 + + recordIO: int = 0 # 0:off, 1:on + + performance: list[int] = field(default_factory=lambda: [0, 0, 0, 0]) + + # ↓mutableな物だけ列挙 + intData: list[str] = field( + default_factory=lambda: [ + "inputSampleRate", + "outputSampleRate", + "crossFadeOverlapSize", + "recordIO", + ] + ) + floatData: list[str] = field( + default_factory=lambda: [ + "crossFadeOffsetRate", + "crossFadeEndRate", + ] + ) + strData: list[str] = field(default_factory=lambda: []) + + +class VoiceChangerV2: + ioRecorder: IORecorder + sola_buffer: AudioInOut + + def __init__(self, params: VoiceChangerParams): + # 初期化 + self.settings = VoiceChangerV2Settings() + self.currentCrossFadeOffsetRate = 0.0 + self.currentCrossFadeEndRate = 0.0 + self.currentCrossFadeOverlapSize = 0 # setting + self.crossfadeSize = 0 # calculated + + self.voiceChanger: VoiceChangerModel | None = None + self.params = params + self.gpu_num = torch.cuda.device_count() + self.prev_audio = np.zeros(4096) + self.mps_enabled: bool = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available() + self.onnx_device = onnxruntime.get_device() + + print(f"VoiceChangerV2 Initialized (GPU_NUM(cuda):{self.gpu_num}, mps_enabled:{self.mps_enabled}, onnx_device:{self.onnx_device})") + + def setModel(self, model: VoiceChangerModel): + self.voiceChanger = model + self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate) + + def get_info(self): + data = asdict(self.settings) + if self.voiceChanger is not None: + data.update(self.voiceChanger.get_info()) + return data + + def get_performance(self): + return self.settings.performance + + def update_settings(self, key: str, val: Any): + if self.voiceChanger is None: + print("[Voice Changer] Voice Changer is not selected.") + return self.get_info() + + if key == "serverAudioStated" and val == 0: + self.settings.inputSampleRate = 48000 + self.settings.outputSampleRate = 48000 + self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate) + + if key in self.settings.intData: + setattr(self.settings, key, int(val)) + if key == "crossFadeOffsetRate" or key == "crossFadeEndRate": + self.crossfadeSize = 0 + if key == "recordIO" and val == 1: + if hasattr(self, "ioRecorder"): + self.ioRecorder.close() + self.ioRecorder = IORecorder(STREAM_INPUT_FILE, STREAM_OUTPUT_FILE, self.settings.inputSampleRate, self.settings.outputSampleRate) + if key == "recordIO" and val == 0: + if hasattr(self, "ioRecorder"): + self.ioRecorder.close() + pass + if key == "recordIO" and val == 2: + if hasattr(self, "ioRecorder"): + self.ioRecorder.close() + if key == "inputSampleRate" or key == "outputSampleRate": + self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate) + elif key in self.settings.floatData: + 
setattr(self.settings, key, float(val)) + elif key in self.settings.strData: + setattr(self.settings, key, str(val)) + else: + ret = self.voiceChanger.update_settings(key, val) + if ret is False: + pass + # print(f"({key} is not mutable variable or unknown variable)") + return self.get_info() + + def _generate_strength(self, crossfadeSize: int): + if self.crossfadeSize != crossfadeSize or self.currentCrossFadeOffsetRate != self.settings.crossFadeOffsetRate or self.currentCrossFadeEndRate != self.settings.crossFadeEndRate or self.currentCrossFadeOverlapSize != self.settings.crossFadeOverlapSize: + self.crossfadeSize = crossfadeSize + self.currentCrossFadeOffsetRate = self.settings.crossFadeOffsetRate + self.currentCrossFadeEndRate = self.settings.crossFadeEndRate + self.currentCrossFadeOverlapSize = self.settings.crossFadeOverlapSize + + cf_offset = int(crossfadeSize * self.settings.crossFadeOffsetRate) + cf_end = int(crossfadeSize * self.settings.crossFadeEndRate) + cf_range = cf_end - cf_offset + percent = np.arange(cf_range) / cf_range + + np_prev_strength = np.cos(percent * 0.5 * np.pi) ** 2 + np_cur_strength = np.cos((1 - percent) * 0.5 * np.pi) ** 2 + + self.np_prev_strength = np.concatenate( + [ + np.ones(cf_offset), + np_prev_strength, + np.zeros(crossfadeSize - cf_offset - len(np_prev_strength)), + ] + ) + self.np_cur_strength = np.concatenate( + [ + np.zeros(cf_offset), + np_cur_strength, + np.ones(crossfadeSize - cf_offset - len(np_cur_strength)), + ] + ) + + print(f"Generated Strengths: for prev:{self.np_prev_strength.shape}, for cur:{self.np_cur_strength.shape}") + + # ひとつ前の結果とサイズが変わるため、記録は消去する。 + if hasattr(self, "np_prev_audio1") is True: + delattr(self, "np_prev_audio1") + if hasattr(self, "sola_buffer") is True: + del self.sola_buffer + + def get_processing_sampling_rate(self): + if self.voiceChanger is None: + return 0 + else: + return self.voiceChanger.get_processing_sampling_rate() + + # receivedData: tuple of short + def on_request(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]: + try: + if self.voiceChanger is None: + raise VoiceChangerIsNotSelectedException("Voice Changer is not selected.") + + with Timer("main-process") as t: + + processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate() + sola_search_frame = int(0.012 * processing_sampling_rate) + block_frame = receivedData.shape[0] + crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame) + self._generate_strength(crossfade_frame) + # data = self.voiceChanger.generate_input(newData, block_frame, crossfade_frame, sola_search_frame) + audio = self.voiceChanger.inference( + receivedData, + crossfade_frame=crossfade_frame, + sola_search_frame=sola_search_frame + ) + if hasattr(self, "sola_buffer") is True: + np.set_printoptions(threshold=10000) + audio_offset = -1 * (sola_search_frame + crossfade_frame + block_frame) + audio = audio[audio_offset:] + + # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC, https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI + cor_nom = np.convolve( + audio[: crossfade_frame + sola_search_frame], + np.flip(self.sola_buffer), + "valid", + ) + cor_den = np.sqrt( + np.convolve( + audio[: crossfade_frame + sola_search_frame] ** 2, + np.ones(crossfade_frame), + "valid", + ) + + 1e-3 + ) + sola_offset = int(np.argmax(cor_nom / cor_den)) + sola_end = sola_offset + block_frame + output_wav = audio[sola_offset:sola_end].astype(np.float64) + output_wav[:crossfade_frame] *= self.np_cur_strength + 
output_wav[:crossfade_frame] += self.sola_buffer[:] + + result = output_wav + else: + print("[Voice Changer] warming up... generating sola buffer.") + result = np.zeros(4096).astype(np.int16) + + if hasattr(self, "sola_buffer") is True and sola_offset < sola_search_frame: + offset = -1 * (sola_search_frame + crossfade_frame - sola_offset) + end = -1 * (sola_search_frame - sola_offset) + sola_buf_org = audio[offset:end] + self.sola_buffer = sola_buf_org * self.np_prev_strength + else: + self.sola_buffer = audio[-crossfade_frame:] * self.np_prev_strength + # self.sola_buffer = audio[- crossfade_frame:] + + mainprocess_time = t.secs + + # 後処理 + with Timer("post-process") as t: + result = result.astype(np.int16) + + print_convert_processing(f" Output data size of {result.shape[0]}/{processing_sampling_rate}hz {result .shape[0]}/{self.settings.outputSampleRate}hz") + + if receivedData.shape[0] != result .shape[0]: + outputData = pad_array(result, receivedData.shape[0]) + pass + else: + outputData = result + + if self.settings.recordIO == 1: + self.ioRecorder.writeInput(receivedData) + self.ioRecorder.writeOutput(outputData.tobytes()) + + postprocess_time = t.secs + + print_convert_processing(f" [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}") + perf = [0, mainprocess_time, postprocess_time] + + return outputData, perf + + except NoModeLoadedException as e: + print("[Voice Changer] [Exception]", e) + return np.zeros(1).astype(np.int16), [0, 0, 0] + except ONNXInputArgumentException as e: + print("[Voice Changer] [Exception] onnx are waiting valid input.", e) + return np.zeros(1).astype(np.int16), [0, 0, 0] + except HalfPrecisionChangingException: + print("[Voice Changer] Switching model configuration....") + return np.zeros(1).astype(np.int16), [0, 0, 0] + except NotEnoughDataExtimateF0: + print("[Voice Changer] warming up... waiting more data.") + return np.zeros(1).astype(np.int16), [0, 0, 0] + except DeviceChangingException as e: + print("[Voice Changer] embedder:", e) + return np.zeros(1).astype(np.int16), [0, 0, 0] + except VoiceChangerIsNotSelectedException: + print("[Voice Changer] Voice Changer is not selected. 
Wait a bit and if there is no improvement, please re-select vc.") + return np.zeros(1).astype(np.int16), [0, 0, 0] + except DeviceCannotSupportHalfPrecisionException: + # RVC.pyでfallback処理をするので、ここはダミーデータ返すだけ。 + return np.zeros(1).astype(np.int16), [0, 0, 0] + except Exception as e: + print("[Voice Changer] VC PROCESSING EXCEPTION!!!", e) + print(traceback.format_exc()) + return np.zeros(1).astype(np.int16), [0, 0, 0] + + def export2onnx(self): + return self.voiceChanger.export2onnx() + + ############## + + def merge_models(self, request: str): + if self.voiceChanger is None: + print("[Voice Changer] Voice Changer is not selected.") + return + self.voiceChanger.merge_models(request) + return self.get_info() + + +PRINT_CONVERT_PROCESSING: bool = False +# PRINT_CONVERT_PROCESSING = True + + +def print_convert_processing(mess: str): + if PRINT_CONVERT_PROCESSING is True: + print(mess) + + +def pad_array(arr: AudioInOut, target_length: int): + current_length = arr.shape[0] + if current_length >= target_length: + return arr + else: + pad_width = target_length - current_length + pad_left = pad_width // 2 + pad_right = pad_width - pad_left + # padded_arr = np.pad( + # arr, (pad_left, pad_right), "constant", constant_values=(0, 0) + # ) + padded_arr = np.pad(arr, (pad_left, pad_right), "edge") + return padded_arr diff --git a/server/voice_changer/common/VolumeExtractor.py b/server/voice_changer/common/VolumeExtractor.py index 9f1d72b1..4ad0713c 100644 --- a/server/voice_changer/common/VolumeExtractor.py +++ b/server/voice_changer/common/VolumeExtractor.py @@ -2,6 +2,8 @@ import numpy as np import torch import torch.nn as nn +from voice_changer.utils.VoiceChangerModel import AudioInOut + class VolumeExtractor: @@ -13,7 +15,7 @@ class VolumeExtractor: "hop_size": self.hop_size } - def extract(self, audio): # audio: 1d numpy array + def extract(self, audio: torch.Tensor): audio = audio.squeeze().cpu() n_frames = int(len(audio) // self.hop_size) + 1 audio2 = audio ** 2 @@ -23,14 +25,40 @@ class VolumeExtractor: volume = np.sqrt(volume) return volume - def get_mask_from_volume(self, volume, block_size: int, threhold=-60.0, device='cpu') -> torch.Tensor: - mask = (volume > 10 ** (float(threhold) / 20)).astype('float') + def extract_t(self, audio: torch.Tensor): + with torch.no_grad(): + audio = audio.squeeze() + n_frames = int(audio.size(0) // self.hop_size) + 1 + audio2 = audio ** 2 + + audio2_frames = audio2.unfold(0, int(self.hop_size), int(self.hop_size)).contiguous() + + volume = torch.mean(audio2_frames, dim=-1) + volume = torch.sqrt(volume) + if volume.size(0) < n_frames: + volume = torch.nn.functional.pad(volume, (0, n_frames - volume.size(0)), 'constant', volume[-1]) + return volume + + def get_mask_from_volume(self, volume, block_size: int, threshold=-60.0, device='cpu') -> torch.Tensor: + volume = volume.cpu().numpy() + mask = (volume > 10 ** (float(threshold) / 20)).astype('float') mask = np.pad(mask, (4, 4), constant_values=(mask[0], mask[-1])) mask = np.array([np.max(mask[n: n + 9]) for n in range(len(mask) - 8)]) mask = torch.from_numpy(mask).float().to(device).unsqueeze(-1).unsqueeze(0) mask = upsample(mask, block_size).squeeze(-1) return mask + def get_mask_from_volume_t(self, volume: torch.Tensor, block_size: int, threshold=-60.0, device='cpu') -> torch.Tensor: + volume = volume.squeeze() + mask = (volume > 10.0 ** (float(threshold) / 20)).float() + mask = nn.functional.pad(mask, (4, 0), 'constant', mask[0]) + mask = nn.functional.pad(mask, (0, 4), 'constant', mask[-1]) + mask = 
torch.max(mask.unfold(-1, 9, 1), -1)[0] + mask = mask.to(device).unsqueeze(-1).unsqueeze(0) + mask = upsample(mask, block_size).squeeze(-1) + print("[get_mask_from_volume_t 3]", mask.shape) + return mask + def upsample(signal: torch.Tensor, factor: int) -> torch.Tensor: signal = signal.permute(0, 2, 1) diff --git a/server/voice_changer/utils/VoiceChangerModel.py b/server/voice_changer/utils/VoiceChangerModel.py index e28690ac..fbe40d6c 100644 --- a/server/voice_changer/utils/VoiceChangerModel.py +++ b/server/voice_changer/utils/VoiceChangerModel.py @@ -9,7 +9,6 @@ PitchfInOut: TypeAlias = np.ndarray[Any, np.dtype[np.int16]] FeatureInOut: TypeAlias = np.ndarray[Any, np.dtype[np.int16]] - class VoiceChangerModel(Protocol): # loadModel: Callable[..., dict[str, Any]] def loadModel(self, params: LoadModelParams):
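Note on the design (not part of the patch): the VoiceChangerV2 module docstring states that it differs from VoiceChanger by delegating resampling to the VoiceChangerModel (to avoid redundant resampling and type conversions) and by dropping the separate pre-process / main-process split; DiffusionSVC is the model it currently applies to. Below is a minimal sketch of that resampling hand-off, assuming 48 kHz client rates and a 44.1 kHz model; the patch builds the two Resample transforms in createPipeline with dtype=torch.int16 on the target device, while this sketch keeps torchaudio defaults.

import torch
from torchaudio.transforms import Resample

input_sr, output_sr, model_sr = 48000, 48000, 44100   # assumed client/model rates for illustration

resampler_in = Resample(input_sr, 16000)      # patch: created once in createPipeline, moved to the device
resampler_out = Resample(model_sr, output_sr)

received = torch.randn(6144)                  # one audio block from the client at input_sr
audio16k = resampler_in(received)             # 6144 * 16000 / 48000 = 2048 samples for the embedder/f0 path
converted = torch.randn(int(6144 * model_sr / input_sr))   # stand-in for the inferencer output at model_sr
client_out = resampler_out(converted)         # resampled back to the client's output rate

print(audio16k.shape, client_out.shape)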
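VoiceChangerV2.on_request aligns each converted chunk against the tail kept from the previous one using the SOLA search credited in the code to DDSP-SVC and Retrieval-based-Voice-Conversion-WebUI. A small NumPy sketch of just that offset search, with made-up frame sizes and a synthetic sine standing in for the audio:

import numpy as np

def find_sola_offset(audio: np.ndarray, sola_buffer: np.ndarray, search: int) -> int:
    crossfade = sola_buffer.shape[0]
    head = audio[:crossfade + search]
    # cross-correlation of the new chunk against the previous chunk's tail
    cor_nom = np.convolve(head, np.flip(sola_buffer), "valid")
    # energy normalisation so louder frames do not dominate the score
    cor_den = np.sqrt(np.convolve(head ** 2, np.ones(crossfade), "valid") + 1e-3)
    return int(np.argmax(cor_nom / cor_den))

if __name__ == "__main__":
    sr = 16000
    t = np.arange(sr) / sr
    audio = np.sin(2 * np.pi * 220 * t)
    crossfade, search = 256, 192
    sola_buffer = audio[37:37 + crossfade]      # pretend this is the tail of the previous chunk
    print(find_sola_offset(audio, sola_buffer, search))   # expect 37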
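_generate_strength caches a pair of complementary cos² fade curves sized from crossFadeOverlapSize and the offset/end rates. A short sketch using the defaults from VoiceChangerV2Settings, checking that the two curves sum to one across the overlap:

import numpy as np

def crossfade_strengths(size: int, offset_rate: float = 0.1, end_rate: float = 0.9):
    cf_offset = int(size * offset_rate)
    cf_end = int(size * end_rate)
    percent = np.arange(cf_end - cf_offset) / (cf_end - cf_offset)
    prev_fade = np.cos(percent * 0.5 * np.pi) ** 2          # fades the previous chunk out
    cur_fade = np.cos((1 - percent) * 0.5 * np.pi) ** 2     # fades the current chunk in
    prev_strength = np.concatenate([np.ones(cf_offset), prev_fade, np.zeros(size - cf_end)])
    cur_strength = np.concatenate([np.zeros(cf_offset), cur_fade, np.ones(size - cf_end)])
    return prev_strength, cur_strength

prev_s, cur_s = crossfade_strengths(4096)
print(prev_s.shape, np.allclose(prev_s + cur_s, 1.0))       # complementary: sums to 1 everywhere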
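The VolumeExtractor addition swaps the NumPy per-frame loop for a tensor unfold (the commented-out Timer comparison in Pipeline.extract_volume_and_mask is the measurement behind that choice). A self-contained sketch of the unfold-based RMS and threshold-mask path; frame_rms, volume_mask and the hop/threshold values are illustrative names and defaults, not the patch's API.

import torch
import torch.nn.functional as F

def frame_rms(audio: torch.Tensor, hop_size: int) -> torch.Tensor:
    # audio: 1-D waveform. RMS per hop-sized frame, padded to n_frames with the
    # last value, mirroring VolumeExtractor.extract_t.
    n_frames = int(audio.size(0) // hop_size) + 1
    frames = (audio ** 2).unfold(0, hop_size, hop_size)   # (num_full_frames, hop_size)
    volume = frames.mean(dim=-1).sqrt()
    if volume.size(0) < n_frames:
        volume = F.pad(volume, (0, n_frames - volume.size(0)), "constant", volume[-1].item())
    return volume

def volume_mask(volume: torch.Tensor, threshold_db: float = -60.0) -> torch.Tensor:
    # Frames count as voiced when RMS exceeds the dB threshold; the 9-tap unfold/max
    # dilates the mask by 4 frames on each side, as in get_mask_from_volume_t.
    mask = (volume > 10.0 ** (threshold_db / 20)).float()
    mask = F.pad(mask, (4, 0), "constant", mask[0].item())
    mask = F.pad(mask, (0, 4), "constant", mask[-1].item())
    return mask.unfold(0, 9, 1).max(dim=-1).values

if __name__ == "__main__":
    audio = torch.cat([torch.zeros(4096), 0.3 * torch.randn(4096)])
    vol = frame_rms(audio, hop_size=512)
    print(vol.shape, volume_mask(vol).shape)   # both torch.Size([17])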