From 8a8386640d08fe808526f6480ab7813a1d371240 Mon Sep 17 00:00:00 2001 From: wataru Date: Fri, 16 Jun 2023 15:06:35 +0900 Subject: [PATCH] WIP:separate server device --- server/voice_changer/Local/AudioDeviceList.py | 4 +- server/voice_changer/Local/ServerDevice.py | 190 ++++++++++++++++++ server/voice_changer/VoiceChanger.py | 157 +-------------- server/voice_changer/VoiceChangerManager.py | 31 ++- 4 files changed, 228 insertions(+), 154 deletions(-) create mode 100644 server/voice_changer/Local/ServerDevice.py diff --git a/server/voice_changer/Local/AudioDeviceList.py b/server/voice_changer/Local/AudioDeviceList.py index 38d2186d..d0655e8d 100644 --- a/server/voice_changer/Local/AudioDeviceList.py +++ b/server/voice_changer/Local/AudioDeviceList.py @@ -25,8 +25,8 @@ def list_audio_device(): # print("output:", outputDeviceList) # print("hostapis", hostapis) - serverAudioInputDevices = [] - serverAudioOutputDevices = [] + serverAudioInputDevices: list[ServerAudioDevice] = [] + serverAudioOutputDevices: list[ServerAudioDevice] = [] for d in inputAudioDeviceList: serverInputAudioDevice: ServerAudioDevice = ServerAudioDevice( kind=ServerAudioDeviceTypes.audioinput, diff --git a/server/voice_changer/Local/ServerDevice.py b/server/voice_changer/Local/ServerDevice.py new file mode 100644 index 00000000..58b3434c --- /dev/null +++ b/server/voice_changer/Local/ServerDevice.py @@ -0,0 +1,190 @@ +from dataclasses import dataclass, asdict + +import numpy as np + +from voice_changer.Local.AudioDeviceList import list_audio_device +import time +import sounddevice as sd +from voice_changer.utils.Timer import Timer +import librosa + +from voice_changer.utils.VoiceChangerModel import AudioInOut +from typing import Protocol + + +@dataclass +class ServerDeviceSettings: + enableServerAudio: int = 0 # 0:off, 1:on + serverAudioStated: int = 0 # 0:off, 1:on + serverInputAudioSampleRate: int = 44100 + serverOutputAudioSampleRate: int = 44100 + serverInputDeviceId: int = -1 + serverOutputDeviceId: int = -1 + serverReadChunkSize: int = 256 + serverInputAudioGain: float = 1.0 + serverOutputAudioGain: float = 1.0 + + +EditableServerDeviceSettings = { + "intData": [ + "enableServerAudio", + "serverAudioStated", + "serverInputAudioSampleRate", + "serverOutputAudioSampleRate", + "serverInputDeviceId", + "serverOutputDeviceId", + "serverReadChunkSize", + ], + "floatData": [ + "serverInputAudioGain", + "serverOutputAudioGain", + ], +} + + +class ServerDeviceCallbacks(Protocol): + def on_request(self, unpackedData: AudioInOut): + ... + + def emitTo(self, performance: list[float]): + ... + + def get_processing_sampling_rate(self): + ... + + def setSamplingRate(self, sr: int): + ... 
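# A minimal illustrative sketch (not part of the patch) of what an
# implementation of the ServerDeviceCallbacks protocol above has to provide
# before it can drive a ServerDevice. The class name PassthroughCallbacks and
# the fixed 48000 Hz rate are assumptions for illustration only; in the patch
# itself the protocol is implemented by VoiceChangerManager.
class PassthroughCallbacks:
    def on_request(self, unpackedData: AudioInOut):
        # Return the input unchanged plus an empty timing list; a real
        # implementation runs voice conversion here and returns
        # (converted_audio, per-stage timings).
        return unpackedData, []

    def emitTo(self, performance: list[float]):
        print("performance:", performance)

    def get_processing_sampling_rate(self):
        return 48000  # assumed fixed processing rate for this sketch

    def setSamplingRate(self, sr: int):
        print("sampling rate requested:", sr)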
+ + +class ServerDevice: + def __init__(self, serverDeviceCallbacks: ServerDeviceCallbacks): + self.settings = ServerDeviceSettings() + self.serverDeviceCallbacks = serverDeviceCallbacks + + def getServerInputAudioDevice(self, index: int): + audioinput, _audiooutput = list_audio_device() + serverAudioDevice = [x for x in audioinput if x.index == index] + if len(serverAudioDevice) > 0: + return serverAudioDevice[0] + else: + return None + + def getServerOutputAudioDevice(self, index: int): + _audioinput, audiooutput = list_audio_device() + serverAudioDevice = [x for x in audiooutput if x.index == index] + if len(serverAudioDevice) > 0: + return serverAudioDevice[0] + else: + return None + + def audio_callback(self, indata: np.ndarray, outdata: np.ndarray, frames, times, status): + try: + indata = indata * self.settings.serverInputAudioGain + with Timer("all_inference_time") as t: + unpackedData = librosa.to_mono(indata.T) * 32768.0 + out_wav, times = self.serverDeviceCallbacks.on_request(unpackedData) + outputChunnels = outdata.shape[1] + outdata[:] = np.repeat(out_wav, outputChunnels).reshape(-1, outputChunnels) / 32768.0 + outdata[:] = outdata * self.settings.serverOutputAudioGain + all_inference_time = t.secs + self.performance = [all_inference_time] + times + self.serverDeviceCallbacks.emitTo(self.performance) + self.performance = [round(x * 1000) for x in self.performance] + except Exception as e: + print("[Voice Changer] ex:", e) + + def start(self): + # currentInputDeviceId = -1 + # currentOutputDeviceId = -1 + # currentInputChunkNum = -1 + currentModelSamplingRate = -1 + while True: + if self.settings.serverAudioStated == 0 or self.settings.serverInputDeviceId == -1: + # self.settings.inputSampleRate = 48000 + time.sleep(2) + else: + sd._terminate() + sd._initialize() + + sd.default.device[0] = self.settings.serverInputDeviceId + # currentInputDeviceId = self.settings.serverInputDeviceId + sd.default.device[1] = self.settings.serverOutputDeviceId + # currentOutputDeviceId = self.settings.serverOutputDeviceId + + serverInputAudioDevice = self.getServerInputAudioDevice(sd.default.device[0]) + serverOutputAudioDevice = self.getServerOutputAudioDevice(sd.default.device[1]) + print(serverInputAudioDevice, serverOutputAudioDevice) + if serverInputAudioDevice is None or serverOutputAudioDevice is None: + time.sleep(2) + print("serverInputAudioDevice or serverOutputAudioDevice is None") + continue + + # currentInputChannelNum = serverInputAudioDevice.maxInputChannels + # currentOutputChannelNum = serverOutputAudioDevice.maxOutputChannels + sd.default.channels[0] = serverInputAudioDevice.maxInputChannels + sd.default.channels[1] = serverOutputAudioDevice.maxOutputChannels + + currentInputChunkNum = self.settings.serverReadChunkSize + block_frame = currentInputChunkNum * 128 + + # sample rate precheck(alsa cannot use 40000?) 
+ try: + currentModelSamplingRate = self.serverDeviceCallbacks.get_processing_sampling_rate() + except Exception as e: + print("[Voice Changer] ex: get_processing_sampling_rate", e) + continue + try: + with sd.Stream( + callback=self.audio_callback, + blocksize=block_frame, + # samplerate=currentModelSamplingRate, + dtype="float32", + # channels=[currentInputChannelNum, currentOutputChannelNum], + ): + pass + self.settings.serverInputAudioSampleRate = currentModelSamplingRate + self.serverDeviceCallbacks.setSamplingRate(currentModelSamplingRate) + print(f"[Voice Changer] sample rate {self.settings.serverInputAudioSampleRate}") + except Exception as e: + print("[Voice Changer] ex: fallback to device default samplerate", e) + print("[Voice Changer] device default samplerate", serverInputAudioDevice.default_samplerate) + self.settings.serverInputAudioSampleRate = round(serverInputAudioDevice.default_samplerate) + self.serverDeviceCallbacks.setSamplingRate(round(serverInputAudioDevice.default_samplerate)) + + sd.default.samplerate = self.settings.serverInputAudioSampleRate + sd.default.blocksize = block_frame + # main loop + try: + with sd.Stream( + callback=self.audio_callback, + # blocksize=block_frame, + # samplerate=vc.settings.serverInputAudioSampleRate, + dtype="float32", + # channels=[currentInputChannelNum, currentOutputChannelNum], + ): + while self.settings.serverAudioStated == 1 and sd.default.device[0] == self.settings.serverInputDeviceId and sd.default.device[1] == self.settings.serverOutputDeviceId and currentModelSamplingRate == self.serverDeviceCallbacks.get_processing_sampling_rate() and currentInputChunkNum == self.settings.serverReadChunkSize: + time.sleep(2) + print( + "[Voice Changer] server audio", + self.performance, + ) + print(f"[Voice Changer] started:{self.settings.serverAudioStated}, input:{sd.default.device[0]}, output:{sd.default.device[1]}, mic_sr:{self.settings.serverInputAudioSampleRate}, model_sr:{currentModelSamplingRate}, chunk:{currentInputChunkNum}, ch:[{sd.default.channels}]") + + except Exception as e: + print("[Voice Changer] ex:", e) + time.sleep(2) + + def get_info(self): + data = asdict(self.settings) + audioinput, audiooutput = list_audio_device() + data["serverAudioInputDevices"] = audioinput + data["serverAudioOutputDevices"] = audiooutput + + return data + + def update_settings(self, key: str, val: str | int | float): + if key in EditableServerDeviceSettings["intData"]: + setattr(self.settings, key, int(val)) + elif key in EditableServerDeviceSettings["floatData"]: + setattr(self.settings, key, float(val)) + return self.get_info() diff --git a/server/voice_changer/VoiceChanger.py b/server/voice_changer/VoiceChanger.py index 7277b44d..4877b6c9 100755 --- a/server/voice_changer/VoiceChanger.py +++ b/server/voice_changer/VoiceChanger.py @@ -11,7 +11,6 @@ import resampy from voice_changer.IORecorder import IORecorder -from voice_changer.Local.AudioDeviceList import ServerAudioDevice, list_audio_device from voice_changer.utils.LoadModelParams import LoadModelParams from voice_changer.utils.Timer import Timer @@ -26,10 +25,6 @@ from Exceptions import ( VoiceChangerIsNotSelectedException, ) from voice_changer.utils.VoiceChangerParams import VoiceChangerParams -import threading -import time -import sounddevice as sd -import librosa STREAM_INPUT_FILE = os.path.join(TMP_DIR, "in.wav") STREAM_OUTPUT_FILE = os.path.join(TMP_DIR, "out.wav") @@ -44,22 +39,7 @@ class VoiceChangerSettings: crossFadeOverlapSize: int = 4096 recordIO: int = 0 # 0:off, 1:on - 
serverAudioInputDevices: list[ServerAudioDevice] = field(default_factory=lambda: []) - serverAudioOutputDevices: list[ServerAudioDevice] = field(default_factory=lambda: []) - enableServerAudio: int = 0 # 0:off, 1:on - serverAudioStated: int = 0 # 0:off, 1:on - # serverInputAudioSampleRate: int = 48000 - # serverOutputAudioSampleRate: int = 48000 - serverInputAudioSampleRate: int = 44100 - serverOutputAudioSampleRate: int = 44100 - # serverInputAudioBufferSize: int = 1024 * 24 - # serverOutputAudioBufferSize: int = 1024 * 24 - serverInputDeviceId: int = -1 - serverOutputDeviceId: int = -1 - serverReadChunkSize: int = 256 - serverInputAudioGain: float = 1.0 - serverOutputAudioGain: float = 1.0 performance: list[int] = field(default_factory=lambda: [0, 0, 0, 0]) # ↓mutableな物だけ列挙 @@ -68,23 +48,12 @@ class VoiceChangerSettings: "inputSampleRate", "crossFadeOverlapSize", "recordIO", - "enableServerAudio", - "serverAudioStated", - "serverInputAudioSampleRate", - "serverOutputAudioSampleRate", - # "serverInputAudioBufferSize", - # "serverOutputAudioBufferSize", - "serverInputDeviceId", - "serverOutputDeviceId", - "serverReadChunkSize", ] ) floatData: list[str] = field( default_factory=lambda: [ "crossFadeOffsetRate", "crossFadeEndRate", - "serverInputAudioGain", - "serverOutputAudioGain", ] ) strData: list[str] = field(default_factory=lambda: []) @@ -101,120 +70,6 @@ class VoiceChanger: emitTo = None - def audio_callback(self, indata: np.ndarray, outdata: np.ndarray, frames, times, status): - try: - indata = indata * self.settings.serverInputAudioGain - with Timer("all_inference_time") as t: - unpackedData = librosa.to_mono(indata.T) * 32768.0 - out_wav, times = self.on_request(unpackedData) - outputChunnels = outdata.shape[1] - outdata[:] = np.repeat(out_wav, outputChunnels).reshape(-1, outputChunnels) / 32768.0 - outdata[:] = outdata * self.settings.serverOutputAudioGain - all_inference_time = t.secs - performance = [all_inference_time] + times - if self.emitTo is not None: - self.emitTo(performance) - self.settings.performance = [round(x * 1000) for x in performance] - except Exception as e: - print("[Voice Changer] ex:", e) - - def getServerAudioDevice(self, audioDeviceList: list[ServerAudioDevice], index: int): - serverAudioDevice = [x for x in audioDeviceList if x.index == index] - if len(serverAudioDevice) > 0: - return serverAudioDevice[0] - else: - return None - - def serverLocal(self, _vc): - vc: VoiceChanger = _vc - - currentInputDeviceId = -1 - currentModelSamplingRate = -1 - currentOutputDeviceId = -1 - currentInputChunkNum = -1 - while True: - if vc.settings.serverAudioStated == 0 or vc.settings.serverInputDeviceId == -1 or vc.voiceChanger is None: - vc.settings.inputSampleRate = 48000 - time.sleep(2) - else: - sd._terminate() - sd._initialize() - - sd.default.device[0] = vc.settings.serverInputDeviceId - currentInputDeviceId = vc.settings.serverInputDeviceId - sd.default.device[1] = vc.settings.serverOutputDeviceId - currentOutputDeviceId = vc.settings.serverOutputDeviceId - - currentInputChannelNum = vc.settings.serverAudioInputDevices - - serverInputAudioDevice = self.getServerAudioDevice(vc.settings.serverAudioInputDevices, currentInputDeviceId) - serverOutputAudioDevice = self.getServerAudioDevice(vc.settings.serverAudioOutputDevices, currentOutputDeviceId) - print(serverInputAudioDevice, serverOutputAudioDevice) - if serverInputAudioDevice is None or serverOutputAudioDevice is None: - time.sleep(2) - print("serverInputAudioDevice or serverOutputAudioDevice is None") - continue 
- - currentInputChannelNum = serverInputAudioDevice.maxInputChannels - currentOutputChannelNum = serverOutputAudioDevice.maxOutputChannels - - currentInputChunkNum = vc.settings.serverReadChunkSize - block_frame = currentInputChunkNum * 128 - - # sample rate precheck(alsa cannot use 40000?) - try: - currentModelSamplingRate = self.voiceChanger.get_processing_sampling_rate() - except Exception as e: - print("[Voice Changer] ex: get_processing_sampling_rate", e) - continue - try: - with sd.Stream( - callback=self.audio_callback, - blocksize=block_frame, - samplerate=currentModelSamplingRate, - dtype="float32", - channels=[currentInputChannelNum, currentOutputChannelNum], - ): - pass - vc.settings.serverInputAudioSampleRate = currentModelSamplingRate - vc.settings.inputSampleRate = currentModelSamplingRate - print(f"[Voice Changer] sample rate {vc.settings.serverInputAudioSampleRate}") - except Exception as e: - print( - "[Voice Changer] ex: fallback to device default samplerate", - e, - ) - vc.settings.serverInputAudioSampleRate = serverInputAudioDevice.default_samplerate - vc.settings.inputSampleRate = vc.settings.serverInputAudioSampleRate - - # main loop - try: - with sd.Stream( - callback=self.audio_callback, - blocksize=block_frame, - samplerate=vc.settings.serverInputAudioSampleRate, - dtype="float32", - channels=[currentInputChannelNum, currentOutputChannelNum], - ): - while vc.settings.serverAudioStated == 1 and currentInputDeviceId == vc.settings.serverInputDeviceId and currentOutputDeviceId == vc.settings.serverOutputDeviceId and currentModelSamplingRate == self.voiceChanger.get_processing_sampling_rate() and currentInputChunkNum == vc.settings.serverReadChunkSize: - time.sleep(2) - print( - "[Voice Changer] server audio", - self.settings.performance, - ) - print( - "[Voice Changer] info:", - vc.settings.serverAudioStated, - currentInputDeviceId, - currentOutputDeviceId, - vc.settings.serverInputAudioSampleRate, - currentInputChunkNum, - ) - - except Exception as e: - print("[Voice Changer] ex:", e) - time.sleep(2) - def __init__(self, params: VoiceChangerParams): # 初期化 self.settings = VoiceChangerSettings() @@ -231,12 +86,6 @@ class VoiceChanger: self.prev_audio = np.zeros(4096) self.mps_enabled: bool = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available() - audioinput, audiooutput = list_audio_device() - self.settings.serverAudioInputDevices = audioinput - self.settings.serverAudioOutputDevices = audiooutput - - thread = threading.Thread(target=self.serverLocal, args=(self,)) - thread.start() print(f"VoiceChanger Initialized (GPU_NUM:{self.gpu_num}, mps_enabled:{self.mps_enabled})") def switchModelType(self, modelType: ModelType): @@ -375,6 +224,12 @@ class VoiceChanger: if hasattr(self, "sola_buffer") is True: del self.sola_buffer + def get_processing_sampling_rate(self): + if self.voiceChanger is None: + return 0 + else: + return self.voiceChanger.get_processing_sampling_rate() + # receivedData: tuple of short def on_request(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]: return self.on_request_sola(receivedData) diff --git a/server/voice_changer/VoiceChangerManager.py b/server/voice_changer/VoiceChangerManager.py index cc4b7d57..322332f4 100644 --- a/server/voice_changer/VoiceChangerManager.py +++ b/server/voice_changer/VoiceChangerManager.py @@ -1,4 +1,5 @@ import numpy as np +from voice_changer.Local.ServerDevice import ServerDevice, ServerDeviceCallbacks from voice_changer.VoiceChanger import VoiceChanger from 
const import ModelType from voice_changer.utils.LoadModelParams import LoadModelParams @@ -6,6 +7,7 @@ from voice_changer.utils.VoiceChangerModel import AudioInOut from voice_changer.utils.VoiceChangerParams import VoiceChangerParams from dataclasses import dataclass, asdict import torch +import threading @dataclass() @@ -22,15 +24,38 @@ class VoiceChangerManagerSettings: # intData: list[str] = field(default_factory=lambda: ["slotIndex"]) -class VoiceChangerManager(object): +class VoiceChangerManager(ServerDeviceCallbacks): _instance = None + ############################ + # ServerDeviceCallbacks + ############################ + def on_request(self, unpackedData: AudioInOut): + return self.changeVoice(unpackedData) + + def emitTo(self, performance: list[float]): + print("emit ", performance) + + def get_processing_sampling_rate(self): + return self.voiceChanger.get_processing_sampling_rate() + + def setSamplingRate(self, sr: int): + self.voiceChanger.settings.inputSampleRate = sr + + ############################ + # VoiceChangerManager + ############################ def __init__(self, params: VoiceChangerParams): self.voiceChanger: VoiceChanger = None self.settings: VoiceChangerManagerSettings = VoiceChangerManagerSettings(dummy=0) # スタティックな情報を収集 self.gpus: list[GPUInfo] = self._get_gpuInfos() + self.serverDevice = ServerDevice(self) + + thread = threading.Thread(target=self.serverDevice.start, args=()) + thread.start() + def _get_gpuInfos(self): devCount = torch.cuda.device_count() gpus = [] @@ -62,6 +87,9 @@ class VoiceChangerManager(object): data["status"] = "OK" + info = self.serverDevice.get_info() + data.update(info) + if hasattr(self, "voiceChanger"): info = self.voiceChanger.get_info() data.update(info) @@ -77,6 +105,7 @@ class VoiceChangerManager(object): return {"status": "ERROR", "msg": "no model loaded"} def update_settings(self, key: str, val: str | int | float): + self.serverDevice.update_settings(key, val) if hasattr(self, "voiceChanger"): self.voiceChanger.update_settings(key, val) else:
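# A minimal usage sketch, assuming an already constructed VoiceChangerManager
# instance named `manager`; the device indices and chunk size below are
# placeholders, not values from this repository. It shows how the new
# ServerDevice settings are reached through the update_settings delegation
# added in this patch.
#
# manager = VoiceChangerManager(params)  # params: VoiceChangerParams
manager.update_settings("serverInputDeviceId", 1)    # placeholder input device index
manager.update_settings("serverOutputDeviceId", 3)   # placeholder output device index
manager.update_settings("serverReadChunkSize", 256)  # block size becomes 256 * 128 = 32768 frames
manager.update_settings("serverAudioStated", 1)      # the background ServerDevice.start() loop opens the stream once this is 1
print(manager.get_info())                            # merged info now includes ServerDevice settings and device lists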