From 8a8386640d08fe808526f6480ab7813a1d371240 Mon Sep 17 00:00:00 2001 From: wataru Date: Fri, 16 Jun 2023 15:06:35 +0900 Subject: [PATCH] WIP:separate server device --- server/voice_changer/Local/AudioDeviceList.py | 4 +- server/voice_changer/Local/ServerDevice.py | 190 ++++++++++++++++++ server/voice_changer/VoiceChanger.py | 157 +-------------- server/voice_changer/VoiceChangerManager.py | 31 ++- 4 files changed, 228 insertions(+), 154 deletions(-) create mode 100644 server/voice_changer/Local/ServerDevice.py diff --git a/server/voice_changer/Local/AudioDeviceList.py b/server/voice_changer/Local/AudioDeviceList.py index 38d2186d..d0655e8d 100644 --- a/server/voice_changer/Local/AudioDeviceList.py +++ b/server/voice_changer/Local/AudioDeviceList.py @@ -25,8 +25,8 @@ def list_audio_device(): # print("output:", outputDeviceList) # print("hostapis", hostapis) - serverAudioInputDevices = [] - serverAudioOutputDevices = [] + serverAudioInputDevices: list[ServerAudioDevice] = [] + serverAudioOutputDevices: list[ServerAudioDevice] = [] for d in inputAudioDeviceList: serverInputAudioDevice: ServerAudioDevice = ServerAudioDevice( kind=ServerAudioDeviceTypes.audioinput, diff --git a/server/voice_changer/Local/ServerDevice.py b/server/voice_changer/Local/ServerDevice.py new file mode 100644 index 00000000..58b3434c --- /dev/null +++ b/server/voice_changer/Local/ServerDevice.py @@ -0,0 +1,190 @@ +from dataclasses import dataclass, asdict + +import numpy as np + +from voice_changer.Local.AudioDeviceList import list_audio_device +import time +import sounddevice as sd +from voice_changer.utils.Timer import Timer +import librosa + +from voice_changer.utils.VoiceChangerModel import AudioInOut +from typing import Protocol + + +@dataclass +class ServerDeviceSettings: + enableServerAudio: int = 0 # 0:off, 1:on + serverAudioStated: int = 0 # 0:off, 1:on + serverInputAudioSampleRate: int = 44100 + serverOutputAudioSampleRate: int = 44100 + serverInputDeviceId: int = -1 + serverOutputDeviceId: int = -1 + serverReadChunkSize: int = 256 + serverInputAudioGain: float = 1.0 + serverOutputAudioGain: float = 1.0 + + +EditableServerDeviceSettings = { + "intData": [ + "enableServerAudio", + "serverAudioStated", + "serverInputAudioSampleRate", + "serverOutputAudioSampleRate", + "serverInputDeviceId", + "serverOutputDeviceId", + "serverReadChunkSize", + ], + "floatData": [ + "serverInputAudioGain", + "serverOutputAudioGain", + ], +} + + +class ServerDeviceCallbacks(Protocol): + def on_request(self, unpackedData: AudioInOut): + ... + + def emitTo(self, performance: list[float]): + ... + + def get_processing_sampling_rate(self): + ... + + def setSamplingRate(self, sr: int): + ... 
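# A minimal illustrative sketch (not part of the patch) of what an
# implementation of the ServerDeviceCallbacks protocol above has to provide
# before it can drive a ServerDevice. The class name PassthroughCallbacks and
# the fixed 48000 Hz rate are assumptions for illustration only; in the patch
# itself the protocol is implemented by VoiceChangerManager.
class PassthroughCallbacks:
    def on_request(self, unpackedData: AudioInOut):
        # Return the input unchanged plus an empty timing list; a real
        # implementation runs voice conversion here and returns
        # (converted_audio, per-stage timings).
        return unpackedData, []

    def emitTo(self, performance: list[float]):
        print("performance:", performance)

    def get_processing_sampling_rate(self):
        return 48000  # assumed fixed processing rate for this sketch

    def setSamplingRate(self, sr: int):
        print("sampling rate requested:", sr)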
+ + +class ServerDevice: + def __init__(self, serverDeviceCallbacks: ServerDeviceCallbacks): + self.settings = ServerDeviceSettings() + self.serverDeviceCallbacks = serverDeviceCallbacks + + def getServerInputAudioDevice(self, index: int): + audioinput, _audiooutput = list_audio_device() + serverAudioDevice = [x for x in audioinput if x.index == index] + if len(serverAudioDevice) > 0: + return serverAudioDevice[0] + else: + return None + + def getServerOutputAudioDevice(self, index: int): + _audioinput, audiooutput = list_audio_device() + serverAudioDevice = [x for x in audiooutput if x.index == index] + if len(serverAudioDevice) > 0: + return serverAudioDevice[0] + else: + return None + + def audio_callback(self, indata: np.ndarray, outdata: np.ndarray, frames, times, status): + try: + indata = indata * self.settings.serverInputAudioGain + with Timer("all_inference_time") as t: + unpackedData = librosa.to_mono(indata.T) * 32768.0 + out_wav, times = self.serverDeviceCallbacks.on_request(unpackedData) + outputChunnels = outdata.shape[1] + outdata[:] = np.repeat(out_wav, outputChunnels).reshape(-1, outputChunnels) / 32768.0 + outdata[:] = outdata * self.settings.serverOutputAudioGain + all_inference_time = t.secs + self.performance = [all_inference_time] + times + self.serverDeviceCallbacks.emitTo(self.performance) + self.performance = [round(x * 1000) for x in self.performance] + except Exception as e: + print("[Voice Changer] ex:", e) + + def start(self): + # currentInputDeviceId = -1 + # currentOutputDeviceId = -1 + # currentInputChunkNum = -1 + currentModelSamplingRate = -1 + while True: + if self.settings.serverAudioStated == 0 or self.settings.serverInputDeviceId == -1: + # self.settings.inputSampleRate = 48000 + time.sleep(2) + else: + sd._terminate() + sd._initialize() + + sd.default.device[0] = self.settings.serverInputDeviceId + # currentInputDeviceId = self.settings.serverInputDeviceId + sd.default.device[1] = self.settings.serverOutputDeviceId + # currentOutputDeviceId = self.settings.serverOutputDeviceId + + serverInputAudioDevice = self.getServerInputAudioDevice(sd.default.device[0]) + serverOutputAudioDevice = self.getServerOutputAudioDevice(sd.default.device[1]) + print(serverInputAudioDevice, serverOutputAudioDevice) + if serverInputAudioDevice is None or serverOutputAudioDevice is None: + time.sleep(2) + print("serverInputAudioDevice or serverOutputAudioDevice is None") + continue + + # currentInputChannelNum = serverInputAudioDevice.maxInputChannels + # currentOutputChannelNum = serverOutputAudioDevice.maxOutputChannels + sd.default.channels[0] = serverInputAudioDevice.maxInputChannels + sd.default.channels[1] = serverOutputAudioDevice.maxOutputChannels + + currentInputChunkNum = self.settings.serverReadChunkSize + block_frame = currentInputChunkNum * 128 + + # sample rate precheck(alsa cannot use 40000?) 
+ try: + currentModelSamplingRate = self.serverDeviceCallbacks.get_processing_sampling_rate() + except Exception as e: + print("[Voice Changer] ex: get_processing_sampling_rate", e) + continue + try: + with sd.Stream( + callback=self.audio_callback, + blocksize=block_frame, + # samplerate=currentModelSamplingRate, + dtype="float32", + # channels=[currentInputChannelNum, currentOutputChannelNum], + ): + pass + self.settings.serverInputAudioSampleRate = currentModelSamplingRate + self.serverDeviceCallbacks.setSamplingRate(currentModelSamplingRate) + print(f"[Voice Changer] sample rate {self.settings.serverInputAudioSampleRate}") + except Exception as e: + print("[Voice Changer] ex: fallback to device default samplerate", e) + print("[Voice Changer] device default samplerate", serverInputAudioDevice.default_samplerate) + self.settings.serverInputAudioSampleRate = round(serverInputAudioDevice.default_samplerate) + self.serverDeviceCallbacks.setSamplingRate(round(serverInputAudioDevice.default_samplerate)) + + sd.default.samplerate = self.settings.serverInputAudioSampleRate + sd.default.blocksize = block_frame + # main loop + try: + with sd.Stream( + callback=self.audio_callback, + # blocksize=block_frame, + # samplerate=vc.settings.serverInputAudioSampleRate, + dtype="float32", + # channels=[currentInputChannelNum, currentOutputChannelNum], + ): + while self.settings.serverAudioStated == 1 and sd.default.device[0] == self.settings.serverInputDeviceId and sd.default.device[1] == self.settings.serverOutputDeviceId and currentModelSamplingRate == self.serverDeviceCallbacks.get_processing_sampling_rate() and currentInputChunkNum == self.settings.serverReadChunkSize: + time.sleep(2) + print( + "[Voice Changer] server audio", + self.performance, + ) + print(f"[Voice Changer] started:{self.settings.serverAudioStated}, input:{sd.default.device[0]}, output:{sd.default.device[1]}, mic_sr:{self.settings.serverInputAudioSampleRate}, model_sr:{currentModelSamplingRate}, chunk:{currentInputChunkNum}, ch:[{sd.default.channels}]") + + except Exception as e: + print("[Voice Changer] ex:", e) + time.sleep(2) + + def get_info(self): + data = asdict(self.settings) + audioinput, audiooutput = list_audio_device() + data["serverAudioInputDevices"] = audioinput + data["serverAudioOutputDevices"] = audiooutput + + return data + + def update_settings(self, key: str, val: str | int | float): + if key in EditableServerDeviceSettings["intData"]: + setattr(self.settings, key, int(val)) + elif key in EditableServerDeviceSettings["floatData"]: + setattr(self.settings, key, float(val)) + return self.get_info() diff --git a/server/voice_changer/VoiceChanger.py b/server/voice_changer/VoiceChanger.py index 7277b44d..4877b6c9 100755 --- a/server/voice_changer/VoiceChanger.py +++ b/server/voice_changer/VoiceChanger.py @@ -11,7 +11,6 @@ import resampy from voice_changer.IORecorder import IORecorder -from voice_changer.Local.AudioDeviceList import ServerAudioDevice, list_audio_device from voice_changer.utils.LoadModelParams import LoadModelParams from voice_changer.utils.Timer import Timer @@ -26,10 +25,6 @@ from Exceptions import ( VoiceChangerIsNotSelectedException, ) from voice_changer.utils.VoiceChangerParams import VoiceChangerParams -import threading -import time -import sounddevice as sd -import librosa STREAM_INPUT_FILE = os.path.join(TMP_DIR, "in.wav") STREAM_OUTPUT_FILE = os.path.join(TMP_DIR, "out.wav") @@ -44,22 +39,7 @@ class VoiceChangerSettings: crossFadeOverlapSize: int = 4096 recordIO: int = 0 # 0:off, 1:on - 
serverAudioInputDevices: list[ServerAudioDevice] = field(default_factory=lambda: []) - serverAudioOutputDevices: list[ServerAudioDevice] = field(default_factory=lambda: []) - enableServerAudio: int = 0 # 0:off, 1:on - serverAudioStated: int = 0 # 0:off, 1:on - # serverInputAudioSampleRate: int = 48000 - # serverOutputAudioSampleRate: int = 48000 - serverInputAudioSampleRate: int = 44100 - serverOutputAudioSampleRate: int = 44100 - # serverInputAudioBufferSize: int = 1024 * 24 - # serverOutputAudioBufferSize: int = 1024 * 24 - serverInputDeviceId: int = -1 - serverOutputDeviceId: int = -1 - serverReadChunkSize: int = 256 - serverInputAudioGain: float = 1.0 - serverOutputAudioGain: float = 1.0 performance: list[int] = field(default_factory=lambda: [0, 0, 0, 0]) # ↓mutableな物だけ列挙 @@ -68,23 +48,12 @@ class VoiceChangerSettings: "inputSampleRate", "crossFadeOverlapSize", "recordIO", - "enableServerAudio", - "serverAudioStated", - "serverInputAudioSampleRate", - "serverOutputAudioSampleRate", - # "serverInputAudioBufferSize", - # "serverOutputAudioBufferSize", - "serverInputDeviceId", - "serverOutputDeviceId", - "serverReadChunkSize", ] ) floatData: list[str] = field( default_factory=lambda: [ "crossFadeOffsetRate", "crossFadeEndRate", - "serverInputAudioGain", - "serverOutputAudioGain", ] ) strData: list[str] = field(default_factory=lambda: []) @@ -101,120 +70,6 @@ class VoiceChanger: emitTo = None - def audio_callback(self, indata: np.ndarray, outdata: np.ndarray, frames, times, status): - try: - indata = indata * self.settings.serverInputAudioGain - with Timer("all_inference_time") as t: - unpackedData = librosa.to_mono(indata.T) * 32768.0 - out_wav, times = self.on_request(unpackedData) - outputChunnels = outdata.shape[1] - outdata[:] = np.repeat(out_wav, outputChunnels).reshape(-1, outputChunnels) / 32768.0 - outdata[:] = outdata * self.settings.serverOutputAudioGain - all_inference_time = t.secs - performance = [all_inference_time] + times - if self.emitTo is not None: - self.emitTo(performance) - self.settings.performance = [round(x * 1000) for x in performance] - except Exception as e: - print("[Voice Changer] ex:", e) - - def getServerAudioDevice(self, audioDeviceList: list[ServerAudioDevice], index: int): - serverAudioDevice = [x for x in audioDeviceList if x.index == index] - if len(serverAudioDevice) > 0: - return serverAudioDevice[0] - else: - return None - - def serverLocal(self, _vc): - vc: VoiceChanger = _vc - - currentInputDeviceId = -1 - currentModelSamplingRate = -1 - currentOutputDeviceId = -1 - currentInputChunkNum = -1 - while True: - if vc.settings.serverAudioStated == 0 or vc.settings.serverInputDeviceId == -1 or vc.voiceChanger is None: - vc.settings.inputSampleRate = 48000 - time.sleep(2) - else: - sd._terminate() - sd._initialize() - - sd.default.device[0] = vc.settings.serverInputDeviceId - currentInputDeviceId = vc.settings.serverInputDeviceId - sd.default.device[1] = vc.settings.serverOutputDeviceId - currentOutputDeviceId = vc.settings.serverOutputDeviceId - - currentInputChannelNum = vc.settings.serverAudioInputDevices - - serverInputAudioDevice = self.getServerAudioDevice(vc.settings.serverAudioInputDevices, currentInputDeviceId) - serverOutputAudioDevice = self.getServerAudioDevice(vc.settings.serverAudioOutputDevices, currentOutputDeviceId) - print(serverInputAudioDevice, serverOutputAudioDevice) - if serverInputAudioDevice is None or serverOutputAudioDevice is None: - time.sleep(2) - print("serverInputAudioDevice or serverOutputAudioDevice is None") - continue 
- - currentInputChannelNum = serverInputAudioDevice.maxInputChannels - currentOutputChannelNum = serverOutputAudioDevice.maxOutputChannels - - currentInputChunkNum = vc.settings.serverReadChunkSize - block_frame = currentInputChunkNum * 128 - - # sample rate precheck(alsa cannot use 40000?) - try: - currentModelSamplingRate = self.voiceChanger.get_processing_sampling_rate() - except Exception as e: - print("[Voice Changer] ex: get_processing_sampling_rate", e) - continue - try: - with sd.Stream( - callback=self.audio_callback, - blocksize=block_frame, - samplerate=currentModelSamplingRate, - dtype="float32", - channels=[currentInputChannelNum, currentOutputChannelNum], - ): - pass - vc.settings.serverInputAudioSampleRate = currentModelSamplingRate - vc.settings.inputSampleRate = currentModelSamplingRate - print(f"[Voice Changer] sample rate {vc.settings.serverInputAudioSampleRate}") - except Exception as e: - print( - "[Voice Changer] ex: fallback to device default samplerate", - e, - ) - vc.settings.serverInputAudioSampleRate = serverInputAudioDevice.default_samplerate - vc.settings.inputSampleRate = vc.settings.serverInputAudioSampleRate - - # main loop - try: - with sd.Stream( - callback=self.audio_callback, - blocksize=block_frame, - samplerate=vc.settings.serverInputAudioSampleRate, - dtype="float32", - channels=[currentInputChannelNum, currentOutputChannelNum], - ): - while vc.settings.serverAudioStated == 1 and currentInputDeviceId == vc.settings.serverInputDeviceId and currentOutputDeviceId == vc.settings.serverOutputDeviceId and currentModelSamplingRate == self.voiceChanger.get_processing_sampling_rate() and currentInputChunkNum == vc.settings.serverReadChunkSize: - time.sleep(2) - print( - "[Voice Changer] server audio", - self.settings.performance, - ) - print( - "[Voice Changer] info:", - vc.settings.serverAudioStated, - currentInputDeviceId, - currentOutputDeviceId, - vc.settings.serverInputAudioSampleRate, - currentInputChunkNum, - ) - - except Exception as e: - print("[Voice Changer] ex:", e) - time.sleep(2) - def __init__(self, params: VoiceChangerParams): # 初期化 self.settings = VoiceChangerSettings() @@ -231,12 +86,6 @@ class VoiceChanger: self.prev_audio = np.zeros(4096) self.mps_enabled: bool = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available() - audioinput, audiooutput = list_audio_device() - self.settings.serverAudioInputDevices = audioinput - self.settings.serverAudioOutputDevices = audiooutput - - thread = threading.Thread(target=self.serverLocal, args=(self,)) - thread.start() print(f"VoiceChanger Initialized (GPU_NUM:{self.gpu_num}, mps_enabled:{self.mps_enabled})") def switchModelType(self, modelType: ModelType): @@ -375,6 +224,12 @@ class VoiceChanger: if hasattr(self, "sola_buffer") is True: del self.sola_buffer + def get_processing_sampling_rate(self): + if self.voiceChanger is None: + return 0 + else: + return self.voiceChanger.get_processing_sampling_rate() + # receivedData: tuple of short def on_request(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]: return self.on_request_sola(receivedData) diff --git a/server/voice_changer/VoiceChangerManager.py b/server/voice_changer/VoiceChangerManager.py index cc4b7d57..322332f4 100644 --- a/server/voice_changer/VoiceChangerManager.py +++ b/server/voice_changer/VoiceChangerManager.py @@ -1,4 +1,5 @@ import numpy as np +from voice_changer.Local.ServerDevice import ServerDevice, ServerDeviceCallbacks from voice_changer.VoiceChanger import VoiceChanger from 
const import ModelType from voice_changer.utils.LoadModelParams import LoadModelParams @@ -6,6 +7,7 @@ from voice_changer.utils.VoiceChangerModel import AudioInOut from voice_changer.utils.VoiceChangerParams import VoiceChangerParams from dataclasses import dataclass, asdict import torch +import threading @dataclass() @@ -22,15 +24,38 @@ class VoiceChangerManagerSettings: # intData: list[str] = field(default_factory=lambda: ["slotIndex"]) -class VoiceChangerManager(object): +class VoiceChangerManager(ServerDeviceCallbacks): _instance = None + ############################ + # ServerDeviceCallbacks + ############################ + def on_request(self, unpackedData: AudioInOut): + return self.changeVoice(unpackedData) + + def emitTo(self, performance: list[float]): + print("emit ", performance) + + def get_processing_sampling_rate(self): + return self.voiceChanger.get_processing_sampling_rate() + + def setSamplingRate(self, sr: int): + self.voiceChanger.settings.inputSampleRate = sr + + ############################ + # VoiceChangerManager + ############################ def __init__(self, params: VoiceChangerParams): self.voiceChanger: VoiceChanger = None self.settings: VoiceChangerManagerSettings = VoiceChangerManagerSettings(dummy=0) # スタティックな情報を収集 self.gpus: list[GPUInfo] = self._get_gpuInfos() + self.serverDevice = ServerDevice(self) + + thread = threading.Thread(target=self.serverDevice.start, args=()) + thread.start() + def _get_gpuInfos(self): devCount = torch.cuda.device_count() gpus = [] @@ -62,6 +87,9 @@ class VoiceChangerManager(object): data["status"] = "OK" + info = self.serverDevice.get_info() + data.update(info) + if hasattr(self, "voiceChanger"): info = self.voiceChanger.get_info() data.update(info) @@ -77,6 +105,7 @@ class VoiceChangerManager(object): return {"status": "ERROR", "msg": "no model loaded"} def update_settings(self, key: str, val: str | int | float): + self.serverDevice.update_settings(key, val) if hasattr(self, "voiceChanger"): self.voiceChanger.update_settings(key, val) else:
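# A minimal usage sketch, assuming an already constructed VoiceChangerManager
# instance named `manager`; the device indices and chunk size below are
# placeholders, not values from this repository. It shows how the new
# ServerDevice settings are reached through the update_settings delegation
# added in this patch.
#
# manager = VoiceChangerManager(params)  # params: VoiceChangerParams
manager.update_settings("serverInputDeviceId", 1)    # placeholder input device index
manager.update_settings("serverOutputDeviceId", 3)   # placeholder output device index
manager.update_settings("serverReadChunkSize", 256)  # block size becomes 256 * 128 = 32768 frames
manager.update_settings("serverAudioStated", 1)      # the background ServerDevice.start() loop opens the stream once this is 1
print(manager.get_info())                            # merged info now includes ServerDevice settings and device lists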