WIP: integrate vcs to new gui 2

wataru 2023-06-20 06:39:39 +09:00
parent b453e5fd85
commit b6996a15fe
12 changed files with 251 additions and 153 deletions

File diff suppressed because one or more lines are too long

View File

@@ -1,6 +1,6 @@
 import React, { useEffect, useMemo, useState } from "react";
 import { useAppState } from "../../001_provider/001_AppStateProvider";
-import { FileUploadSetting, InitialFileUploadSetting, ModelFileKind, ModelUploadSetting, VoiceChangerType, fileSelector } from "@dannadori/voice-changer-client-js";
+import { ModelFileKind, ModelUploadSetting, VoiceChangerType, fileSelector } from "@dannadori/voice-changer-client-js";
 import { useMessageBuilder } from "../../hooks/useMessageBuilder";
 import { ModelSlotManagerDialogScreen } from "./904_ModelSlotManagerDialog";
 import { checkExtention, trimfileName } from "../../utils/utils";
@@ -54,15 +54,31 @@ export const FileUploaderScreen = (props: FileUploaderScreenProps) => {
     const checkModelSetting = (setting: ModelUploadSetting) => {
         if (setting.voiceChangerType == "RVC") {
-            // const enough = !!setting.files.find(x => { return x.kind == "rvcModel" }) &&
-            //     !!setting.files.find(x => { return x.kind == "rvcIndex" })
-            // return enough
             const enough = !!setting.files.find(x => { return x.kind == "rvcModel" })
             return enough
+        } else if (setting.voiceChangerType == "MMVCv13") {
+            const enough = !!setting.files.find(x => { return x.kind == "mmvcv13Model" }) &&
+                !!setting.files.find(x => { return x.kind == "mmvcv13Config" })
+            return enough
+        } else if (setting.voiceChangerType == "MMVCv15") {
+            const enough = !!setting.files.find(x => { return x.kind == "mmvcv15Model" }) &&
+                !!setting.files.find(x => { return x.kind == "mmvcv15Config" })
+            return enough
+        } else if (setting.voiceChangerType == "so-vits-svc-40") {
+            const enough = !!setting.files.find(x => { return x.kind == "soVitsSvc40Config" }) &&
+                !!setting.files.find(x => { return x.kind == "soVitsSvc40Model" })
+            return enough
+        } else if (setting.voiceChangerType == "DDSP-SVC") {
+            const enough = !!setting.files.find(x => { return x.kind == "ddspSvcModel" }) &&
+                !!setting.files.find(x => { return x.kind == "ddspSvcModelConfig" }) &&
+                !!setting.files.find(x => { return x.kind == "ddspSvcDiffusion" }) &&
+                !!setting.files.find(x => { return x.kind == "ddspSvcDiffusionConfig" })
+            return enough
         }
+        return false
     }
-    const generateFileRow = (setting: ModelUploadSetting, title: string, kind: ModelFileKind, ext: string[]) => {
+    const generateFileRow = (setting: ModelUploadSetting, title: string, kind: ModelFileKind, ext: string[], dir: string = "") => {
         const selectedFile = setting.files.find(x => { return x.kind == kind })
         const selectedFilename = selectedFile?.file.name || ""
         return (
@@ -81,7 +97,7 @@ export const FileUploaderScreen = (props: FileUploaderScreenProps) => {
                 if (selectedFile) {
                     selectedFile.file = file
                 } else {
-                    setting.files.push({ kind: kind, file: file })
+                    setting.files.push({ kind: kind, file: file, dir: dir })
                 }
                 setUploadSetting({ ...setting })
             }}>
@@ -96,6 +112,21 @@ export const FileUploaderScreen = (props: FileUploaderScreenProps) => {
         if (vcType == "RVC") {
             rows.push(generateFileRow(uploadSetting!, "Model", "rvcModel", ["pth", "onnx"]))
             rows.push(generateFileRow(uploadSetting!, "Index", "rvcIndex", ["index", "bin"]))
+        } else if (vcType == "MMVCv13") {
+            rows.push(generateFileRow(uploadSetting!, "Config", "mmvcv13Config", ["json"]))
+            rows.push(generateFileRow(uploadSetting!, "Model", "mmvcv13Model", ["pth", "onnx"]))
+        } else if (vcType == "MMVCv15") {
+            rows.push(generateFileRow(uploadSetting!, "Config", "mmvcv15Config", ["json"]))
+            rows.push(generateFileRow(uploadSetting!, "Model", "mmvcv15Model", ["pth", "onnx"]))
+        } else if (vcType == "so-vits-svc-40") {
+            rows.push(generateFileRow(uploadSetting!, "Config", "soVitsSvc40Config", ["json"]))
+            rows.push(generateFileRow(uploadSetting!, "Model", "soVitsSvc40Model", ["pth"]))
+            rows.push(generateFileRow(uploadSetting!, "Cluster", "soVitsSvc40Cluster", ["pth", "pt"]))
+        } else if (vcType == "DDSP-SVC") {
+            rows.push(generateFileRow(uploadSetting!, "Config", "ddspSvcModelConfig", ["yaml"], "model/"))
+            rows.push(generateFileRow(uploadSetting!, "Model", "ddspSvcModel", ["pth", "pt"], "model/"))
+            rows.push(generateFileRow(uploadSetting!, "Config(diff)", "ddspSvcDiffusionConfig", ["yaml"], "diff/"))
+            rows.push(generateFileRow(uploadSetting!, "Model(diff)", "ddspSvcDiffusion", ["pth", "pt"], "diff/"))
         }
         return rows
     }
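
The branches above encode which file kinds each voice changer type must supply before upload is allowed. Restated as data, a minimal Python sketch (illustrative only, not part of this commit; optional kinds such as rvcIndex and soVitsSvc40Cluster are omitted on purpose):

# Required file kinds per voice changer type, as checked in checkModelSetting above.
REQUIRED_KINDS = {
    "RVC": ["rvcModel"],
    "MMVCv13": ["mmvcv13Model", "mmvcv13Config"],
    "MMVCv15": ["mmvcv15Model", "mmvcv15Config"],
    "so-vits-svc-40": ["soVitsSvc40Config", "soVitsSvc40Model"],
    "DDSP-SVC": ["ddspSvcModel", "ddspSvcModelConfig", "ddspSvcDiffusion", "ddspSvcDiffusionConfig"],
}

def check_model_setting(vc_type: str, uploaded_kinds: set) -> bool:
    """True when every required kind for vc_type has been selected."""
    return all(kind in uploaded_kinds for kind in REQUIRED_KINDS.get(vc_type, []))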

View File

@@ -57,13 +57,13 @@ export class ServerConfigurator {
         return info
     }

-    uploadFile2 = async (file: File, onprogress: (progress: number, end: boolean) => void) => {
+    uploadFile2 = async (dir: string, file: File, onprogress: (progress: number, end: boolean) => void) => {
         const url = this.serverUrl + "/upload_file"
         onprogress(0, false)

         const size = 1024 * 1024;
         let index = 0; // index value
         const fileLength = file.size
-        const filename = file.name
+        const filename = dir + file.name
         const fileChunkNum = Math.ceil(fileLength / size)
         while (true) {
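
uploadFile2 streams the file in 1 MiB chunks and now prefixes the server-side name with dir, so a model's files keep their subdirectory (for example model/ or diff/ for DDSP-SVC). A minimal Python sketch of the same naming and chunk-count arithmetic (illustrative; the real client is the TypeScript above):

import math
import os

CHUNK_SIZE = 1024 * 1024  # 1 MiB, matching `size` above

def chunk_plan(path: str, dir_prefix: str):
    """Return the server-side name and chunk count for one upload."""
    file_length = os.path.getsize(path)
    filename = dir_prefix + os.path.basename(path)   # dir + file.name
    chunk_num = math.ceil(file_length / CHUNK_SIZE)  # fileChunkNum
    return filename, chunk_num

# e.g. chunk_plan("diffusion.pt", "diff/") -> ("diff/diffusion.pt", 42) for a ~42 MiB file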

View File

@@ -290,8 +290,8 @@ export class VoiceChangerClient {
     uploadFile = (buf: ArrayBuffer, filename: string, onprogress: (progress: number, end: boolean) => void) => {
         return this.configurator.uploadFile(buf, filename, onprogress)
     }
-    uploadFile2 = (file: File, onprogress: (progress: number, end: boolean) => void) => {
-        return this.configurator.uploadFile2(file, onprogress)
+    uploadFile2 = (dir: string, file: File, onprogress: (progress: number, end: boolean) => void) => {
+        return this.configurator.uploadFile2(dir, file, onprogress)
     }
     concatUploadedFile = (filename: string, chunkNum: number) => {
         return this.configurator.concatUploadedFile(filename, chunkNum)

View File

@@ -41,6 +41,7 @@ export type ModelFileKind = typeof ModelFileKind[keyof typeof ModelFileKind]
 export type ModelFile = {
     file: File,
     kind: ModelFileKind
+    dir: string
 }

 export type ModelUploadSetting = {
@@ -296,7 +297,7 @@ export const useServerSetting = (props: UseServerSettingProps): ServerSettingSta
         if (!props.voiceChangerClient) return
         console.log("uploading..1.", file)
         console.log("uploading..2.", file.name)
-        const num = await props.voiceChangerClient.uploadFile2(file, onprogress)
+        const num = await props.voiceChangerClient.uploadFile2(dir, file, onprogress)
         const res = await props.voiceChangerClient.concatUploadedFile(dir + file.name, num)
         console.log("uploaded", num, res)
     }
@@ -319,11 +320,11 @@ export const useServerSetting = (props: UseServerSettingProps): ServerSettingSta
                 const progOffset = 100 * i * progRate
                 await _uploadFile2(setting.files[i].file, (progress: number, _end: boolean) => {
                     setUploadProgress(progress * progRate + progOffset)
-                })
+                }, setting.files[i].dir)
             }
         }

         const params: ModelUploadSettingForServer = {
-            ...setting, files: setting.files.map((f) => { return { name: f.file.name, kind: f.kind } })
+            ...setting, files: setting.files.map((f) => { return { name: f.file.name, kind: f.kind, dir: f.dir } })
         }
         const loadPromise = props.voiceChangerClient.loadModel(
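
Each file's 0..100 progress is folded into one overall figure via progRate and progOffset. Assuming progRate = 1 / fileCount (defined outside this hunk), the arithmetic is:

def overall_progress(file_index: int, file_progress: float, file_count: int) -> float:
    """Map one file's 0..100 progress onto the 0..100 total."""
    prog_rate = 1 / file_count             # assumed; set outside the hunk shown
    prog_offset = 100 * file_index * prog_rate
    return file_progress * prog_rate + prog_offset

# overall_progress(1, 50, 4) == 37.5: halfway through the second of four files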

View File

@@ -38,7 +38,72 @@ class RVCModelSlot(ModelSlot):
     iconFile: str = ""


-ModelSlots: TypeAlias = Union[ModelSlot, RVCModelSlot]
+@dataclass
+class MMVCv13ModelSlot(ModelSlot):
+    voiceChangerType: VoiceChangerType = "MMVCv13"
+    modelFile: str = ""
+    configFile: str = ""
+    srcId: int = 107
+    dstId: int = 100
+    isONNX: bool = False
+    samplingRate: int = 24000
+    name: str = ""
+    description: str = ""
+    iconFile: str = ""
+
+
+@dataclass
+class MMVCv15ModelSlot(ModelSlot):
+    voiceChangerType: VoiceChangerType = "MMVCv15"
+    modelFile: str = ""
+    configFile: str = ""
+    srcId: int = 0
+    dstId: int = 101
+    isONNX: bool = False
+    samplingRate: int = 24000
+    name: str = ""
+    description: str = ""
+    iconFile: str = ""
+
+
+@dataclass
+class SoVitsSvc40ModelSlot(ModelSlot):
+    voiceChangerType: VoiceChangerType = "so-vits-svc-40"
+    modelFile: str = ""
+    configFile: str = ""
+    clusterFile: str = ""
+    dstId: int = 0
+    isONNX: bool = False
+    name: str = ""
+    description: str = ""
+    credit: str = ""
+    termsOfUseUrl: str = ""
+    sampleId: str = ""
+    iconFile: str = ""
+
+
+@dataclass
+class DDSPSVCModelSlot(ModelSlot):
+    voiceChangerType: VoiceChangerType = "DDSP-SVC"
+    modelFile: str = ""
+    configFile: str = ""
+    diffModelFile: str = ""
+    diffConfigFile: str = ""
+    dstId: int = 0
+    isONNX: bool = False
+    name: str = ""
+    description: str = ""
+    credit: str = ""
+    termsOfUseUrl: str = ""
+    sampleId: str = ""
+    iconFile: str = ""
+
+
+ModelSlots: TypeAlias = Union[ModelSlot, RVCModelSlot, MMVCv13ModelSlot, MMVCv15ModelSlot, SoVitsSvc40ModelSlot, DDSPSVCModelSlot]


 def loadSlotInfo(model_dir: str, slotIndex: int) -> ModelSlots:
@@ -50,6 +115,14 @@ def loadSlotInfo(model_dir: str, slotIndex: int) -> ModelSlots:
     slotInfo = ModelSlot(**{k: v for k, v in jsonDict.items() if k in ModelSlot.__annotations__})
     if slotInfo.voiceChangerType == "RVC":
         return RVCModelSlot(**jsonDict)
+    elif slotInfo.voiceChangerType == "MMVCv13":
+        return MMVCv13ModelSlot(**jsonDict)
+    elif slotInfo.voiceChangerType == "MMVCv15":
+        return MMVCv15ModelSlot(**jsonDict)
+    elif slotInfo.voiceChangerType == "so-vits-svc-40":
+        return SoVitsSvc40ModelSlot(**jsonDict)
+    elif slotInfo.voiceChangerType == "DDSP-SVC":
+        return DDSPSVCModelSlot(**jsonDict)
     else:
         return ModelSlot()
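
loadSlotInfo first parses only the shared ModelSlot fields, then re-parses the same JSON with the concrete dataclass selected by voiceChangerType. A hedged round-trip sketch (the on-disk file name params.json is an assumption, not shown in this hunk):

import json
import os
from dataclasses import asdict

from data.ModelSlot import DDSPSVCModelSlot, loadSlotInfo

slot = DDSPSVCModelSlot(modelFile="model/model.pt", diffModelFile="diff/diffusion.pt")
slot_dir = os.path.join("model_dir", "3")
os.makedirs(slot_dir, exist_ok=True)
with open(os.path.join(slot_dir, "params.json"), "w") as f:  # file name assumed
    json.dump(asdict(slot), f)

revived = loadSlotInfo("model_dir", 3)  # -> DDSPSVCModelSlot(...), if the layout assumption holds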

View File

@@ -3,6 +3,7 @@ import os
 from dataclasses import asdict

 import numpy as np
 import torch
+from data.ModelSlot import DDSPSVCModelSlot
 from voice_changer.DDSP_SVC.ModelSlot import ModelSlot
 from voice_changer.DDSP_SVC.deviceManager.DeviceManager import DeviceManager
@@ -21,7 +22,7 @@ from diffusion.infer_gt_mel import DiffGtMel  # type: ignore
 from voice_changer.utils.VoiceChangerModel import AudioInOut
 from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
-from voice_changer.utils.LoadModelParams import LoadModelParams
+from voice_changer.utils.LoadModelParams import LoadModelParams, LoadModelParams2
 from voice_changer.DDSP_SVC.DDSP_SVCSetting import DDSP_SVCSettings
 from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
@@ -44,11 +45,7 @@ def phase_vocoder(a, b, fade_out, fade_in):
     deltaphase = deltaphase - 2 * np.pi * torch.floor(deltaphase / 2 / np.pi + 0.5)
     w = 2 * np.pi * torch.arange(n // 2 + 1).to(a) + deltaphase
     t = torch.arange(n).unsqueeze(-1).to(a) / n
-    result = (
-        a * (fade_out**2)
-        + b * (fade_in**2)
-        + torch.sum(absab * torch.cos(w * t + phia), -1) * fade_out * fade_in / n
-    )
+    result = a * (fade_out**2) + b * (fade_in**2) + torch.sum(absab * torch.cos(w * t + phia), -1) * fade_out * fade_in / n
     return result
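
Reflowed onto one line, the crossfade is easier to read as math. With fade envelopes $f_{\mathrm{out}}$, $f_{\mathrm{in}}$ and with absab, phia computed from the STFTs of a and b earlier in the function (not shown in this hunk):

$$\mathrm{result} = a\,f_{\mathrm{out}}^{2} + b\,f_{\mathrm{in}}^{2} + \frac{f_{\mathrm{out}}\,f_{\mathrm{in}}}{n}\sum_{k} \mathrm{absab}_{k}\,\cos(w_{k} t + \mathrm{phia}_{k})$$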
@@ -102,9 +99,7 @@ class DDSP_SVC:
     def reloadModel(self):
         self.device = self.deviceManager.getDevice(self.settings.gpu)
         modelFile = self.settings.modelSlots[self.settings.modelSlotIndex].modelFile
-        diffusionFile = self.settings.modelSlots[
-            self.settings.modelSlotIndex
-        ].diffusionFile
+        diffusionFile = self.settings.modelSlots[self.settings.modelSlotIndex].diffusionFile

         self.svc_model = SvcDDSP()
         self.svc_model.setVCParams(self.params)
@@ -144,15 +139,11 @@ class DDSP_SVC:
         # newData = newData.astype(np.float32)

         if self.audio_buffer is not None:
-            self.audio_buffer = np.concatenate(
-                [self.audio_buffer, newData], 0
-            )  # concatenate with past data
+            self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)  # concatenate with past data
         else:
             self.audio_buffer = newData

-        convertSize = (
-            inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
-        )
+        convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize

         # if convertSize % self.hop_size != 0:  # pad because truncation occurs at the model's output hop size
         #     convertSize = convertSize + (self.hop_size - (convertSize % self.hop_size))
@@ -187,8 +178,7 @@ class DDSP_SVC:
             f0_min=50,
             f0_max=1100,
             # safe_prefix_pad_length=0,  # TBD what is this?
-            safe_prefix_pad_length=self.settings.extraConvertSize
-            / self.svc_model.args.data.sampling_rate,
+            safe_prefix_pad_length=self.settings.extraConvertSize / self.svc_model.args.data.sampling_rate,
             diff_model=self.diff_model,
             diff_acc=self.settings.diffAcc,  # TBD what is this?
             diff_spk_id=self.settings.diffSpkId,
@@ -196,9 +186,7 @@ class DDSP_SVC:
             # diff_use_dpm=True if self.settings.useDiffDpm == 1 else False,  # TBD what is this?
             method=self.settings.diffMethod,
             k_step=self.settings.kStep,  # TBD what is this?
-            diff_silence=True
-            if self.settings.useDiffSilence == 1
-            else False,  # TBD what is this?
+            diff_silence=True if self.settings.useDiffSilence == 1 else False,  # TBD what is this?
         )
         return _audio.cpu().numpy() * 32768.0
@@ -210,9 +198,21 @@ class DDSP_SVC:
         audio = self._pyTorch_inference(data)
         return audio

-    # def destroy(self):
-    #     del self.net_g
-    #     del self.onnx_session
+    @classmethod
+    def loadModel2(cls, props: LoadModelParams2):
+        slotInfo: DDSPSVCModelSlot = DDSPSVCModelSlot()
+        for file in props.files:
+            if file.kind == "ddspSvcModelConfig":
+                slotInfo.configFile = file.name
+            elif file.kind == "ddspSvcModel":
+                slotInfo.modelFile = file.name
+            elif file.kind == "ddspSvcDiffusionConfig":
+                slotInfo.diffConfigFile = file.name
+            elif file.kind == "ddspSvcDiffusion":
+                slotInfo.diffModelFile = file.name
+        slotInfo.isONNX = slotInfo.modelFile.endswith(".onnx")
+        slotInfo.name = os.path.splitext(os.path.basename(slotInfo.modelFile))[0]
+        return slotInfo

     def __del__(self):
         del self.net_g
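
loadModel2 has the same shape in every voice changer in this commit: map each uploaded file's kind onto a slot field, then derive isONNX and name from the model file. A generic restatement of the DDSP-SVC mapping (illustrative sketch, not the project's code):

import os

# kind -> slot field, mirroring the branches above
DDSP_FIELD_BY_KIND = {
    "ddspSvcModelConfig": "configFile",
    "ddspSvcModel": "modelFile",
    "ddspSvcDiffusionConfig": "diffConfigFile",
    "ddspSvcDiffusion": "diffModelFile",
}

def fill_slot(slot, files):
    """files: objects with .kind and .name, e.g. LoadModelParamFile."""
    for f in files:
        field = DDSP_FIELD_BY_KIND.get(f.kind)
        if field is not None:
            setattr(slot, field, f.name)
    slot.isONNX = slot.modelFile.endswith(".onnx")
    slot.name = os.path.splitext(os.path.basename(slot.modelFile))[0]
    return slot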

View File

@@ -1,7 +1,8 @@
 import sys
 import os
-from voice_changer.utils.LoadModelParams import LoadModelParams
+from data.ModelSlot import MMVCv13ModelSlot
+from voice_changer.utils.LoadModelParams import LoadModelParams, LoadModelParams2
 from voice_changer.utils.VoiceChangerModel import AudioInOut

 if sys.platform.startswith("darwin"):
@@ -77,13 +78,7 @@ class MMVCv13:
         # create the PyTorch model
         if self.settings.pyTorchModelFile is not None:
-            self.net_g = SynthesizerTrn(
-                len(symbols),
-                self.hps.data.filter_length // 2 + 1,
-                self.hps.train.segment_size // self.hps.data.hop_length,
-                n_speakers=self.hps.data.n_speakers,
-                **self.hps.model
-            )
+            self.net_g = SynthesizerTrn(len(symbols), self.hps.data.filter_length // 2 + 1, self.hps.train.segment_size // self.hps.data.hop_length, n_speakers=self.hps.data.n_speakers, **self.hps.model)
             self.net_g.eval()
             load_checkpoint(self.settings.pyTorchModelFile, self.net_g, None)
@@ -154,9 +149,7 @@ class MMVCv13:
     def get_info(self):
         data = asdict(self.settings)
-        data["onnxExecutionProviders"] = (
-            self.onnx_session.get_providers() if self.onnx_session is not None else []
-        )
+        data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.onnx_session is not None else []
         files = ["configFile", "pyTorchModelFile", "onnxModelFile"]
         for f in files:
             if data[f] is not None and os.path.exists(data[f]):
@@ -193,9 +186,7 @@ class MMVCv13:
         newData = newData.astype(np.float32) / self.hps.data.max_wav_value

         if self.audio_buffer is not None:
-            self.audio_buffer = np.concatenate(
-                [self.audio_buffer, newData], 0
-            )  # concatenate with past data
+            self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)  # concatenate with past data
         else:
             self.audio_buffer = newData
@@ -204,9 +195,7 @@ class MMVCv13:
         # if convertSize < 8192:
         #     convertSize = 8192
         if convertSize % self.hps.data.hop_length != 0:  # pad because truncation occurs at the model's output hop size
-            convertSize = convertSize + (
-                self.hps.data.hop_length - (convertSize % self.hps.data.hop_length)
-            )
+            convertSize = convertSize + (self.hps.data.hop_length - (convertSize % self.hps.data.hop_length))

         convertOffset = -1 * convertSize
         self.audio_buffer = self.audio_buffer[convertOffset:]  # extract only the portion to convert
@@ -238,7 +227,9 @@ class MMVCv13:
                     "sid_src": sid_src.numpy(),
                     "sid_tgt": sid_tgt1.numpy(),
                 },
-            )[0][0, 0]
+            )[
+                0
+            ][0, 0]
             * self.hps.data.max_wav_value
         )
         return audio1
@@ -254,19 +245,10 @@ class MMVCv13:
             dev = torch.device("cuda", index=self.settings.gpu)

         with torch.no_grad():
-            x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [
-                x.to(dev) for x in data
-            ]
+            x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.to(dev) for x in data]
             sid_target = torch.LongTensor([self.settings.dstId]).to(dev)
-            audio1 = (
-                self.net_g.to(dev)
-                .voice_conversion(
-                    spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_target
-                )[0, 0]
-                .data
-                * self.hps.data.max_wav_value
-            )
+            audio1 = self.net_g.to(dev).voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_target)[0, 0].data * self.hps.data.max_wav_value
             result = audio1.float().cpu().numpy()
         return result
@@ -278,6 +260,18 @@ class MMVCv13:
             audio = self._pyTorch_inference(data)
         return audio

+    @classmethod
+    def loadModel2(cls, props: LoadModelParams2):
+        slotInfo: MMVCv13ModelSlot = MMVCv13ModelSlot()
+        for file in props.files:
+            if file.kind == "mmvcv13Model":
+                slotInfo.modelFile = file.name
+            elif file.kind == "mmvcv13Config":
+                slotInfo.configFile = file.name
+        slotInfo.isONNX = slotInfo.modelFile.endswith(".onnx")
+        slotInfo.name = os.path.splitext(os.path.basename(slotInfo.modelFile))[0]
+        return slotInfo
+
     def __del__(self):
         del self.net_g
         del self.onnx_session

View File

@@ -1,7 +1,8 @@
 import sys
 import os
-from voice_changer.utils.LoadModelParams import LoadModelParams
+from data.ModelSlot import MMVCv15ModelSlot
+from voice_changer.utils.LoadModelParams import LoadModelParams, LoadModelParams2
 from voice_changer.utils.VoiceChangerModel import AudioInOut

 if sys.platform.startswith("darwin"):
@@ -172,12 +173,7 @@ class MMVCv15:
     def get_info(self):
         data = asdict(self.settings)
-        data["onnxExecutionProviders"] = (
-            self.onnx_session.get_providers()
-            if self.settings.onnxModelFile != ""
-            and self.settings.onnxModelFile is not None
-            else []
-        )
+        data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.settings.onnxModelFile != "" and self.settings.onnxModelFile is not None else []
         files = ["configFile", "pyTorchModelFile", "onnxModelFile"]
         for f in files:
             if data[f] is not None and os.path.exists(data[f]):
@@ -195,9 +191,7 @@ class MMVCv15:
     def _get_f0(self, detector: str, newData: AudioInOut):
         audio_norm_np = newData.astype(np.float64)
         if detector == "dio":
-            _f0, _time = pw.dio(
-                audio_norm_np, self.hps.data.sampling_rate, frame_period=5.5
-            )
+            _f0, _time = pw.dio(audio_norm_np, self.hps.data.sampling_rate, frame_period=5.5)
             f0 = pw.stonemask(audio_norm_np, _f0, _time, self.hps.data.sampling_rate)
         else:
             f0, t = pw.harvest(
@@ -207,9 +201,7 @@ class MMVCv15:
                 f0_floor=71.0,
                 f0_ceil=1000.0,
             )
-        f0 = convert_continuos_f0(
-            f0, int(audio_norm_np.shape[0] / self.hps.data.hop_length)
-        )
+        f0 = convert_continuos_f0(f0, int(audio_norm_np.shape[0] / self.hps.data.hop_length))
         f0 = torch.from_numpy(f0.astype(np.float32))
         return f0
@@ -237,9 +229,7 @@ class MMVCv15:
         newData = newData.astype(np.float32) / self.hps.data.max_wav_value

         if self.audio_buffer is not None:
-            self.audio_buffer = np.concatenate(
-                [self.audio_buffer, newData], 0
-            )  # concatenate with past data
+            self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)  # concatenate with past data
         else:
             self.audio_buffer = newData
@@ -248,9 +238,7 @@ class MMVCv15:
         # if convertSize < 8192:
         #     convertSize = 8192
         if convertSize % self.hps.data.hop_length != 0:  # pad because truncation occurs at the model's output hop size
-            convertSize = convertSize + (
-                self.hps.data.hop_length - (convertSize % self.hps.data.hop_length)
-            )
+            convertSize = convertSize + (self.hps.data.hop_length - (convertSize % self.hps.data.hop_length))

         # ONNX is fixed length
         if self.settings.framework == "ONNX":
@@ -290,16 +278,15 @@ class MMVCv15:
                     "sid_src": sid_src.numpy(),
                     "sid_tgt": sid_tgt1.numpy(),
                 },
-            )[0][0, 0]
+            )[
+                0
+            ][0, 0]
             * self.hps.data.max_wav_value
         )
         return audio1

     def _pyTorch_inference(self, data):
-        if (
-            self.settings.pyTorchModelFile == ""
-            or self.settings.pyTorchModelFile is None
-        ):
+        if self.settings.pyTorchModelFile == "" or self.settings.pyTorchModelFile is None:
             print("[Voice Changer] No pyTorch session.")
             raise NoModeLoadedException("pytorch")
@@ -316,12 +303,7 @@ class MMVCv15:
             sid_src = sid_src.to(dev)
             sid_target = torch.LongTensor([self.settings.dstId]).to(dev)
-            audio1 = (
-                self.net_g.to(dev)
-                .voice_conversion(spec, spec_lengths, f0, sid_src, sid_target)[0, 0]
-                .data
-                * self.hps.data.max_wav_value
-            )
+            audio1 = self.net_g.to(dev).voice_conversion(spec, spec_lengths, f0, sid_src, sid_target)[0, 0].data * self.hps.data.max_wav_value
             result = audio1.float().cpu().numpy()
         return result
@@ -336,6 +318,18 @@ class MMVCv15:
             print(_e)
             raise ONNXInputArgumentException()

+    @classmethod
+    def loadModel2(cls, props: LoadModelParams2):
+        slotInfo: MMVCv15ModelSlot = MMVCv15ModelSlot()
+        for file in props.files:
+            if file.kind == "mmvcv15Model":
+                slotInfo.modelFile = file.name
+            elif file.kind == "mmvcv15Config":
+                slotInfo.configFile = file.name
+        slotInfo.isONNX = slotInfo.modelFile.endswith(".onnx")
+        slotInfo.name = os.path.splitext(os.path.basename(slotInfo.modelFile))[0]
+        return slotInfo
+
     def __del__(self):
         del self.net_g
         del self.onnx_session

View File

@@ -1,7 +1,8 @@
 import sys
 import os
-from voice_changer.utils.LoadModelParams import LoadModelParams
+from data.ModelSlot import SoVitsSvc40ModelSlot
+from voice_changer.utils.LoadModelParams import LoadModelParams, LoadModelParams2
 from voice_changer.utils.VoiceChangerModel import AudioInOut
 from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
@@ -97,11 +98,7 @@ class SoVitsSvc40:
             self.settings.pyTorchModelFile = modelFile
             self.settings.onnxModelFile = None

-            clusterTorchModel = (
-                params["files"]["soVitsSvc40Cluster"]
-                if "soVitsSvc40Cluster" in params["files"]
-                else None
-            )
+            clusterTorchModel = params["files"]["soVitsSvc40Cluster"] if "soVitsSvc40Cluster" in params["files"] else None

             content_vec_path = self.params.content_vec_500
             content_vec_onnx_path = self.params.content_vec_500_onnx
@@ -212,9 +209,7 @@ class SoVitsSvc40:
     def get_info(self):
         data = asdict(self.settings)
-        data["onnxExecutionProviders"] = (
-            self.onnx_session.get_providers() if self.onnx_session is not None else []
-        )
+        data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.onnx_session is not None else []
         files = ["configFile", "pyTorchModelFile", "onnxModelFile"]
         for f in files:
             if data[f] is not None and os.path.exists(data[f]):
@@ -246,9 +241,7 @@ class SoVitsSvc40:
         )
         if wav_44k.shape[0] % self.hps.data.hop_length != 0:
-            print(
-                f" !!! !!! !!! wav size not multiple of hopsize: {wav_44k.shape[0] / self.hps.data.hop_length}"
-            )
+            print(f" !!! !!! !!! wav size not multiple of hopsize: {wav_44k.shape[0] / self.hps.data.hop_length}")

         f0, uv = utils.interpolate_f0(f0)
         f0 = torch.FloatTensor(f0)
@@ -257,14 +250,10 @@ class SoVitsSvc40:
         f0 = f0.unsqueeze(0)
         uv = uv.unsqueeze(0)

-        wav16k_numpy = librosa.resample(
-            audio_buffer, orig_sr=self.hps.data.sampling_rate, target_sr=16000
-        )
+        wav16k_numpy = librosa.resample(audio_buffer, orig_sr=self.hps.data.sampling_rate, target_sr=16000)
         wav16k_tensor = torch.from_numpy(wav16k_numpy)

-        if (
-            self.settings.gpu < 0 or self.gpu_num == 0
-        ) or self.settings.framework == "ONNX":
+        if (self.settings.gpu < 0 or self.gpu_num == 0) or self.settings.framework == "ONNX":
             dev = torch.device("cpu")
         else:
             dev = torch.device("cuda", index=self.settings.gpu)
@@ -282,44 +271,27 @@ class SoVitsSvc40:
         if self.hps.model.ssl_dim == 768:
             self.hubert_model = self.hubert_model.to(dev)
             wav16k_tensor = wav16k_tensor.to(dev)
-            c = get_hubert_content_layer9(
-                self.hubert_model, wav_16k_tensor=wav16k_tensor
-            )
+            c = get_hubert_content_layer9(self.hubert_model, wav_16k_tensor=wav16k_tensor)
         else:
             self.hubert_model = self.hubert_model.to(dev)
             wav16k_tensor = wav16k_tensor.to(dev)
-            c = utils.get_hubert_content(
-                self.hubert_model, wav_16k_tensor=wav16k_tensor
-            )
+            c = utils.get_hubert_content(self.hubert_model, wav_16k_tensor=wav16k_tensor)
         uv = uv.to(dev)
         f0 = f0.to(dev)

         c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])

-        if (
-            self.settings.clusterInferRatio != 0
-            and hasattr(self, "cluster_model")
-            and self.cluster_model is not None
-        ):
-            speaker = [
-                key
-                for key, value in self.settings.speakers.items()
-                if value == self.settings.dstId
-            ]
+        if self.settings.clusterInferRatio != 0 and hasattr(self, "cluster_model") and self.cluster_model is not None:
+            speaker = [key for key, value in self.settings.speakers.items() if value == self.settings.dstId]
             if len(speaker) != 1:
                 pass
                 # print("not only one speaker found.", speaker)
             else:
-                cluster_c = cluster.get_cluster_center_result(
-                    self.cluster_model, c.cpu().numpy().T, speaker[0]
-                ).T
+                cluster_c = cluster.get_cluster_center_result(self.cluster_model, c.cpu().numpy().T, speaker[0]).T
                 cluster_c = torch.FloatTensor(cluster_c).to(dev)
                 c = c.to(dev)
-                c = (
-                    self.settings.clusterInferRatio * cluster_c
-                    + (1 - self.settings.clusterInferRatio) * c
-                )
+                c = self.settings.clusterInferRatio * cluster_c + (1 - self.settings.clusterInferRatio) * c

         c = c.unsqueeze(0)
         return c, f0, uv
@@ -334,20 +306,14 @@ class SoVitsSvc40:
         newData = newData.astype(np.float32) / self.hps.data.max_wav_value

         if self.audio_buffer is not None:
-            self.audio_buffer = np.concatenate(
-                [self.audio_buffer, newData], 0
-            )  # concatenate with past data
+            self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)  # concatenate with past data
         else:
             self.audio_buffer = newData

-        convertSize = (
-            inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
-        )
+        convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize

         if convertSize % self.hps.data.hop_length != 0:  # pad because truncation occurs at the model's output hop size
-            convertSize = convertSize + (
-                self.hps.data.hop_length - (convertSize % self.hps.data.hop_length)
-            )
+            convertSize = convertSize + (self.hps.data.hop_length - (convertSize % self.hps.data.hop_length))

         convertOffset = -1 * convertSize
         self.audio_buffer = self.audio_buffer[convertOffset:]  # extract only the portion to convert
@@ -389,9 +355,7 @@ class SoVitsSvc40:
                     "f0": f0.astype(np.float32),
                     "uv": uv.astype(np.float32),
                     "g": sid_target.astype(np.int64),
-                    "noise_scale": np.array([self.settings.noiseScale]).astype(
-                        np.float32
-                    ),
+                    "noise_scale": np.array([self.settings.noiseScale]).astype(np.float32),
                     # "predict_f0": np.array([self.settings.dstId]).astype(np.int64),
                 },
             )[0][0, 0]
@@ -457,6 +421,20 @@ class SoVitsSvc40:
         return audio

+    @classmethod
+    def loadModel2(cls, props: LoadModelParams2):
+        slotInfo: SoVitsSvc40ModelSlot = SoVitsSvc40ModelSlot()
+        for file in props.files:
+            if file.kind == "soVitsSvc40Config":
+                slotInfo.configFile = file.name
+            elif file.kind == "soVitsSvc40Model":
+                slotInfo.modelFile = file.name
+            elif file.kind == "soVitsSvc40Cluster":
+                slotInfo.clusterFile = file.name
+        slotInfo.isONNX = slotInfo.modelFile.endswith(".onnx")
+        slotInfo.name = os.path.splitext(os.path.basename(slotInfo.modelFile))[0]
+        return slotInfo
+
     def __del__(self):
         del self.net_g
         del self.onnx_session

View File

@@ -95,22 +95,48 @@ class VoiceChangerManager(ServerDeviceCallbacks):
             # unpack the data
             params = LoadModelParams2(**paramDict)
             params.files = [LoadModelParamFile(**x) for x in paramDict["files"]]

             # copy the files into the slot
             for file in params.files:
                 print("FILE", file)
-                srcPath = os.path.join(UPLOAD_DIR, file.name)
-                dstDir = os.path.join(self.params.model_dir, str(params.slot))
+                srcPath = os.path.join(UPLOAD_DIR, file.dir, file.name)
+                dstDir = os.path.join(
+                    self.params.model_dir,
+                    str(params.slot),
+                    file.dir,
+                )
                 dstPath = os.path.join(dstDir, file.name)
                 os.makedirs(dstDir, exist_ok=True)
                 print(f"move to {srcPath} -> {dstPath}")
                 shutil.move(srcPath, dstPath)
                 file.name = dstPath

             # create the metadata (defined per VC)
             if params.voiceChangerType == "RVC":
                 from voice_changer.RVC.RVC import RVC  # importing at startup prevents the parameters from being read

                 slotInfo = RVC.loadModel2(params)
                 self.modelSlotManager.save_model_slot(params.slot, slotInfo)
+            elif params.voiceChangerType == "MMVCv13":
+                from voice_changer.MMVCv13.MMVCv13 import MMVCv13
+
+                slotInfo = MMVCv13.loadModel2(params)
+                self.modelSlotManager.save_model_slot(params.slot, slotInfo)
+            elif params.voiceChangerType == "MMVCv15":
+                from voice_changer.MMVCv15.MMVCv15 import MMVCv15
+
+                slotInfo = MMVCv15.loadModel2(params)
+                self.modelSlotManager.save_model_slot(params.slot, slotInfo)
+            elif params.voiceChangerType == "so-vits-svc-40":
+                from voice_changer.SoVitsSvc40.SoVitsSvc40 import SoVitsSvc40
+
+                slotInfo = SoVitsSvc40.loadModel2(params)
+                self.modelSlotManager.save_model_slot(params.slot, slotInfo)
+            elif params.voiceChangerType == "DDSP-SVC":
+                from voice_changer.DDSP_SVC.DDSP_SVC import DDSP_SVC
+
+                slotInfo = DDSP_SVC.loadModel2(params)
+                self.modelSlotManager.save_model_slot(params.slot, slotInfo)

             print("params", params)
         else:
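
With the new dir component, an upload is staged under UPLOAD_DIR/<dir>/<name> and moved to <model_dir>/<slot>/<dir>/<name>, so DDSP-SVC's model/ and diff/ files keep their relative layout. A sketch of the path arithmetic (directory names illustrative):

import os

UPLOAD_DIR = "upload_dir"  # assumed name for illustration
model_dir = "model_dir"    # assumed name for illustration
slot, d, name = 3, "diff/", "diffusion.pt"

srcPath = os.path.join(UPLOAD_DIR, d, name)     # upload_dir/diff/diffusion.pt
dstDir = os.path.join(model_dir, str(slot), d)  # model_dir/3/diff/
dstPath = os.path.join(dstDir, name)            # model_dir/3/diff/diffusion.pt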

View File

@@ -43,6 +43,7 @@ LoadModelParamFileKind: TypeAlias = Literal[
 class LoadModelParamFile:
     name: str
     kind: LoadModelParamFileKind
+    dir: str


 @dataclass