WIP: so-vits-svc 40v2, alpha (before refactoring)

wataru 2023-03-13 00:54:49 +09:00
parent 01f64cc8ec
commit ca4d454d1a
5 changed files with 76 additions and 17 deletions

View File

@ -70,6 +70,7 @@ export const ServerSettingKey = {
"noiceScale": "noiceScale",
"predictF0": "predictF0",
"silentThreshold": "silentThreshold",
"processingLength": "processingLength",
"inputSampleRate": "inputSampleRate",
} as const
@ -96,6 +97,7 @@ export type VoiceChangerServerSetting = {
noiceScale: number // so-vits-svc
predictF0: number // so-vits-svc
silentThreshold: number // so-vits-svc
processingLength: number // so-vits-svc
inputSampleRate: InputSampleRate
}
@ -127,6 +129,7 @@ export const DefaultServerSetting_MMVCv15: ServerInfo = {
noiceScale: 0,
predictF0: 0,
silentThreshold: 0,
processingLength: 0,
inputSampleRate: 24000,
@ -157,6 +160,7 @@ export const DefaultServerSetting_MMVCv13: ServerInfo = {
noiceScale: 0,
predictF0: 0,
silentThreshold: 0,
processingLength: 0,
inputSampleRate: 24000,
@ -183,14 +187,15 @@ export const DefaultServerSetting_so_vits_svc_40v2: ServerInfo = {
f0Detector: F0Detector.dio,
recordIO: 0,
tran: 0,
noiceScale: 0,
predictF0: 0,
silentThreshold: 0,
// tran: 10,
// noiceScale: 0.3,
// tran: 0,
// noiceScale: 0,
// predictF0: 0,
// silentThreshold: 0.00001,
// silentThreshold: 0,
tran: 10,
noiceScale: 0.3,
predictF0: 0,
silentThreshold: 0.00001,
processingLength: 1024 * 32,
inputSampleRate: 24000,
@ -254,6 +259,14 @@ export const DefaultWorkletNodeSetting: WorkletNodeSetting = {
downSamplingMode: "average"
}
export const DefaultWorkletNodeSetting_so_vits_svc_40v2: WorkletNodeSetting = {
serverUrl: "",
protocol: "sio",
sendingSampleRate: 24000,
inputChunkNum: 128,
downSamplingMode: "average"
}
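For orientation: the new processingLength default above is 1024 * 32 = 32768. Assuming the value is a sample count at the 24000 Hz inputSampleRate set alongside it (an assumption; the unit is not stated in this diff), that is roughly 1.37 seconds of retained audio:

// Rough sanity check (assumes processingLength counts samples at the
// 24000 Hz inputSampleRate configured above)
const processingLength = 1024 * 32         // 32768 samples
const seconds = processingLength / 24000   // ≈ 1.37 s of audio history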
///////////////////////
// Client settings
///////////////////////

View File

@ -1,6 +1,6 @@
import { useState, useMemo, useEffect } from "react"
import { ClientType, DefaultWorkletNodeSetting, INDEXEDDB_KEY_WORKLETNODE, WorkletNodeSetting } from "../const"
import { ClientType, DefaultWorkletNodeSetting, DefaultWorkletNodeSetting_so_vits_svc_40v2, INDEXEDDB_KEY_WORKLETNODE, WorkletNodeSetting } from "../const"
import { VoiceChangerClient } from "../VoiceChangerClient"
import { useIndexedDB } from "./useIndexedDB"
@ -19,7 +19,19 @@ export type WorkletNodeSettingState = {
}
export const useWorkletNodeSetting = (props: UseWorkletNodeSettingProps): WorkletNodeSettingState => {
const [workletNodeSetting, _setWorkletNodeSetting] = useState<WorkletNodeSetting>(DefaultWorkletNodeSetting)
const defaultWorkletNodeSetting = useMemo(() => {
if (props.clientType == "MMVCv13") {
return DefaultWorkletNodeSetting
} else if (props.clientType == "MMVCv15") {
return DefaultWorkletNodeSetting
} else if (props.clientType == "so_vits_svc_40v2c") {
return DefaultWorkletNodeSetting_so_vits_svc_40v2
} else {
return DefaultWorkletNodeSetting
}
}, [props.clientType])
const [workletNodeSetting, _setWorkletNodeSetting] = useState<WorkletNodeSetting>(defaultWorkletNodeSetting)
const { setItem, getItem, removeItem } = useIndexedDB({ clientType: props.clientType })
// Initialization, part 1: fetch from the DB
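The chain of string comparisons above could also be written as a lookup table; a minimal sketch, assuming ClientType is the string union used elsewhere in this commit ("MMVCv13", "MMVCv15", "so_vits_svc_40v2", ...):

// Equivalent default selection as a table; any client type without an
// entry falls back to the generic default (sketch, not the committed code)
const DEFAULTS: Partial<Record<ClientType, WorkletNodeSetting>> = {
    so_vits_svc_40v2: DefaultWorkletNodeSetting_so_vits_svc_40v2,
}
const defaultWorkletNodeSetting = DEFAULTS[props.clientType] ?? DefaultWorkletNodeSetting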

File diff suppressed because one or more lines are too long

View File

@ -25,20 +25,48 @@ export const useConvertSetting = (): ConvertSettingState => {
<div className="body-row split-3-2-1-4 left-padding-1 guided">
<div className="body-item-title left-padding-1">Input Chunk Num(128sample/chunk)</div>
<div className="body-input-container">
<input type="number" min={1} max={256} step={1} value={appState.workletNodeSetting.workletNodeSetting.inputChunkNum} onChange={(e) => {
<select className="body-select" value={appState.workletNodeSetting.workletNodeSetting.inputChunkNum} onChange={(e) => {
appState.workletNodeSetting.updateWorkletNodeSetting({ ...appState.workletNodeSetting.workletNodeSetting, inputChunkNum: Number(e.target.value) })
appState.workletNodeSetting.trancateBuffer()
}} />
}}>
{
[32, 64, 96, 128, 160, 192, 256, 384, 512].map(x => {
return <option key={x} value={x}>{x}</option>
})
}
</select>
</div>
<div className="body-item-text">
<div>buff: {(appState.workletNodeSetting.workletNodeSetting.inputChunkNum * 128 * 1000 / 48000).toFixed(1)}ms</div>
</div>
<div className="body-item-text"></div>
</div>
)
}, [appState.workletNodeSetting.workletNodeSetting.inputChunkNum, appState.workletNodeSetting.updateWorkletNodeSetting])
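The buff readout above is the audio buffered per send: inputChunkNum chunks of 128 samples each, at the 48000 Hz worklet rate. A quick worked example at the default of 128 chunks:

// Worked example of the buff label above
const inputChunkNum = 128
const bufferMs = inputChunkNum * 128 * 1000 / 48000   // 16384 samples ≈ 341.3 ms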
const processingLengthRow = useMemo(() => {
return (
<div className="body-row split-3-2-1-4 left-padding-1 guided">
<div className="body-item-title left-padding-1">Processing Length</div>
<div className="body-input-container">
<select className="body-select" value={appState.serverSetting.serverSetting.processingLength} onChange={(e) => {
appState.serverSetting.updateServerSettings({ ...appState.serverSetting.serverSetting, processingLength: Number(e.target.value) })
appState.workletNodeSetting.trancateBuffer()
}}>
{
[1024 * 4, 1024 * 8, 1024 * 16, 1024 * 32, 1024 * 64, 1024 * 128].map(x => {
return <option key={x} value={x}>{x}</option>
})
}
</select>
</div>
<div className="body-item-text">
</div>
<div className="body-item-text"></div>
</div>
)
}, [appState.serverSetting.serverSetting, appState.serverSetting.updateServerSettings])
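The Processing Length options above are power-of-two sample counts; written out:

// The option values rendered above, as multiples of 1024
const processingLengthOptions = [4, 8, 16, 32, 64, 128].map(k => 1024 * k)
// -> [4096, 8192, 16384, 32768, 65536, 131072]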
const gpuRow = useMemo(() => {
return (
<div className="body-row split-3-7 left-padding-1 guided">
@ -69,12 +97,13 @@ export const useConvertSetting = (): ConvertSettingState => {
<div className="partition-content">
{inputChunkNumRow}
{processingLengthRow}
{gpuRow}
</div>
</div>
</>
)
}, [inputChunkNumRow, gpuRow])
}, [inputChunkNumRow, processingLengthRow, gpuRow])
return {
convertSetting,

View File

@ -36,6 +36,7 @@ class SoVitsSvc40v2Settings():
noiceScale: float = 0.3
predictF0: int = 0 # 0:False, 1:True
silentThreshold: float = 0.00001
processingLength: int = 1024 * 32
framework: str = "PyTorch" # PyTorch or ONNX
pyTorchModelFile: str = ""
@ -43,7 +44,7 @@ class SoVitsSvc40v2Settings():
configFile: str = ""
# ↓ only the mutable fields are listed here
intData = ["gpu", "dstId", "tran", "predictF0"]
intData = ["gpu", "dstId", "tran", "predictF0", "processingLength"]
floatData = ["noiceScale", "silentThreshold"]
strData = ["framework", "f0Detector"]
@ -170,7 +171,11 @@ class SoVitsSvc40v2:
else:
self.audio_buffer = newData
self.audio_buffer = self.audio_buffer[-(convertSize):] # extract only the portion to be converted
# self.audio_buffer = self.audio_buffer[-(convertSize):] # extract only the portion to be converted
# self.audio_buffer = self.audio_buffer[-1024 * 32:] # extract only the portion to be converted
# self.audio_buffer = self.audio_buffer[-1024 * 128:] # extract only the portion to be converted
# self.audio_buffer = self.audio_buffer[(-1 * 1024 * 32) + (-1 * convertSize):] # extract only the portion to be converted
self.audio_buffer = self.audio_buffer[-1 * self.settings.processingLength + (-1 * convertSize):] # keep processingLength samples of history plus the portion to be converted
crop = self.audio_buffer[cropRange[0]:cropRange[1]]
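The new slice keeps the trailing self.settings.processingLength samples of history plus the convertSize samples being converted (in Python, a[-p + (-c):] is a[-(p + c):], i.e. the last p + c elements). A minimal TypeScript sketch of the same trim, with a hypothetical function name:

// Keep only the trailing (processingLength + convertSize) samples,
// mirroring audio_buffer[-1 * processingLength + (-1 * convertSize):]
function trimBuffer(buf: Float32Array, processingLength: number, convertSize: number): Float32Array {
    const keep = processingLength + convertSize
    return buf.slice(Math.max(0, buf.length - keep))
}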