skipdiffusion

2023-08-06 04:50:42 +09:00 · 2023-08-06 04:50:42 +09:00 · e18138b5d6
commit e18138b5d6
parent 6d4c138821
8 changed files with 78 additions and 16 deletions
--- a/client/demo/dist/index.js
+++ b/client/demo/dist/index.js
--- a/client/demo/src/components/demo/components2/101-7_diffusion-svcSettingArea.tsx
+++ b/client/demo/src/components/demo/components2/101-7_diffusion-svcSettingArea.tsx
@ -23,6 +23,26 @@ export const DiffusionSVCSettingArea = (_props: DiffusionSVCSettingAreaProps) =>
            return <></>;
        }
        const skipDiffusionClass = serverSetting.serverSetting.skipDiffusion == 0 ? "character-area-toggle-button" : "character-area-toggle-button-active";
        const skipDiffRow = (
            <div className="character-area-control">
                <div className="character-area-control-title">Boost</div>
                <div className="character-area-control-field">
                    <div className="character-area-buttons">
                        <div
                            className={skipDiffusionClass}
                            onClick={() => {
                                serverSetting.updateServerSettings({ ...serverSetting.serverSetting, skipDiffusion: serverSetting.serverSetting.skipDiffusion == 0 ? 1 : 0 });
                            }}
                        >
                            skip diff
                        </div>
                    </div>
                </div>
            </div>
        );
        const skipValues = getDivisors(serverSetting.serverSetting.kStep);
        skipValues.pop();
@ -82,6 +102,7 @@ export const DiffusionSVCSettingArea = (_props: DiffusionSVCSettingAreaProps) =>
        );
        return (
            <>
                {skipDiffRow}
                {kStepRow}
                {speedUpRow}
            </>
--- a/client/demo/src/css/App.css
+++ b/client/demo/src/css/App.css
@ -1338,6 +1338,7 @@ body {
        .character-area-control {
            display: flex;
            gap: 3px;
            align-items: center;
            .character-area-control-buttons {
                display: flex;
                flex-direction: row;
@ -1405,6 +1406,35 @@ body {
                    /* Hover state for .character-area-button: replaces the border with a 2px solid #faa (light red) one. */
                    .character-area-button:hover {
                        border: solid 2px #faa;
                    }
                    /*
                     * Toggle button in its "off" appearance: white text on a #666 grey
                     * background with a #999 border. The settings component selects this
                     * class when serverSetting.skipDiffusion == 0.
                     */
                    .character-area-toggle-button {
                        border: solid 2px #999;
                        color: white;
                        background: #666;
                        cursor: pointer;
                        font-size: 0.8rem;
                        border-radius: 5px;
                        height: 1.2rem;
                        padding-left: 2px;
                        padding-right: 2px;
                    }
                    /* Hover state for the "off" toggle button: border switches to 2px solid #faa. */
                    .character-area-toggle-button:hover {
                        border: solid 2px #faa;
                    }
                    /*
                     * Toggle button in its "on" appearance. Every declaration matches
                     * .character-area-toggle-button except `background`, which is #844
                     * (dark red) instead of #666.
                     * NOTE(review): no :hover rule for this class is visible in this hunk —
                     * confirm whether the active state is meant to have hover feedback.
                     */
                    .character-area-toggle-button-active {
                        border: solid 2px #999;
                        color: white;
                        background: #844;
                        cursor: pointer;
                        font-size: 0.8rem;
                        border-radius: 5px;
                        height: 1.2rem;
                        padding-left: 2px;
                        padding-right: 2px;
                    }
                }
            }
        }
--- a/client/lib/src/const.ts
+++ b/client/lib/src/const.ts
@ -124,6 +124,7 @@ export const ServerSettingKey = {
    "threshold": "threshold",
    "speedUp": "speedUp",
    "skipDiffusion": "skipDiffusion",
    "inputSampleRate": "inputSampleRate",
    "enableDirectML": "enableDirectML",
@ -186,7 +187,7 @@ export type VoiceChangerServerSetting = {
    threshold: number// DDSP-SVC
    speedUp: number // Diffusion-SVC
-
+    skipDiffusion: number // Diffusion-SVC 0:off, 1:on
    inputSampleRate: InputSampleRate
    enableDirectML: number
@ -403,6 +404,7 @@ export const DefaultServerSetting: ServerInfo = {
    threshold: -45,
    speedUp: 10,
    skipDiffusion: 1,
    enableDirectML: 0,
    // 
--- a/server/voice_changer/DiffusionSVC/DiffusionSVC.py
+++ b/server/voice_changer/DiffusionSVC/DiffusionSVC.py
@ -180,7 +180,8 @@ class DiffusionSVC(VoiceChangerModel):
                silenceFrontSec,
                embOutputLayer,
                useFinalProj,
-                protect
+                protect,
                skip_diffusion=self.settings.skipDiffusion,
            )
            result = audio_out.detach().cpu().numpy()
            return result
--- a/server/voice_changer/DiffusionSVC/DiffusionSVCSettings.py
+++ b/server/voice_changer/DiffusionSVC/DiffusionSVCSettings.py
@ -13,7 +13,7 @@ class DiffusionSVCSettings:
    kStep: int = 20
    speedUp: int = 10
-    skipDiffusion: int = 0  # 0:off, 1:on
+    skipDiffusion: int = 1  # 0:off, 1:on
    silenceFront: int = 1  # 0:off, 1:on
    modelSamplingRate: int = 44100
@ -30,6 +30,7 @@ class DiffusionSVCSettings:
        "kStep",
        "speedUp",
        "silenceFront",
        "skipDiffusion",
    ]
    floatData = ["silentThreshold"]
    strData = ["f0Detector"]
--- a/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py
+++ b/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py
@ -112,25 +112,27 @@ class DiffusionSVCInferencer(Inferencer):
        k_step: int,
        infer_speedup: int,
        silence_front: float,
        skip_diffusion: bool = True,
    ) -> torch.Tensor:
        with Timer("pre-process", False) as t:
            gt_spec = self.naive_model_call(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, spk_emb=None)
            # gt_spec = self.vocoder.extract(audio_t, 16000)
            # gt_spec = torch.cat((gt_spec, gt_spec[:, -1:, :]), 1)
        # print("[    ----Timer::1: ]", t.secs)
-        with Timer("pre-process", False) as t:
+        with Timer("pre-process", True) as t:
-            out_mel = self.__call__(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, gt_spec=gt_spec, infer_speedup=infer_speedup, method='dpm-solver', k_step=k_step, use_tqdm=False, spk_emb=None)
+            if skip_diffusion == 0:
                out_mel = self.__call__(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, gt_spec=gt_spec, infer_speedup=infer_speedup, method='dpm-solver', k_step=k_step, use_tqdm=False, spk_emb=None)
                gt_spec = out_mel
        print("[    ----Timer::2: ]", t.secs)
        # print("[    ----Timer::2: ]", t.secs)
        with Timer("pre-process", False) as t:  # NOQA
            if self.vocoder_onnx is None:
                start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
-                out_wav = self.mel2wav(out_mel, pitch, start_frame=start_frame)
+                out_wav = self.mel2wav(gt_spec, pitch, start_frame=start_frame)
                out_wav *= mask
            else:
-                out_wav = self.vocoder_onnx.infer(out_mel, pitch, silence_front, mask)
+                out_wav = self.vocoder_onnx.infer(gt_spec, pitch, silence_front, mask)
        # print("[    ----Timer::3: ]", t.secs)
        return out_wav.squeeze()
--- a/server/voice_changer/DiffusionSVC/inferencer/Inferencer.py
+++ b/server/voice_changer/DiffusionSVC/inferencer/Inferencer.py
@ -21,11 +21,16 @@ class Inferencer(Protocol):
    # Protocol signature for inference: declares the arguments an Inferencer
    # implementation accepts and that it returns a torch.Tensor. The body is
    # only `...`; no behavior is defined here.
    # Lines starting with `-` / `+` are the commit's removed / added parameters.
    # NOTE(review): skip_diffusion is annotated as bool with default True —
    # verify that callers do not pass an int 0/1 flag instead.
    def infer(
        self,
        audio_t: torch.Tensor,
        feats: torch.Tensor,
-        pitch_length: torch.Tensor,
+        pitch: torch.Tensor,
-        pitch: torch.Tensor | None,
+        volume: torch.Tensor,
-        pitchf: torch.Tensor | None,
+        mask: torch.Tensor,
        sid: torch.Tensor,
        k_step: int,
        infer_speedup: int,
        silence_front: float,
        skip_diffusion: bool = True,
    ) -> torch.Tensor:
        ...