From 6b9777f3a272f459a1f94ff3eab8cc3912f75e45 Mon Sep 17 00:00:00 2001
From: nadare <1na2da0re3@gmail.com>
Date: Wed, 31 May 2023 23:50:43 +0900
Subject: [PATCH] =?UTF-8?q?protect=20+=20crepe=E5=AF=BE=E5=BF=9C?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 server/voice_changer/RVC/ModelSlot.py         |  1 +
 .../voice_changer/RVC/ModelSlotGenerator.py   |  3 +++
 server/voice_changer/RVC/RVC.py               |  6 ++++++
 server/voice_changer/RVC/RVCSettings.py       |  3 ++-
 server/voice_changer/RVC/SampleDownloader.py  |  1 +
 .../RVC/modelMerger/MergeModelRequest.py      |  1 +
 server/voice_changer/RVC/pipeline/Pipeline.py | 20 +++++++++++++++++--
 .../RVC/pitchExtractor/CrepePitchExtractor.py |  7 +++++--
 8 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/server/voice_changer/RVC/ModelSlot.py b/server/voice_changer/RVC/ModelSlot.py
index b008f943..856652f7 100644
--- a/server/voice_changer/RVC/ModelSlot.py
+++ b/server/voice_changer/RVC/ModelSlot.py
@@ -12,6 +12,7 @@ class ModelSlot:
     indexFile: str = ""
     defaultTune: int = 0
     defaultIndexRatio: int = 1
+    # defaultProtect: float = .5
     isONNX: bool = False
     modelType: EnumInferenceTypes = EnumInferenceTypes.pyTorchRVC
     samplingRate: int = -1
diff --git a/server/voice_changer/RVC/ModelSlotGenerator.py b/server/voice_changer/RVC/ModelSlotGenerator.py
index 03c0c28b..41f1b43a 100644
--- a/server/voice_changer/RVC/ModelSlotGenerator.py
+++ b/server/voice_changer/RVC/ModelSlotGenerator.py
@@ -36,6 +36,9 @@ def generateModelSlot(slotDir: str):
         modelSlot.defaultIndexRatio = (
             params["defaultIndexRatio"] if "defaultIndexRatio" in params else 0
         )
+        # modelSlot.defaultProtect = (
+        #     params["defaultProtect"] if "defaultProtect" in params else 0.5
+        # )
         modelSlot.name = params["name"] if "name" in params else None
         modelSlot.description = params["description"] if "description" in params else None
         modelSlot.credit = params["credit"] if "credit" in params else None
diff --git a/server/voice_changer/RVC/RVC.py b/server/voice_changer/RVC/RVC.py
index 09003cc5..1871bd28 100644
--- a/server/voice_changer/RVC/RVC.py
+++ b/server/voice_changer/RVC/RVC.py
@@ -242,6 +242,7 @@ class RVC:
         # その他の設定
         self.next_trans = modelSlot.defaultTune
         self.next_index_ratio = modelSlot.defaultIndexRatio
+        # self.next_protect = modelSlot.defaultProtect
         self.next_samplingRate = modelSlot.samplingRate
         self.next_framework = "ONNX" if modelSlot.isONNX else "PyTorch"
         # self.needSwitch = True
@@ -254,6 +255,7 @@ class RVC:
         self.pipeline = self.next_pipeline
         self.settings.tran = self.next_trans
         self.settings.indexRatio = self.next_index_ratio
+        # self.settings.protect = self.next_protect
         self.settings.modelSamplingRate = self.next_samplingRate
         self.settings.framework = self.next_framework
 
@@ -336,6 +338,7 @@ class RVC:
         sid = 0
         f0_up_key = self.settings.tran
         index_rate = self.settings.indexRatio
+        protect = .5# self.settings.protect
         if_f0 = 1 if self.settings.modelSlots[self.currentSlot].f0 else 0
         embOutputLayer = self.settings.modelSlots[self.currentSlot].embOutputLayer
         useFinalProj = self.settings.modelSlots[self.currentSlot].useFinalProj
@@ -350,6 +353,7 @@ class RVC:
             embOutputLayer,
             useFinalProj,
             repeat,
+            protect
         )
 
         result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
@@ -411,6 +415,7 @@ class RVC:
         params = {
             "defaultTune": req.defaultTune,
             "defaultIndexRatio": req.defaultIndexRatio,
+            # "defaultProtect": req.defaultProtect
             "sampleId": "",
             "files": {"rvcModel": storeFile},
         }
@@ -432,6 +437,7 @@ class RVC:
         )
         params["defaultTune"] = self.settings.tran
         params["defaultIndexRatio"] = self.settings.indexRatio
+        # params["defaultProtect"] = self.settings.protect
 
         json.dump(params, open(os.path.join(slotDir, "params.json"), "w"))
         self.loadSlots()
diff --git a/server/voice_changer/RVC/RVCSettings.py b/server/voice_changer/RVC/RVCSettings.py
index 23153558..c74979a9 100644
--- a/server/voice_changer/RVC/RVCSettings.py
+++ b/server/voice_changer/RVC/RVCSettings.py
@@ -29,6 +29,7 @@ class RVCSettings:
     sampleModels: list[RVCModelSample] = field(default_factory=lambda: [])
 
     indexRatio: float = 0
+    # protect: float = 0.5
     rvcQuality: int = 0
     silenceFront: int = 1  # 0:off, 1:on
     modelSamplingRate: int = 48000
@@ -50,5 +51,5 @@ class RVCSettings:
         "isHalf",
         "enableDirectML",
     ]
-    floatData = ["silentThreshold", "indexRatio"]
+    floatData = ["silentThreshold", "indexRatio"]  # , "protect"]
     strData = ["framework", "f0Detector"]
diff --git a/server/voice_changer/RVC/SampleDownloader.py b/server/voice_changer/RVC/SampleDownloader.py
index a0e4daf1..cfc35fab 100644
--- a/server/voice_changer/RVC/SampleDownloader.py
+++ b/server/voice_changer/RVC/SampleDownloader.py
@@ -81,6 +81,7 @@ def downloadInitialSampleModels(sampleJsons: list[str], model_dir: str):
         sampleParams["sampleId"] = sample.id
         sampleParams["defaultTune"] = 0
         sampleParams["defaultIndexRatio"] = 1
+        # sampleParams["defaultProtect"] = 0.5
         sampleParams["credit"] = sample.credit
         sampleParams["description"] = sample.description
         sampleParams["name"] = sample.name
diff --git a/server/voice_changer/RVC/modelMerger/MergeModelRequest.py b/server/voice_changer/RVC/modelMerger/MergeModelRequest.py
index 86eb322d..68cd6a71 100644
--- a/server/voice_changer/RVC/modelMerger/MergeModelRequest.py
+++ b/server/voice_changer/RVC/modelMerger/MergeModelRequest.py
@@ -17,4 +17,5 @@ class MergeModelRequest:
     slot: int = -1
     defaultTune: int = 0
     defaultIndexRatio: int = 1
+    # defaultProtect: float = .5
     files: List[MergeFile] = field(default_factory=lambda: [])
diff --git a/server/voice_changer/RVC/pipeline/Pipeline.py b/server/voice_changer/RVC/pipeline/Pipeline.py
index e44d0e6c..d5f17d91 100644
--- a/server/voice_changer/RVC/pipeline/Pipeline.py
+++ b/server/voice_changer/RVC/pipeline/Pipeline.py
@@ -85,7 +85,9 @@ class Pipeline(object):
         embOutputLayer,
         useFinalProj,
         repeat,
+        protect=0.5,
     ):
+        search_index = self.index is not None and self.big_npy is not None and index_rate != 0
         self.t_pad = self.sr * repeat
         self.t_pad_tgt = self.targetSR * repeat
 
@@ -136,10 +138,12 @@ class Pipeline(object):
                 raise DeviceChangingException()
             else:
                 raise e
+        if protect < 0.5 and search_index:
+             feats0 = feats.clone()   
 
         # Index - feature抽出
         # if self.index is not None and self.feature is not None and index_rate != 0:
-        if self.index is not None and self.big_npy is not None and index_rate != 0:
+        if search_index:
             npy = feats[0].cpu().numpy()
             if self.isHalf is True:
                 npy = npy.astype("float32")
@@ -165,7 +169,10 @@ class Pipeline(object):
                 + (1 - index_rate) * feats
             )
         feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
-
+        if protect < 0.5 and search_index:
+            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
+                0, 2, 1
+            )
         # ピッチサイズ調整
         p_len = audio_pad.shape[0] // self.window
         if feats.shape[1] < p_len:
@@ -173,6 +180,15 @@ class Pipeline(object):
             if pitch is not None and pitchf is not None:
                 pitch = pitch[:, :p_len]
                 pitchf = pitchf[:, :p_len]
+
+        # When pitch estimation fails (pitchf == 0), blend in the features from before the index search
+        if protect < 0.5 and search_index:
+            pitchff = pitchf.clone()
+            pitchff[pitchf > 0] = 1
+            pitchff[pitchf < 1] = protect
+            pitchff = pitchff.unsqueeze(-1)
+            feats = feats * pitchff + feats0 * (1 - pitchff)
+            feats = feats.to(feats0.dtype)
         p_len = torch.tensor([p_len], device=self.device).long()
 
         # 推論実行
diff --git a/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py b/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py
index 493ef945..bea0f32e 100644
--- a/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py
+++ b/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py
@@ -26,7 +26,7 @@ class CrepePitchExtractor(PitchExtractor):
         f0_mel_min = 1127 * np.log(1 + f0_min / 700)
         f0_mel_max = 1127 * np.log(1 + f0_max / 700)
 
-        f0 = torchcrepe.predict(
+        f0, pd = torchcrepe.predict(
             audio.unsqueeze(0),
             sr,
             hop_length=window,
@@ -37,8 +37,11 @@ class CrepePitchExtractor(PitchExtractor):
             batch_size=256,
             decoder=torchcrepe.decode.weighted_argmax,
             device=self.device,
+            return_periodicity=True,
         )
-        f0 = torchcrepe.filter.median(f0, 3)
+        f0 = torchcrepe.filter.median(f0, 3) # upstream uses a mean filter here; a median filter is used to match harvest
+        pd = torchcrepe.filter.median(pd, 3)
+        f0[pd < 0.1] = 0
         f0 = f0.squeeze()
 
         f0 = torch.nn.functional.pad(