WIP: support rvc-webui

This commit is contained in:
wataru 2023-04-23 19:36:41 +09:00
parent 2fa33aad8d
commit d3823183d6
3 changed files with 40 additions and 22 deletions

View File

@ -32,7 +32,7 @@ import pyworld as pw
from voice_changer.RVC.custom_vc_infer_pipeline import VC
from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono
from .models import SynthesizerTrnMsNSFsid as SynthesizerTrnMs768NSFsid
from .const import RVC_MODEL_TYPE_NORMAL, RVC_MODEL_TYPE_PITCH_LESS, RVC_MODEL_TYPE_NORMAL_768, RVC_MODEL_TYPE_UNKNOWN
from .const import RVC_MODEL_TYPE_NORMAL, RVC_MODEL_TYPE_PITCH_LESS, RVC_MODEL_TYPE_WEBUI_256_NORMAL, RVC_MODEL_TYPE_WEBUI_768_NORMAL, RVC_MODEL_TYPE_UNKNOWN
from fairseq import checkpoint_utils
providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]
@ -155,34 +155,49 @@ class RVC:
if pyTorchModelFile != None and pyTorchModelFile != "":
cpt = torch.load(pyTorchModelFile, map_location="cpu")
'''
ノーマル or Pitchレス判定 コンフィグのupsamplingrateの形状から判断
ノーマル
[1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 6, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 48000]
ピッチレス
[1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2],  512, [16, 16, 4, 4],109, 256, 40000]
12番目の要素upsamplingrateの数で判定4: ピッチレス, 5:ノーマル
256 or 768判定 config全体の形状
(1) オリジナルとrvc-webuiのモデル判定 config全体の形状
ノーマル256
[1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 6, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 48000]
ノーマル 768対応
[1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 6, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 768, 48000]
config全体の長さで判定 config全体の形状
18: オリジナル, 19: rvc-webui
(2-1) オリジナルのノーマルorPitchレス判定 コンフィグのupsamplingrateの形状から判断
ノーマル
[1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 6, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 48000]
ピッチレス
[1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2],  512, [16, 16, 4, 4],109, 256, 40000]
12番目の要素upsamplingrateの数で判定4: ピッチレス, 5:ノーマル
(2-2) rvc-webuiの(256 or 768) x (ノーマルor pitchレス)判定 256, or 768 は17番目の要素で判定
256 x ノーマル
[1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 6, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 256, 48000]
256 x pitchレス
[1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 6, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 256, 48000]
768 x ノーマル
[1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 6, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 768, 48000]
768 x pitchレス
[1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 6, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 768, 48000]
'''
print("config shape:::::", cpt["config"])
config_len = len(cpt["config"])
upsamplingRateDims = len(cpt["config"][12])
if config_len == 18 and upsamplingRateDims == 4:
print("[Voice Changer] RVC Model Type: Pitch-Less")
print("[Voice Changer] RVC Model Type: RVC_MODEL_TYPE_PITCH_LESS")
self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_PITCH_LESS
elif config_len == 18 and upsamplingRateDims == 5:
print("[Voice Changer] RVC Model Type: Normal")
print("[Voice Changer] RVC Model Type: RVC_MODEL_TYPE_NORMAL")
self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_NORMAL
elif config_len == 19:
print("[Voice Changer] RVC Model Type: Normal_768")
self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_NORMAL_768
embedding = cpt["config"][17]
if embedding == 256:
print("[Voice Changer] RVC Model Type: RVC_MODEL_TYPE_WEBUI_256_NORMAL")
self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_WEBUI_256_NORMAL
else:
print("[Voice Changer] RVC Model Type: RVC_MODEL_TYPE_WEBUI_768_NORMAL")
self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_WEBUI_768_NORMAL
else:
print("[Voice Changer] RVC Model Type: UNKNOWN")
self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_UNKNOWN
@ -193,7 +208,7 @@ class RVC:
net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=self.is_half)
elif self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_PITCH_LESS:
net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
elif self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_NORMAL_768:
elif self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_WEBUI_256_NORMAL or self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_WEBUI_768_NORMAL:
net_g = SynthesizerTrnMs768NSFsid(**cpt["params"], is_half=self.is_half)
else:
print("unknwon")

View File

@ -1,4 +1,7 @@
RVC_MODEL_TYPE_NORMAL = 0
RVC_MODEL_TYPE_PITCH_LESS = 1
RVC_MODEL_TYPE_NORMAL_768 = 2
RVC_MODEL_TYPE_WEBUI_256_NORMAL = 2
RVC_MODEL_TYPE_WEBUI_256_PITCHLESS = 3
RVC_MODEL_TYPE_WEBUI_768_NORMAL = 4
RVC_MODEL_TYPE_WEBUI_768_PITCHLESS = 5
RVC_MODEL_TYPE_UNKNOWN = 99

View File

@ -10,7 +10,7 @@ import pyworld
import os
import traceback
import faiss
from .const import RVC_MODEL_TYPE_NORMAL, RVC_MODEL_TYPE_PITCH_LESS, RVC_MODEL_TYPE_NORMAL_768
from .const import RVC_MODEL_TYPE_NORMAL, RVC_MODEL_TYPE_PITCH_LESS, RVC_MODEL_TYPE_WEBUI_256_NORMAL, RVC_MODEL_TYPE_WEBUI_768_NORMAL
class VC(object):
@ -94,7 +94,7 @@ class VC(object):
assert feats.dim() == 1, feats.dim()
feats = feats.view(1, -1)
padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
if modelType == RVC_MODEL_TYPE_NORMAL or modelType == RVC_MODEL_TYPE_PITCH_LESS:
if modelType == RVC_MODEL_TYPE_NORMAL or modelType == RVC_MODEL_TYPE_PITCH_LESS or modelType == RVC_MODEL_TYPE_WEBUI_256_NORMAL:
inputs = {
"source": feats.to(self.device),
"padding_mask": padding_mask,
@ -109,7 +109,7 @@ class VC(object):
t0 = ttime()
with torch.no_grad():
logits = model.extract_features(**inputs)
if modelType == RVC_MODEL_TYPE_NORMAL or modelType == RVC_MODEL_TYPE_PITCH_LESS:
if modelType == RVC_MODEL_TYPE_NORMAL or modelType == RVC_MODEL_TYPE_PITCH_LESS or modelType == RVC_MODEL_TYPE_WEBUI_256_NORMAL:
feats = model.final_proj(logits[0])
else:
feats = logits[0]
@ -136,7 +136,7 @@ class VC(object):
p_len = torch.tensor([p_len], device=self.device).long()
with torch.no_grad():
if modelType == RVC_MODEL_TYPE_NORMAL or modelType == RVC_MODEL_TYPE_NORMAL_768:
if modelType == RVC_MODEL_TYPE_NORMAL or modelType == RVC_MODEL_TYPE_WEBUI_256_NORMAL or modelType == RVC_MODEL_TYPE_WEBUI_768_NORMAL:
audio1 = (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
else:
audio1 = (net_g.infer(feats, p_len, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)