WIP: diffusion svc rt badf0
parent: 9c829ac91a
commit: 5bf1202215
@@ -66,6 +66,11 @@ class EnumInferenceTypes(Enum):
     onnxRVCNono = "onnxRVCNono"
 
 
+DiffusionSVCInferenceType: TypeAlias = Literal[
+    "combo",
+]
+
+
 PitchExtractorType: TypeAlias = Literal[
     "harvest",
     "dio",
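Note (illustration only, not part of the commit): the new Literal alias above gives type checkers a closed set of inference types, with "combo" currently its only member. A minimal sketch of how such an alias constrains call sites, assuming nothing beyond the typing module:

from typing import Literal, TypeAlias

DiffusionSVCInferenceType: TypeAlias = Literal["combo"]

def load(inferencer_type: DiffusionSVCInferenceType) -> None:
    # A static checker (e.g. mypy) only accepts the literal "combo" here.
    print("loading", inferencer_type)

load("combo")      # OK
# load("onnx")     # would be flagged statically; no loader for it exists in this commit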
@@ -1,5 +1,5 @@
 from typing import TypeAlias, Union
-from const import MAX_SLOT_NUM, EnumInferenceTypes, EmbedderType, VoiceChangerType
+from const import MAX_SLOT_NUM, DiffusionSVCInferenceType, EnumInferenceTypes, EmbedderType, VoiceChangerType
 
 from dataclasses import dataclass, asdict, field
 
@@ -107,7 +107,7 @@ class DiffusionSVCModelSlot(ModelSlot):
     voiceChangerType: VoiceChangerType = "Diffusion-SVC"
     modelFile: str = ""
     isONNX: bool = False
-    modelType: str = "combo"
+    modelType: DiffusionSVCInferenceType = "combo"
     dstId: int = 1
 
     sampleId: str = ""
@@ -115,6 +115,8 @@ class DiffusionSVCModelSlot(ModelSlot):
     kstep: int = 100
     speakers: dict = field(default_factory=lambda: {1: "user"})
     embedder: EmbedderType = "hubert_base"
+    samplingRate: int = 44100
+    embChannels: int = 768
 
 
 ModelSlots: TypeAlias = Union[ModelSlot, RVCModelSlot, MMVCv13ModelSlot, MMVCv15ModelSlot, SoVitsSvc40ModelSlot, DDSPSVCModelSlot, DiffusionSVCModelSlot]
@@ -52,7 +52,7 @@ class DiffusionSVC(VoiceChangerModel):
         print("[Voice Changer] [DiffusionSVC] Initializing... done")
 
     def update_settings(self, key: str, val: int | float | str):
-        print("[Voice Changer][RVC]: update_settings", key, val)
+        print("[Voice Changer][DiffusionSVC]: update_settings", key, val)
         if key in self.settings.intData:
             setattr(self.settings, key, int(val))
             if key == "gpu":
@@ -86,19 +86,18 @@ class DiffusionSVC(VoiceChangerModel):
         crossfadeSize: int,
         solaSearchFrame: int = 0,
     ):
-        newData = newData.astype(np.float32) / 32768.0  # Input arrives at the RVC model's sampling rate. (extraDataLength, crossfade, etc. are processed at the same SR) (★1)
+        newData = newData.astype(np.float32) / 32768.0  # Input arrives at the DiffusionSVC model's sampling rate. (extraDataLength, crossfade, etc. are processed at the same SR) (★1)
 
-        new_feature_length = newData.shape[0] * 100 // self.slotInfo.samplingRate
+        new_feature_length = newData.shape[0] * 100 // self.slotInfo.samplingRate  # 100 comes from hubert's hop size (16000 / 160)
         if self.audio_buffer is not None:
             # Concatenate with past data
             self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)
-            if self.slotInfo.f0:
-                self.pitchf_buffer = np.concatenate([self.pitchf_buffer, np.zeros(new_feature_length)], 0)
+            self.pitchf_buffer = np.concatenate([self.pitchf_buffer, np.zeros(new_feature_length)], 0)
+            print("^^^self.feature_buffer.shape, self.slotInfo.embChannels", self.feature_buffer.shape, self.slotInfo.embChannels)
             self.feature_buffer = np.concatenate([self.feature_buffer, np.zeros([new_feature_length, self.slotInfo.embChannels])], 0)
         else:
             self.audio_buffer = newData
-            if self.slotInfo.f0:
-                self.pitchf_buffer = np.zeros(new_feature_length)
+            self.pitchf_buffer = np.zeros(new_feature_length)
             self.feature_buffer = np.zeros([new_feature_length, self.slotInfo.embChannels])
 
         convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
@@ -110,15 +109,13 @@ class DiffusionSVC(VoiceChangerModel):
         # Pad with zeros when the buffer has not filled up yet
         if self.audio_buffer.shape[0] < convertSize:
             self.audio_buffer = np.concatenate([np.zeros([convertSize]), self.audio_buffer])
-            if self.slotInfo.f0:
-                self.pitchf_buffer = np.concatenate([np.zeros([convertSize * 100 // self.slotInfo.samplingRate]), self.pitchf_buffer])
+            self.pitchf_buffer = np.concatenate([np.zeros([convertSize * 100 // self.slotInfo.samplingRate]), self.pitchf_buffer])
             self.feature_buffer = np.concatenate([np.zeros([convertSize * 100 // self.slotInfo.samplingRate, self.slotInfo.embChannels]), self.feature_buffer])
 
         convertOffset = -1 * convertSize
         featureOffset = -convertSize * 100 // self.slotInfo.samplingRate
         self.audio_buffer = self.audio_buffer[convertOffset:]  # Extract only the part to be converted
-        if self.slotInfo.f0:
-            self.pitchf_buffer = self.pitchf_buffer[featureOffset:]
+        self.pitchf_buffer = self.pitchf_buffer[featureOffset:]
         self.feature_buffer = self.feature_buffer[featureOffset:]
 
         # Cut out only the output part and check its volume. (TODO: make the muting gradual)
@@ -145,18 +142,18 @@ class DiffusionSVC(VoiceChangerModel):
         if self.pipeline is not None:
             device = self.pipeline.device
         else:
-            device = torch.device("cpu")
+            device = torch.device("cpu")  # TODO: when no pipeline exists, it might be fine to just return zeros (needs confirmation).
         audio = torch.from_numpy(audio).to(device=device, dtype=torch.float32)
         audio = torchaudio.functional.resample(audio, self.slotInfo.samplingRate, 16000, rolloff=0.99)
-        repeat = 1 if self.settings.rvcQuality else 0
+        repeat = 0
         sid = self.settings.dstId
         f0_up_key = self.settings.tran
-        index_rate = self.settings.indexRatio
+        index_rate = 0
-        protect = self.settings.protect
+        protect = 0
 
-        if_f0 = 1 if self.slotInfo.f0 else 0
+        if_f0 = 1
-        embOutputLayer = self.slotInfo.embOutputLayer
+        embOutputLayer = 12
-        useFinalProj = self.slotInfo.useFinalProj
+        useFinalProj = False
 
         try:
             audio_out, self.pitchf_buffer, self.feature_buffer = self.pipeline.exec(
@@ -167,14 +164,17 @@ class DiffusionSVC(VoiceChangerModel):
                 f0_up_key,
                 index_rate,
                 if_f0,
-                self.settings.extraConvertSize / self.slotInfo.samplingRate if self.settings.silenceFront else 0.,  # Duration in seconds of extraDataSize. Processed at the RVC model's sampling rate (★1).
+                self.settings.extraConvertSize / self.slotInfo.samplingRate if self.settings.silenceFront else 0.,  # Duration in seconds of extraConvertSize (already resampled to the model's sampling rate). Processed at the model's sampling rate (★1).
                 embOutputLayer,
                 useFinalProj,
                 repeat,
                 protect,
                 outSize
             )
-            result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
+            # result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
+            result = audio_out.detach().cpu().numpy()
 
+            print("RESULT", result)
+
             return result
         except DeviceCannotSupportHalfPrecisionException as e:  # NOQA
@@ -21,6 +21,7 @@ class DiffusionSVCModelSlotGenerator(ModelSlotGenerator):
         slotInfo.isONNX = slotInfo.modelFile.endswith(".onnx")
         slotInfo.name = os.path.splitext(os.path.basename(slotInfo.modelFile))[0]
         slotInfo.iconFile = "/assets/icons/noimage.png"
+        slotInfo.embChannels = 768
 
         # if slotInfo.isONNX:
         #     slotInfo = cls._setInfoByONNX(slotInfo)
@@ -1,35 +1,134 @@
+import numpy as np
 import torch
+from voice_changer.DiffusionSVC.inferencer.Inferencer import Inferencer
+from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.naive.naive import Unit2MelNaive
+from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.unit2mel import Unit2Mel, load_model_vocoder_from_combo
+from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.vocoder import Vocoder
+
 from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
 
 
-class RVCInferencer(Inferencer):
+class DiffusionSVCInferencer(Inferencer):
+    def __init__(self):
+        self.diff_model: Unit2Mel | None = None
+        self.naive_model: Unit2MelNaive | None = None
+        self.vocoder: Vocoder | None = None
+
     def loadModel(self, file: str, gpu: int):
         self.setProps("DiffusionSVCCombo", file, True, gpu)
 
-        dev = DeviceManager.get_instance().getDevice(gpu)
-        isHalf = DeviceManager.get_instance().halfPrecisionAvailable(gpu)
+        self.dev = DeviceManager.get_instance().getDevice(gpu)
+        # isHalf = DeviceManager.get_instance().halfPrecisionAvailable(gpu)
 
-        cpt = torch.load(file, map_location="cpu")
-        model = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=isHalf)
+        diff_model, diff_args, naive_model, naive_args, vocoder = load_model_vocoder_from_combo(file, device=self.dev)
+        self.diff_model = diff_model
+        self.naive_model = naive_model
+        self.vocoder = vocoder
+        self.diff_args = diff_args
+        print("-----------------> diff_args", diff_args)
+        print("-----------------> naive_args", naive_args)
 
-        model.eval()
-        model.load_state_dict(cpt["weight"], strict=False)
-
-        model = model.to(dev)
-        if isHalf:
-            model = model.half()
-
-        self.model = model
+        # cpt = torch.load(file, map_location="cpu")
+        # model = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=isHalf)
+        # model.eval()
+        # model.load_state_dict(cpt["weight"], strict=False)
+        # model = model.to(dev)
+        # if isHalf:
+        #     model = model.half()
+        # self.model = model
         return self
 
+    def getConfig(self) -> tuple[int, int]:
+        model_sampling_rate = int(self.diff_args.data.sampling_rate)
+        model_block_size = int(self.diff_args.data.block_size)
+        return model_block_size, model_sampling_rate
+
+    @torch.no_grad()  # Most basic inference code: normalizes the inputs to tensors and deals only with mel
+    def __call__(self, units, f0, volume, spk_id=1, spk_mix_dict=None, aug_shift=0,
+                 gt_spec=None, infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
+                 spk_emb=None):
+
+        if self.diff_args.model.k_step_max is not None:
+            if k_step is None:
+                raise ValueError("k_step must not None when Shallow Diffusion Model inferring")
+            if k_step > int(self.diff_args.model.k_step_max):
+                raise ValueError("k_step must <= k_step_max of Shallow Diffusion Model")
+            if gt_spec is None:
+                raise ValueError("gt_spec must not None when Shallow Diffusion Model inferring, gt_spec can from "
+                                 "input mel or output of naive model")
+            print(f' [INFO] k_step_max is {self.diff_args.model.k_step_max}.')
+
+        aug_shift = torch.from_numpy(np.array([[float(aug_shift)]])).float().to(self.dev)
+
+        # spk_id
+        spk_emb_dict = None
+        if self.diff_args.model.use_speaker_encoder:  # with speaker encoder
+            spk_mix_dict, spk_emb = self.pre_spk_emb(spk_id, spk_mix_dict, len(units), spk_emb)
+        # without speaker encoder
+        else:
+            spk_id = torch.LongTensor(np.array([[int(spk_id)]])).to(self.dev)
+
+        if k_step is not None:
+            print(f' [INFO] get k_step, do shallow diffusion {k_step} steps')
+        else:
+            print(f' [INFO] Do full 1000 steps depth diffusion {k_step}')
+        print(f" [INFO] method:{method}; infer_speedup:{infer_speedup}")
+        return self.diff_model(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict, aug_shift=aug_shift, gt_spec=gt_spec, infer=True, infer_speedup=infer_speedup, method=method, k_step=k_step, use_tqdm=use_tqdm, spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)
+
+    @torch.no_grad()
+    def naive_model_call(self, units, f0, volume, spk_id=1, spk_mix_dict=None,
+                         aug_shift=0, spk_emb=None):
+        # spk_id
+        spk_emb_dict = None
+        if self.diff_args.model.use_speaker_encoder:  # with speaker encoder
+            spk_mix_dict, spk_emb = self.pre_spk_emb(spk_id, spk_mix_dict, len(units), spk_emb)
+        # without speaker encoder
+        else:
+            spk_id = torch.LongTensor(np.array([[int(spk_id)]])).to(self.dev)
+        aug_shift = torch.from_numpy(np.array([[float(aug_shift)]])).float().to(self.dev)
+        print("====> unit, f0, vol", units.shape, f0.shape, volume.shape)
+        print("====> *unit, f0, vol", units)
+        print("====> unit, *f0, vol", f0)
+        print("====> unit, f0, *vol", volume)
+        out_spec = self.naive_model(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict,
+                                    aug_shift=aug_shift, infer=True,
+                                    spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)
+        return out_spec
+
+    @torch.no_grad()
+    def mel2wav(self, mel, f0, start_frame=0):
+        if start_frame == 0:
+            return self.vocoder.infer(mel, f0)
+        else:  # for realtime speedup
+            mel = mel[:, start_frame:, :]
+            f0 = f0[:, start_frame:, :]
+            out_wav = self.vocoder.infer(mel, f0)
+            return torch.nn.functional.pad(out_wav, (start_frame * self.vocoder.vocoder_hop_size, 0))
+
+    @torch.no_grad()
     def infer(
         self,
         feats: torch.Tensor,
-        pitch_length: torch.Tensor,
         pitch: torch.Tensor,
-        pitchf: torch.Tensor,
+        volume: torch.Tensor,
+        mask: torch.Tensor,
         sid: torch.Tensor,
-        convert_length: int | None,
+        infer_speedup: int,
+        k_step: int,
+        silence_front: float,
     ) -> torch.Tensor:
-        return self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=convert_length)
+        print("---------------------------------shape", feats.shape, pitch.shape, volume.shape)
+        gt_spec = self.naive_model_call(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, spk_emb=None)
+        print("======================>>>>>gt_spec", gt_spec)
+        out_mel = self.__call__(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, gt_spec=gt_spec, infer_speedup=infer_speedup, method='dpm-solver', k_step=k_step, use_tqdm=False, spk_emb=None)
+        print("======================>>>>>out_mel", out_mel)
+        start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
+        out_wav = self.mel2wav(out_mel, pitch, start_frame=start_frame)
+
+        print("======================>>>>>out_wav.shape, mask.shape", out_wav.shape, mask.shape)
+        out_wav *= mask
+        print("out_wav:::::::::::", out_wav)
+        return out_wav.squeeze()
server/voice_changer/DiffusionSVC/inferencer/Inferencer.py (new file, 50 lines)
@@ -0,0 +1,50 @@
+from typing import Any, Protocol
+import torch
+import onnxruntime
+
+from const import DiffusionSVCInferenceType
+
+
+class Inferencer(Protocol):
+    inferencerType: DiffusionSVCInferenceType = "combo"
+    file: str
+    isHalf: bool = True
+    gpu: int = 0
+
+    model: onnxruntime.InferenceSession | Any | None = None
+
+    def loadModel(self, file: str, gpu: int):
+        ...
+
+    def getConfig(self) -> tuple[int, int]:
+        ...
+
+    def infer(
+        self,
+        feats: torch.Tensor,
+        pitch_length: torch.Tensor,
+        pitch: torch.Tensor | None,
+        pitchf: torch.Tensor | None,
+        sid: torch.Tensor,
+    ) -> torch.Tensor:
+        ...
+
+    def setProps(
+        self,
+        inferencerType: DiffusionSVCInferenceType,
+        file: str,
+        isHalf: bool,
+        gpu: int,
+    ):
+        self.inferencerType = inferencerType
+        self.file = file
+        self.isHalf = isHalf
+        self.gpu = gpu
+
+    def getInferencerInfo(self):
+        return {
+            "inferencerType": self.inferencerType,
+            "file": self.file,
+            "isHalf": self.isHalf,
+            "gpu": self.gpu,
+        }
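Note (illustration only, not part of the commit): consumers of this Protocol unpack getConfig() as (block_size, sampling_rate) — the ordering DiffusionSVCInferencer returns — and derive a 16 kHz hop size from it, as Pipeline.__init__ does later in this diff. The class below is a hypothetical stand-in used only to show that ordering:

class DummyInferencer:
    # Hypothetical stand-in implementing only the getConfig() part of the protocol.
    def getConfig(self) -> tuple[int, int]:
        return 512, 44100  # (block_size, sampling_rate), same order as DiffusionSVCInferencer


def derive_hop_size(inferencer, audio_sampling_rate: int = 16000) -> float:
    # Mirrors the Pipeline.__init__ computation: hop size expressed in 16 kHz samples.
    block_size, model_sampling_rate = inferencer.getConfig()
    return block_size * audio_sampling_rate / model_sampling_rate


print(derive_hop_size(DummyInferencer()))  # 512 * 16000 / 44100 ≈ 185.8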
@@ -0,0 +1,29 @@
+from const import DiffusionSVCInferenceType
+from voice_changer.DiffusionSVC.inferencer.DiffusionSVCInferencer import DiffusionSVCInferencer
+from voice_changer.RVC.inferencer.Inferencer import Inferencer
+
+
+class InferencerManager:
+    currentInferencer: Inferencer | None = None
+
+    @classmethod
+    def getInferencer(
+        cls,
+        inferencerType: DiffusionSVCInferenceType,
+        file: str,
+        gpu: int,
+    ) -> Inferencer:
+        cls.currentInferencer = cls.loadInferencer(inferencerType, file, gpu)
+        return cls.currentInferencer
+
+    @classmethod
+    def loadInferencer(
+        cls,
+        inferencerType: DiffusionSVCInferenceType,
+        file: str,
+        gpu: int,
+    ) -> Inferencer:
+        if inferencerType == "combo":
+            return DiffusionSVCInferencer().loadModel(file, gpu)
+        else:
+            raise RuntimeError("[Voice Changer] Inferencer not found", inferencerType)
@@ -38,21 +38,16 @@ class DiffusionSVC:
         self.use_combo_model = False
 
     def load_model(self, model_path, f0_model=None, f0_min=None, f0_max=None):
-        if ('1234' + model_path)[-4:] == '.ptc':
-            self.use_combo_model = True
-            self.model_path = model_path
-            self.naive_model_path = model_path
-            diff_model, diff_args, naive_model, naive_args, vocoder = load_model_vocoder_from_combo(model_path,
-                                                                                                    device=self.device)
-            self.model = diff_model
-            self.args = diff_args
-            self.naive_model = naive_model
-            self.naive_model_args = naive_args
-            self.vocoder = vocoder
-
-        else:
-            self.model_path = model_path
-            self.model, self.vocoder, self.args = load_model_vocoder(model_path, device=self.device)
+        self.use_combo_model = True
+        self.model_path = model_path
+        self.naive_model_path = model_path
+        diff_model, diff_args, naive_model, naive_args, vocoder = load_model_vocoder_from_combo(model_path,
+                                                                                                device=self.device)
+        self.model = diff_model
+        self.args = diff_args
+        self.naive_model = naive_model
+        self.naive_model_args = naive_args
+        self.vocoder = vocoder
 
         self.units_encoder = Units_Encoder(
             self.args.data.encoder,
@@ -85,33 +80,6 @@ class DiffusionSVC:
 
         self.units_indexer = UnitsIndexer(os.path.split(model_path)[0])
 
-    def flush(self, model_path=None, f0_model=None, f0_min=None, f0_max=None, naive_model_path=None):
-        assert (model_path is not None) or (naive_model_path is not None)
-        # flush model if changed
-        if ((self.model_path != model_path) or (self.f0_model != f0_model)
-                or (self.f0_min != f0_min) or (self.f0_max != f0_max)):
-            self.load_model(model_path, f0_model=f0_model, f0_min=f0_min, f0_max=f0_max)
-        if (self.naive_model_path != naive_model_path) and (naive_model_path is not None):
-            self.load_naive_model(naive_model_path)
-        # check args if use naive
-        if self.naive_model is not None:
-            if self.naive_model_args.data.encoder != self.args.data.encoder:
-                raise ValueError("encoder of Naive Model and Diffusion Model are different")
-            if self.naive_model_args.model.n_spk != self.args.model.n_spk:
-                raise ValueError("n_spk of Naive Model and Diffusion Model are different")
-            if bool(self.naive_model_args.model.use_speaker_encoder) != bool(self.args.model.use_speaker_encoder):
-                raise ValueError("use_speaker_encoder of Naive Model and Diffusion Model are different")
-            if self.naive_model_args.vocoder.type != self.args.vocoder.type:
-                raise ValueError("vocoder of Naive Model and Diffusion Model are different")
-            if self.naive_model_args.data.block_size != self.args.data.block_size:
-                raise ValueError("block_size of Naive Model and Diffusion Model are different")
-            if self.naive_model_args.data.sampling_rate != self.args.data.sampling_rate:
-                raise ValueError("sampling_rate of Naive Model and Diffusion Model are different")
-
-    def flush_f0_extractor(self, f0_model, f0_min=None, f0_max=None):
-        if (f0_model != self.f0_model) and (f0_model is not None):
-            self.load_f0_extractor(f0_model)
-
     def load_f0_extractor(self, f0_model, f0_min=None, f0_max=None):
         self.f0_model = f0_model if (f0_model is not None) else self.args.data.f0_extractor
         self.f0_min = f0_min if (f0_min is not None) else self.args.data.f0_min
@@ -127,12 +95,6 @@ class DiffusionSVC:
             model_sampling_rate=self.args.data.sampling_rate
         )
 
-    def load_naive_model(self, naive_model_path):
-        self.naive_model_path = naive_model_path
-        model, _, args = load_model_vocoder(naive_model_path, device=self.device, loaded_vocoder=self.vocoder)
-        self.naive_model = model
-        self.naive_model_args = args
-        print(f" [INFO] Load naive model from {naive_model_path}")
 
     @torch.no_grad()
     def naive_model_call(self, units, f0, volume, spk_id=1, spk_mix_dict=None,
@@ -265,144 +227,6 @@ class DiffusionSVC:
                                     gt_spec=gt_spec, infer=True, infer_speedup=infer_speedup, method=method, k_step=k_step,
                                     use_tqdm=use_tqdm, spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)
 
-    @torch.no_grad()  # Compared with __call__, adds the vocoder step and outputs a waveform
-    def infer(self, units, f0, volume, gt_spec=None, spk_id=1, spk_mix_dict=None, aug_shift=0,
-              infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
-              spk_emb=None):
-        if k_step is not None:
-            if self.naive_model is not None:
-                gt_spec = self.naive_model_call(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict,
-                                                aug_shift=aug_shift, spk_emb=spk_emb)
-                print(f" [INFO] get mel from naive model out.")
-                assert gt_spec is not None
-            if self.naive_model is None:
-                print(f" [INFO] get mel from input wav.")
-                if input(" [WARN] You are attempting shallow diffusion "
-                         "on the mel of the input source,"
-                         " Please enter 'gt_mel' to continue") != 'gt_mel':
-                    raise ValueError("Please understand what you're doing")
-            k_step = int(k_step)
-            gt_spec = gt_spec
-        else:
-            gt_spec = None
-
-        out_mel = self.__call__(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict, aug_shift=aug_shift,
-                                gt_spec=gt_spec, infer_speedup=infer_speedup, method=method, k_step=k_step,
-                                use_tqdm=use_tqdm, spk_emb=spk_emb)
-        return self.mel2wav(out_mel, f0)
-
-    @torch.no_grad()  # Inference optimized for realtime shallow diffusion; the pad can be cut off to save compute
-    def infer_for_realtime(self, units, f0, volume, audio_t=None, spk_id=1, spk_mix_dict=None, aug_shift=0,
-                           infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
-                           spk_emb=None, silence_front=0, diff_jump_silence_front=False):
-
-        start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
-
-        if diff_jump_silence_front:
-            if audio_t is not None:
-                audio_t = audio_t[:, start_frame * self.vocoder.vocoder_hop_size:]
-            f0 = f0[:, start_frame:, :]
-            units = units[:, start_frame:, :]
-            volume = volume[:, start_frame:, :]
-
-        if k_step is not None:
-            assert audio_t is not None
-            k_step = int(k_step)
-            gt_spec = self.vocoder.extract(audio_t, self.args.data.sampling_rate)
-            # If frames are missing, re-enable this line: gt_spec = torch.cat((gt_spec, gt_spec[:, -1:, :]), 1)
-        else:
-            gt_spec = None
-
-        out_mel = self.__call__(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict, aug_shift=aug_shift,
-                                gt_spec=gt_spec, infer_speedup=infer_speedup, method=method, k_step=k_step,
-                                use_tqdm=use_tqdm, spk_emb=spk_emb)
-
-        if diff_jump_silence_front:
-            out_wav = self.mel2wav(out_mel, f0)
-        else:
-            out_wav = self.mel2wav(out_mel, f0, start_frame=start_frame)
-        return out_wav
-
-    @torch.no_grad()  # Inference from audio without slicing
-    def infer_from_audio(self, audio, sr=44100, key=0, spk_id=1, spk_mix_dict=None, aug_shift=0,
-                         infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
-                         spk_emb=None, threhold=-60, index_ratio=0):
-        units = self.encode_units(audio, sr)
-        if index_ratio > 0:
-            units = self.units_indexer(units_t=units, spk_id=spk_id, ratio=index_ratio)
-        f0 = self.extract_f0(audio, key=key, sr=sr)
-        volume, mask = self.extract_volume_and_mask(audio, sr, threhold=float(threhold))
-        if k_step is not None:
-            assert 0 < int(k_step) <= 1000
-            k_step = int(k_step)
-            audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
-            gt_spec = self.vocoder.extract(audio_t, sr)
-            gt_spec = torch.cat((gt_spec, gt_spec[:, -1:, :]), 1)
-        else:
-            gt_spec = None
-        output = self.infer(units, f0, volume, gt_spec=gt_spec, spk_id=spk_id, spk_mix_dict=spk_mix_dict,
-                            aug_shift=aug_shift, infer_speedup=infer_speedup, method=method, k_step=k_step,
-                            use_tqdm=use_tqdm, spk_emb=spk_emb)
-        output *= mask
-        return output.squeeze().cpu().numpy(), self.args.data.sampling_rate
-
-    @torch.no_grad()  # Inference from audio with slicing
-    def infer_from_long_audio(self, audio, sr=44100, key=0, spk_id=1, spk_mix_dict=None, aug_shift=0,
-                              infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
-                              spk_emb=None,
-                              threhold=-60, threhold_for_split=-40, min_len=5000, index_ratio=0):
-
-        hop_size = self.args.data.block_size * sr / self.args.data.sampling_rate
-        segments = split(audio, sr, hop_size, db_thresh=threhold_for_split, min_len=min_len)
-
-        print(f' [INFO] Extract f0 volume and mask: Use {self.f0_model}, start...')
-        _f0_start_time = time.time()
-        f0 = self.extract_f0(audio, key=key, sr=sr)
-        volume, mask = self.extract_volume_and_mask(audio, sr, threhold=float(threhold))
-        _f0_end_time = time.time()
-        _f0_used_time = _f0_end_time - _f0_start_time
-        print(f' [INFO] Extract f0 volume and mask: Done. Use time:{_f0_used_time}')
-
-        if k_step is not None:
-            assert 0 < int(k_step) <= 1000
-            k_step = int(k_step)
-            audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
-            gt_spec = self.vocoder.extract(audio_t, sr)
-            gt_spec = torch.cat((gt_spec, gt_spec[:, -1:, :]), 1)
-        else:
-            gt_spec = None
-
-        result = np.zeros(0)
-        current_length = 0
-        for segment in tqdm(segments):
-            start_frame = segment[0]
-            seg_input = torch.from_numpy(segment[1]).float().unsqueeze(0).to(self.device)
-            seg_units = self.units_encoder.encode(seg_input, sr, hop_size)
-            if index_ratio > 0:
-                seg_units = self.units_indexer(units_t=seg_units, spk_id=spk_id, ratio=index_ratio)
-            seg_f0 = f0[:, start_frame: start_frame + seg_units.size(1), :]
-            seg_volume = volume[:, start_frame: start_frame + seg_units.size(1), :]
-            if gt_spec is not None:
-                seg_gt_spec = gt_spec[:, start_frame: start_frame + seg_units.size(1), :]
-            else:
-                seg_gt_spec = None
-            seg_output = self.infer(seg_units, seg_f0, seg_volume, gt_spec=seg_gt_spec, spk_id=spk_id,
-                                    spk_mix_dict=spk_mix_dict,
-                                    aug_shift=aug_shift, infer_speedup=infer_speedup, method=method, k_step=k_step,
-                                    use_tqdm=use_tqdm, spk_emb=spk_emb)
-            _left = start_frame * self.args.data.block_size
-            _right = (start_frame + seg_units.size(1)) * self.args.data.block_size
-            seg_output *= mask[:, _left:_right]
-            seg_output = seg_output.squeeze().cpu().numpy()
-            silent_length = round(start_frame * self.args.data.block_size) - current_length
-            if silent_length >= 0:
-                result = np.append(result, np.zeros(silent_length))
-                result = np.append(result, seg_output)
-            else:
-                result = cross_fade(result, seg_output, current_length + silent_length)
-            current_length = current_length + silent_length + len(seg_output)
-
-        return result, self.args.data.sampling_rate
-
     @torch.no_grad()  # Inference optimized for realtime; the pad can be cut off to save compute
     def infer_from_audio_for_realtime(self, audio, sr, key, spk_id=1, spk_mix_dict=None, aug_shift=0,
@@ -252,7 +252,7 @@ class GaussianDiffusion(nn.Module):
 
         if method is not None and infer_speedup > 1:
             if method == 'dpm-solver':
-                from .dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver
+                from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver
                 # 1. Define the noise schedule.
                 noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:t])
 
(File diff suppressed because it is too large.)
@@ -56,6 +56,7 @@ class Unit2MelNaive(nn.Module):
                 residual_dropout=0.1,
                 attention_dropout=0.1)
         else:
+            print("[[[[[PCmer]]]]]")
             self.decoder = PCmer(
                 num_layers=n_layers,
                 num_heads=8,
@@ -81,8 +82,9 @@ class Unit2MelNaive(nn.Module):
         return:
             dict of B x n_frames x feat
         '''
-        x = self.stack(units.transpose(1,2)).transpose(1,2)
-        x = x + self.f0_embed((1+ f0 / 700).log()) + self.volume_embed(volume)
+        x = self.stack(units.transpose(1, 2)).transpose(1, 2)
+        x = x + self.f0_embed((1 + f0 / 700).log()) + self.volume_embed(volume)
+        print("-----------------x1>", x)
         if self.use_speaker_encoder:
             if spk_mix_dict is not None:
                 assert spk_emb_dict is not None
@@ -104,9 +106,13 @@ class Unit2MelNaive(nn.Module):
         if self.aug_shift_embed is not None and aug_shift is not None:
             x = x + self.aug_shift_embed(aug_shift / 5)
 
+        print("-----------------x2>", x)
         x = self.decoder(x)
+        print("-----------------x3>", x)
         x = self.norm(x)
+        print("-----------------x4>", x)
         x = self.dense_out(x)
+        print("-----------------x5>", x)
         if not infer:
             x = F.mse_loss(x, gt_spec)
             if self.l2reg_loss > 0:
@@ -94,9 +94,12 @@ class PCmer(nn.Module):
     def forward(self, phone, mask=None):
 
         # apply all layers to the input
+        print("[[[[[PCmer]]]]1]", phone, mask)
         for (i, layer) in enumerate(self._layers):
             phone = layer(phone, mask)
+            # print("[[[[[PCmer]]]] 2 ]", phone)
         # provide the final sequence
+        print("[[[[[PCmer]]]]3]", phone)
         return phone
 
 
@@ -136,9 +139,13 @@ class _EncoderLayer(nn.Module):
     def forward(self, phone, mask=None):
 
         # compute attention sub-layer
+        print("Phone:::::1:", phone)
+        print("Phone:::::16:", self.norm(phone))
         phone = phone + (self.attn(self.norm(phone), mask=mask))
+        print("Phone:::::2:", phone)
 
         phone = phone + (self.conformer(phone))
+        print("Phone:::::3:", phone)
 
         return phone
 
@@ -3,10 +3,10 @@ import yaml
 import torch
 import torch.nn as nn
 import numpy as np
-from .diffusion import GaussianDiffusion
-from .wavenet import WaveNet
-from .vocoder import Vocoder
-from .naive.naive import Unit2MelNaive
+from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.diffusion import GaussianDiffusion
+from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.wavenet import WaveNet
+from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.vocoder import Vocoder
+from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.naive.naive import Unit2MelNaive
 
 
 class DotDict(dict):
@@ -1,6 +1,6 @@
 import torch
-from nsf_hifigan.nvSTFT import STFT
-from nsf_hifigan.models import load_model, load_config
+from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.nsf_hifigan.nvSTFT import STFT
+from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.nsf_hifigan.models import load_model, load_config
 from torchaudio.transforms import Resample
 
 
@@ -1,7 +1,7 @@
 import numpy as np
 import torch
 import torch.nn.functional as F
-import torch.nn as nn
 import pyworld as pw
 import parselmouth
 import torchcrepe
@@ -789,15 +789,6 @@ def median_pool_1d(x, kernel_size):
     x, _ = torch.sort(x, dim=-1)
     return x[:, :, (kernel_size - 1) // 2]
 
 
-def upsample(signal, factor):
-    signal = signal.permute(0, 2, 1)
-    signal = nn.functional.interpolate(torch.cat((signal, signal[:, :, -1:]), 2), size=signal.shape[-1] * factor + 1,
-                                       mode='linear', align_corners=True)
-    signal = signal[:, :, :-1]
-    return signal.permute(0, 2, 1)
-
-
 def cross_fade(a: np.ndarray, b: np.ndarray, idx: int):
     result = np.zeros(idx + b.shape[0])
     fade_len = a.shape[0] - idx
@@ -1,4 +1,3 @@
-import numpy as np
 from typing import Any
 import math
 import torch
@@ -10,13 +9,14 @@ from Exceptions import (
     HalfPrecisionChangingException,
     NotEnoughDataExtimateF0,
 )
+from voice_changer.DiffusionSVC.inferencer.Inferencer import Inferencer
 
 from voice_changer.RVC.embedder.Embedder import Embedder
-from voice_changer.RVC.inferencer.Inferencer import Inferencer
 from voice_changer.RVC.inferencer.OnnxRVCInferencer import OnnxRVCInferencer
 from voice_changer.RVC.inferencer.OnnxRVCInferencerNono import OnnxRVCInferencerNono
 
 from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
+from voice_changer.common.VolumeExtractor import VolumeExtractor
 
 
 class Pipeline(object):
@@ -37,29 +37,30 @@ class Pipeline(object):
         embedder: Embedder,
         inferencer: Inferencer,
         pitchExtractor: PitchExtractor,
-        index: Any | None,
-        # feature: Any | None,
+        # index: Any | None,
         targetSR,
         device,
         isHalf,
     ):
+        model_block_size, model_sampling_rate = inferencer.getConfig()
+        self.hop_size = model_block_size * 16000 / model_sampling_rate  # 16000 is the audio sample rate; the signal is already 16K at this point.
+
+        self.volumeExtractor = VolumeExtractor(self.hop_size, model_block_size, model_sampling_rate, audio_sampling_rate=16000)
         self.embedder = embedder
+
         self.inferencer = inferencer
         self.pitchExtractor = pitchExtractor
         print("GENERATE INFERENCER", self.inferencer)
         print("GENERATE EMBEDDER", self.embedder)
         print("GENERATE PITCH EXTRACTOR", self.pitchExtractor)
 
-        self.index = index
-        self.big_npy = index.reconstruct_n(0, index.ntotal) if index is not None else None
-        # self.feature = feature
-
         self.targetSR = targetSR
         self.device = device
-        self.isHalf = isHalf
+        # self.isHalf = isHalf
+        self.isHalf = False
 
-        self.sr = 16000
-        self.window = 160
+        # self.sr = 16000
+        # self.window = 160
 
     def getPipelineInfo(self):
         inferencerInfo = self.inferencer.getInferencerInfo() if self.inferencer else {}
@@ -70,6 +71,13 @@ class Pipeline(object):
     def setPitchExtractor(self, pitchExtractor: PitchExtractor):
         self.pitchExtractor = pitchExtractor
 
+    @torch.no_grad()
+    def extract_volume_and_mask(self, audio, threhold):
+        volume = self.volumeExtractor.extract(audio)
+        mask = self.volumeExtractor.get_mask_from_volume(volume, threhold=threhold, device=self.device)
+        volume = torch.from_numpy(volume).float().to(self.device).unsqueeze(-1).unsqueeze(0)
+        return volume, mask
+
     def exec(
         self,
         sid,
@@ -87,56 +95,45 @@ class Pipeline(object):
         out_size=None,
     ):
         # Audio arrives at a 16000 Hz sampling rate; from here on everything is processed at 16000.
-        search_index = self.index is not None and self.big_npy is not None and index_rate != 0
-        # self.t_pad = self.sr * repeat  # 1 second
-        # self.t_pad_tgt = self.targetSR * repeat  # 1 second; trimmed on output (output is at the model's sampling rate)
         audio = audio.unsqueeze(0)
-        quality_padding_sec = (repeat * (audio.shape[1] - 1)) / self.sr  # the reflect-padding size must be smaller than the original size.
-
-        self.t_pad = round(self.sr * quality_padding_sec)  # add audio before and after
-        self.t_pad_tgt = round(self.targetSR * quality_padding_sec)  # add audio before and after; trimmed on output (output is at the model's sampling rate)
+        self.t_pad = 0
         audio_pad = F.pad(audio, (self.t_pad, self.t_pad), mode="reflect").squeeze(0)
-        p_len = audio_pad.shape[0] // self.window
         sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
 
-        # Turn silence_front off when RVC Quality is on.
-        silence_front = silence_front if repeat == 0 else 0
-        pitchf = pitchf if repeat == 0 else np.zeros(p_len)
-        out_size = out_size if repeat == 0 else None
+        n_frames = int(audio_pad.size(-1) // self.hop_size + 1)
+        print("--------------------> n_frames:", n_frames)
 
+        volume, mask = self.extract_volume_and_mask(audio, threhold=-60.0)
+        print("--------------------> volume:", volume.shape)
         # Pitch detection
         try:
-            if if_f0 == 1:
-                pitch, pitchf = self.pitchExtractor.extract(
-                    audio_pad,
-                    pitchf,
-                    f0_up_key,
-                    self.sr,
-                    self.window,
-                    silence_front=silence_front,
-                )
-                # pitch = pitch[:p_len]
-                # pitchf = pitchf[:p_len]
-                pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
-                pitchf = torch.tensor(pitchf, device=self.device, dtype=torch.float).unsqueeze(0)
-            else:
-                pitch = None
-                pitchf = None
-        except IndexError:
+            pitch, pitchf = self.pitchExtractor.extract(
+                audio_pad,
+                pitchf,
+                f0_up_key,
+                16000,  # sampling rate of the audio (already 16000)
+                # int(self.hop_size),  # processing window size (512 at 44100)
+                int(self.hop_size),  # processing window size (512 at 44100)
+                silence_front=silence_front,
+            )
+            print("--------------------> pitch11111111111111111111111111111111:", pitch[1:], pitch.shape)
+            pitch = torch.tensor(pitch[-n_frames:], device=self.device).unsqueeze(0).long()  # the buffer assumes a 160-sample window, so trim here.
+            pitchf = torch.tensor(pitchf[-n_frames:], device=self.device, dtype=torch.float).unsqueeze(0)  # the buffer assumes a 160-sample window, so trim here.
+        except IndexError as e:
+            print(e)
             # print(e)
             raise NotEnoughDataExtimateF0()
 
+        print("--------------------> pitch:", pitch, pitch.shape)
+
         # Adjust tensor shapes
         feats = audio_pad
         if feats.dim() == 2:  # double channels
             feats = feats.mean(-1)
-        assert feats.dim() == 1, feats.dim()
         feats = feats.view(1, -1)
 
         # embedding
-        padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
         with autocast(enabled=self.isHalf):
             try:
                 feats = self.embedder.extractFeatures(feats, embOutputLayer, useFinalProj)
@@ -149,74 +146,46 @@ class Pipeline(object):
                     raise DeviceChangingException()
                 else:
                     raise e
-        if protect < 0.5 and search_index:
-            feats0 = feats.clone()
-
-        # Index - feature extraction
-        # if self.index is not None and self.feature is not None and index_rate != 0:
-        if search_index:
-            npy = feats[0].cpu().numpy()
-            # apply silent front for indexsearch
-            npyOffset = math.floor(silence_front * 16000) // 360
-            npy = npy[npyOffset:]
-
-            if self.isHalf is True:
-                npy = npy.astype("float32")
-
-            # TODO: make k adjustable
-            k = 1
-            if k == 1:
-                _, ix = self.index.search(npy, 1)
-                npy = self.big_npy[ix.squeeze()]
-            else:
-                score, ix = self.index.search(npy, k=8)
-                weight = np.square(1 / score)
-                weight /= weight.sum(axis=1, keepdims=True)
-                npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
-
-            # recover silient font
-            npy = np.concatenate([np.zeros([npyOffset, npy.shape[1]], dtype=np.float32), feature[:npyOffset:2].astype("float32"), npy])[-feats.shape[1]:]
-            feats = torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats
-        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
-        if protect < 0.5 and search_index:
-            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
-
-        # Adjust pitch size
-        p_len = audio_pad.shape[0] // self.window
-        if feats.shape[1] < p_len:
-            p_len = feats.shape[1]
-            if pitch is not None and pitchf is not None:
-                pitch = pitch[:, :p_len]
-                pitchf = pitchf[:, :p_len]
-
-        feats_len = feats.shape[1]
-        if pitch is not None and pitchf is not None:
-            pitch = pitch[:, -feats_len:]
-            pitchf = pitchf[:, -feats_len:]
-        p_len = torch.tensor([feats_len], device=self.device).long()
+        print("--------------------> feats1:", feats, feats.shape)
+
+        # feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
+        feats = F.interpolate(feats.permute(0, 2, 1), size=int(n_frames), mode='nearest').permute(0, 2, 1)
+
+        if protect < 0.5:
+            feats0 = feats.clone()
+        print("--------------------> feats2:", feats, feats.shape)
+
+        # # Adjust pitch size
+        # p_len = audio_pad.shape[0] // self.window
+        # feats_len = feats.shape[1]
+        # if feats.shape[1] < p_len:
+        #     p_len = feats_len
+        #     pitch = pitch[:, :feats_len]
+        #     pitchf = pitchf[:, :feats_len]
+
+        # pitch = pitch[:, -feats_len:]
+        # pitchf = pitchf[:, -feats_len:]
+        # p_len = torch.tensor([feats_len], device=self.device).long()
+
+        # print("----------plen::1:", p_len)
 
         # When pitch estimation fails (pitchf=0), blend in the pre-search features
         # There are questions about how pitchff is built, but it follows the upstream implementation, so it is kept as is.
         # https://github.com/w-okada/voice-changer/pull/276#issuecomment-1571336929
-        if protect < 0.5 and search_index:
+        if protect < 0.5:
             pitchff = pitchf.clone()
             pitchff[pitchf > 0] = 1
             pitchff[pitchf < 1] = protect
             pitchff = pitchff.unsqueeze(-1)
             feats = feats * pitchff + feats0 * (1 - pitchff)
             feats = feats.to(feats0.dtype)
-        p_len = torch.tensor([p_len], device=self.device).long()
+        # p_len = torch.tensor([p_len], device=self.device).long()
 
-        # apply silent front for inference
-        if type(self.inferencer) in [OnnxRVCInferencer, OnnxRVCInferencerNono]:
-            npyOffset = math.floor(silence_front * 16000) // 360
-            feats = feats[:, npyOffset * 2 :, :]  # NOQA
+        # # apply silent front for inference
+        # if type(self.inferencer) in [OnnxRVCInferencer, OnnxRVCInferencerNono]:
+        #     npyOffset = math.floor(silence_front * 16000) // 360  # 160x2 = 360
+        #     feats = feats[:, npyOffset * 2 :, :]  # NOQA
 
-        feats_len = feats.shape[1]
-        if pitch is not None and pitchf is not None:
-            pitch = pitch[:, -feats_len:]
-            pitchf = pitchf[:, -feats_len:]
-        p_len = torch.tensor([feats_len], device=self.device).long()
-
         # Run inference
         try:
@@ -224,7 +193,16 @@ class Pipeline(object):
             with autocast(enabled=self.isHalf):
                 audio1 = (
                     torch.clip(
-                        self.inferencer.infer(feats, p_len, pitch, pitchf, sid, out_size)[0][0, 0].to(dtype=torch.float32),
+                        self.inferencer.infer(
+                            feats,
+                            pitch.unsqueeze(-1),
+                            volume,
+                            mask,
+                            sid,
+                            infer_speedup=10,
+                            k_step=20,
+                            silence_front=silence_front
+                        ).to(dtype=torch.float32),
                         -1.0,
                         1.0,
                     )
@@ -243,16 +221,7 @@ class Pipeline(object):
             else:
                 pitchf_buffer = None
 
-            del p_len, padding_mask, pitch, pitchf, feats
+            del pitch, pitchf, feats, sid
             torch.cuda.empty_cache()
 
-            # The sampling rate coming out of infer is the model's sampling rate.
-            # (input to the pipeline is 16k, for hubert)
-            if self.t_pad_tgt != 0:
-                offset = self.t_pad_tgt
-                end = -1 * self.t_pad_tgt
-                audio1 = audio1[offset:end]
-
-            del sid
-            torch.cuda.empty_cache()
             return audio1, pitchf_buffer, feats_buffer
@@ -1,51 +1,48 @@
-import os
 import traceback
-import faiss
-from data.ModelSlot import DiffusionSVCModelSlot, RVCModelSlot
+from data.ModelSlot import DiffusionSVCModelSlot
+from voice_changer.DiffusionSVC.inferencer.InferencerManager import InferencerManager
+from voice_changer.DiffusionSVC.pipeline.Pipeline import Pipeline
 
 from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
 from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
-from voice_changer.RVC.inferencer.InferencerManager import InferencerManager
-from voice_changer.RVC.pipeline.Pipeline import Pipeline
 from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager
 
 
 def createPipeline(modelSlot: DiffusionSVCModelSlot, gpu: int, f0Detector: str):
     dev = DeviceManager.get_instance().getDevice(gpu)
-    half = DeviceManager.get_instance().halfPrecisionAvailable(gpu)
+    # half = DeviceManager.get_instance().halfPrecisionAvailable(gpu)
+    half = False
 
-    # # Create the Inferencer
-    # try:
-    #     inferencer = InferencerManager.getInferencer(modelSlot.modelType, modelSlot.modelFile, gpu)
-    # except Exception as e:
-    #     print("[Voice Changer] exception! loading inferencer", e)
-    #     traceback.print_exc()
+    # Create the Inferencer
+    try:
+        inferencer = InferencerManager.getInferencer(modelSlot.modelType, modelSlot.modelFile, gpu)
+    except Exception as e:
+        print("[Voice Changer] exception! loading inferencer", e)
+        traceback.print_exc()
 
-    # # Create the Embedder
-    # try:
-    #     embedder = EmbedderManager.getEmbedder(
-    #         modelSlot.embedder,
-    #         # emmbedderFilename,
-    #         half,
-    #         dev,
-    #     )
-    # except Exception as e:
-    #     print("[Voice Changer] exception! loading embedder", e)
-    #     traceback.print_exc()
+    # Create the Embedder
+    try:
+        embedder = EmbedderManager.getEmbedder(
+            modelSlot.embedder,
+            # emmbedderFilename,
+            half,
+            dev,
+        )
+    except Exception as e:
+        print("[Voice Changer] exception! loading embedder", e)
+        traceback.print_exc()
 
-    # # pitchExtractor
-    # pitchExtractor = PitchExtractorManager.getPitchExtractor(f0Detector, gpu)
+    # pitchExtractor
+    pitchExtractor = PitchExtractorManager.getPitchExtractor(f0Detector, gpu)
 
-    # pipeline = Pipeline(
-    #     embedder,
-    #     inferencer,
-    #     pitchExtractor,
-    #     index,
-    #     modelSlot.samplingRate,
-    #     dev,
-    #     half,
-    # )
-
-    # return pipeline
+    pipeline = Pipeline(
+        embedder,
+        inferencer,
+        pitchExtractor,
+        modelSlot.samplingRate,
+        dev,
+        half,
+    )
+
+    return pipeline
@@ -11,6 +11,7 @@ class ModelSlotManager:
     def __init__(self, model_dir: str):
        self.model_dir = model_dir
        self.modelSlots = loadAllSlotInfo(self.model_dir)
+       print("MODEL SLOT INFO-------------->>>>>", self.modelSlots)
 
     @classmethod
     def get_instance(cls, model_dir: str):
server/voice_changer/common/VolumeExtractor.py (new file, 41 lines)
@@ -0,0 +1,41 @@
+import numpy as np
+import torch
+import torch.nn as nn
+
+
+class VolumeExtractor:
+    def __init__(self, hop_size: float, block_size: int, model_sampling_rate: int, audio_sampling_rate: int):
+        self.hop_size = hop_size
+        self.block_size = block_size
+        self.model_sampling_rate = model_sampling_rate
+        self.audio_sampling_rate = audio_sampling_rate
+        # self.hop_size = self.block_size * self.audio_sampling_rate / self.model_sampling_rate  # The model's processing unit is 512 (Diffusion-SVC); match the hop size to the input sampling rate.
+
+    def extract(self, audio):  # audio: 1d numpy array
+        audio = audio.squeeze().cpu()
+        print("----VolExtractor2", audio.shape, self.block_size, self.model_sampling_rate, self.audio_sampling_rate, self.hop_size)
+        n_frames = int(len(audio) // self.hop_size) + 1
+        print("=======> n_frames", n_frames)
+        audio2 = audio ** 2
+        print("----VolExtractor3", audio2.shape)
+        audio2 = np.pad(audio2, (int(self.hop_size // 2), int((self.hop_size + 1) // 2)), mode='reflect')
+        print("----VolExtractor4", audio2.shape)
+        volume = np.array(
+            [np.mean(audio2[int(n * self.hop_size): int((n + 1) * self.hop_size)]) for n in range(n_frames)])
+        volume = np.sqrt(volume)
+        return volume
+
+    def get_mask_from_volume(self, volume, threhold=-60.0, device='cpu') -> torch.Tensor:
+        mask = (volume > 10 ** (float(threhold) / 20)).astype('float')
+        mask = np.pad(mask, (4, 4), constant_values=(mask[0], mask[-1]))
+        mask = np.array([np.max(mask[n: n + 9]) for n in range(len(mask) - 8)])
+        mask = torch.from_numpy(mask).float().to(device).unsqueeze(-1).unsqueeze(0)
+        mask = upsample(mask, self.block_size).squeeze(-1)
+        return mask
+
+
+def upsample(signal: torch.Tensor, factor: int) -> torch.Tensor:
+    signal = signal.permute(0, 2, 1)
+    signal = nn.functional.interpolate(torch.cat((signal, signal[:, :, -1:]), 2), size=signal.shape[-1] * factor + 1, mode='linear', align_corners=True)
+    signal = signal[:, :, :-1]
+    return signal.permute(0, 2, 1)
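Note (illustration only, not part of the commit): a rough usage sketch of VolumeExtractor as the Pipeline wires it up. The concrete numbers (block_size 512, model rate 44100, audio rate 16000, -60 dB threshold, i.e. an amplitude of 10 ** (-60 / 20) = 0.001) are assumptions taken from the surrounding diff, not values enforced by the class:

import torch
from voice_changer.common.VolumeExtractor import VolumeExtractor  # path added by this commit

hop_size = 512 * 16000 / 44100  # ≈185.8 sixteen-kHz samples per model frame
extractor = VolumeExtractor(hop_size, 512, 44100, audio_sampling_rate=16000)

audio = torch.zeros(16000)  # one second of silent 16 kHz audio
volume = extractor.extract(audio)  # per-frame RMS as a numpy array
mask = extractor.get_mask_from_volume(volume, threhold=-60.0, device="cpu")
print(volume.shape, mask.shape)  # the mask is upsampled by block_size along time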