WIP: diffusion svc refining
parent 9eef8fcb11
commit b9429c7655

2 client/demo/dist/index.js (vendored)
File diff suppressed because one or more lines are too long
@@ -23,6 +23,9 @@ export const DiffusionSVCSettingArea = (_props: DiffusionSVCSettingAreaProps) =>
         return <></>;
     }

+    const skipValues = getDivisors(serverSetting.serverSetting.kStep);
+    skipValues.pop();
+
     const kStepRow = (
         <div className="character-area-control">
             <div className="character-area-control-title">k-step:</div>
@@ -32,12 +35,15 @@ export const DiffusionSVCSettingArea = (_props: DiffusionSVCSettingAreaProps) =>
                 <span className="character-area-slider-control-slider">
                     <input
                         type="range"
-                        min="0"
+                        min="2"
                         max={(selected as DiffusionSVCModelSlot).kStepMax}
                         step="1"
                         value={serverSetting.serverSetting.kStep}
                         onChange={(e) => {
-                            serverSetting.updateServerSettings({ ...serverSetting.serverSetting, kStep: Number(e.target.value) });
+                            const newKStep = Number(e.target.value);
+                            const newSkipValues = getDivisors(Number(e.target.value));
+                            newSkipValues.pop();
+                            serverSetting.updateServerSettings({ ...serverSetting.serverSetting, speedUp: Math.max(...newSkipValues), kStep: newKStep });
                         }}
                     ></input>
                 </span>
@@ -48,23 +54,28 @@ export const DiffusionSVCSettingArea = (_props: DiffusionSVCSettingAreaProps) =>
     );
     const speedUpRow = (
         <div className="character-area-control">
-            <div className="character-area-control-title">speedup</div>
+            <div className="character-area-control-title">skip</div>
             <div className="character-area-control-field">
                 <div className="character-area-slider-control">
                     <span className="character-area-slider-control-kind"></span>
                     <span className="character-area-slider-control-slider">
-                        <input
-                            type="range"
-                            min="0"
-                            max={serverSetting.serverSetting.kStep}
-                            step="1"
+                        <select
+                            name=""
+                            id=""
                             value={serverSetting.serverSetting.speedUp}
                             onChange={(e) => {
                                 serverSetting.updateServerSettings({ ...serverSetting.serverSetting, speedUp: Number(e.target.value) });
                             }}
-                        ></input>
+                        >
+                            {skipValues.map((v) => {
+                                return (
+                                    <option value={v} key={v}>
+                                        {v}
+                                    </option>
+                                );
+                            })}
+                        </select>
                     </span>
-                    <span className="character-area-slider-control-val">{serverSetting.serverSetting.speedUp}</span>
                 </div>
             </div>
         </div>
@@ -79,3 +90,19 @@ export const DiffusionSVCSettingArea = (_props: DiffusionSVCSettingAreaProps) =>

     return settingArea;
 };
+
+const getDivisors = (num: number) => {
+    var divisors = [];
+    var end = Math.sqrt(num);
+
+    for (var i = 1; i <= end; i++) {
+        if (num % i === 0) {
+            divisors.push(i);
+            if (i !== num / i) {
+                divisors.push(num / i);
+            }
+        }
+    }
+
+    return divisors.sort((a, b) => a - b);
+};
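Note on the skip values above: the client only offers divisors of the current k-step as speed-up choices, drops the largest divisor (the k-step itself), and the k-step slider's onChange picks the largest remaining divisor as the default speedUp. A small Python sketch of that arithmetic; the function name and the sample k-step of 20 are illustrative, not part of this commit:

import math

def get_divisors(num: int) -> list[int]:
    # mirror of the client-side getDivisors(): collect i and num // i up to sqrt(num)
    divisors = []
    end = int(math.sqrt(num))
    for i in range(1, end + 1):
        if num % i == 0:
            divisors.append(i)
            if i != num // i:
                divisors.append(num // i)
    return sorted(divisors)

k_step = 20  # example value; the real value comes from serverSetting.kStep
options = get_divisors(k_step)
options.pop()        # drop k_step itself, as the UI does
print(options)       # [1, 2, 4, 5, 10] -> entries of the "skip" <select>
print(max(options))  # 10 -> default speedUp chosen when k-step changes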
@@ -119,6 +119,8 @@ class MMVC_Rest_Fileuploader:
             return JSONResponse(content=json_compatible_item_data)
         except Exception as e:
             print("[Voice Changer] post_update_model_default ex:", e)
+            import traceback
+            traceback.print_exc()

     def post_update_model_info(self, newData: str = Form(...)):
         try:
@@ -202,10 +202,10 @@ class DiffusionSVC(VoiceChangerModel):
             },
             {
                 "key": "defaultKstep",
-                "val": self.settings.kstep,
+                "val": self.settings.kStep,
             },
             {
                 "key": "defaultSpeedup",
-                "val": self.settings.speedup,
+                "val": self.settings.speedUp,
             },
         ]
@@ -6,6 +6,7 @@ from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.unit2me
 from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.vocoder import Vocoder

 from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
+from voice_changer.utils.Timer import Timer


 class DiffusionSVCInferencer(Inferencer):
|
|||||||
@torch.no_grad()
|
@torch.no_grad()
|
||||||
def infer(
|
def infer(
|
||||||
self,
|
self,
|
||||||
|
audio_t: torch.Tensor,
|
||||||
feats: torch.Tensor,
|
feats: torch.Tensor,
|
||||||
pitch: torch.Tensor,
|
pitch: torch.Tensor,
|
||||||
volume: torch.Tensor,
|
volume: torch.Tensor,
|
||||||
@@ -109,10 +111,22 @@ class DiffusionSVCInferencer(Inferencer):
         infer_speedup: int,
         silence_front: float,
     ) -> torch.Tensor:
-        gt_spec = self.naive_model_call(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, spk_emb=None)
-        out_mel = self.__call__(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, gt_spec=gt_spec, infer_speedup=infer_speedup, method='dpm-solver', k_step=k_step, use_tqdm=False, spk_emb=None)
-        start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
-        out_wav = self.mel2wav(out_mel, pitch, start_frame=start_frame)
-
-        out_wav *= mask
+        with Timer("pre-process") as t:
+            gt_spec = self.naive_model_call(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, spk_emb=None)
+            # gt_spec = self.vocoder.extract(audio_t, 16000)
+            # gt_spec = torch.cat((gt_spec, gt_spec[:, -1:, :]), 1)
+
+        # print("[ ----Timer::1: ]", t.secs)
+
+        with Timer("pre-process") as t:
+            out_mel = self.__call__(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, gt_spec=gt_spec, infer_speedup=infer_speedup, method='dpm-solver', k_step=k_step, use_tqdm=False, spk_emb=None)
+
+        # print("[ ----Timer::2: ]", t.secs)
+        with Timer("pre-process") as t:  # NOQA
+            start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
+            out_wav = self.mel2wav(out_mel, pitch, start_frame=start_frame)
+
+            out_wav *= mask
+        # print("[ ----Timer::3: ]", t.secs, start_frame, out_mel.shape)
+
         return out_wav.squeeze()
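The Timer blocks above assume the context manager imported from voice_changer.utils.Timer exposes the elapsed time as t.secs (that is what the commented-out prints read). The actual class is not shown in this diff; a minimal stand-in with that interface could look like:

import time


class Timer:
    # Minimal stand-in for voice_changer.utils.Timer: measures wall-clock time
    # spent inside a `with` block and exposes it as `self.secs`.
    def __init__(self, title: str):
        self.title = title
        self.secs = 0.0

    def __enter__(self):
        self.start = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.secs = time.perf_counter() - self.start
        return False  # do not swallow exceptions


# usage mirrors the diff: wrap a stage, then optionally print t.secs afterwards
with Timer("pre-process") as t:
    time.sleep(0.01)
print("[ ----Timer::1: ]", t.secs)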
@@ -0,0 +1,90 @@
+import numpy as np
+import torch
+from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.unit2mel import load_model_vocoder_from_combo
+from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
+
+
+class DiffusionSVC_ONNX:
+    def __init__(self, file: str, gpu: int):
+        self.dev = DeviceManager.get_instance().getDevice(gpu)
+        diff_model, diff_args, naive_model, naive_args, vocoder = load_model_vocoder_from_combo(file, device=self.dev)
+        self.diff_model = diff_model
+        self.naive_model = naive_model
+        self.vocoder = vocoder
+        self.diff_args = diff_args
+        self.naive_args = naive_args
+
+    def forward(self, phone, phone_lengths, sid, max_len=None, convert_length=None):
+        g = self.emb_g(sid).unsqueeze(-1)
+        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
+        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
+        z = self.flow(z_p, x_mask, g=g, reverse=True)
+        o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], g=g, convert_length=convert_length)
+        return o, x_mask, (z, z_p, m_p, logs_p)
+
+
+    @torch.no_grad()  # most basic inference code: normalize inputs to tensors and deal only with mel
+    def __call__(self, units, f0, volume, spk_id=1, spk_mix_dict=None, aug_shift=0,
+                 gt_spec=None, infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
+                 spk_emb=None):
+
+
+        aug_shift = torch.from_numpy(np.array([[float(aug_shift)]])).float().to(self.dev)
+
+        # spk_id
+        spk_emb_dict = None
+        if self.diff_args.model.use_speaker_encoder:  # with speaker encoder
+            spk_mix_dict, spk_emb = self.pre_spk_emb(spk_id, spk_mix_dict, len(units), spk_emb)
+        # without speaker encoder
+        else:
+            spk_id = torch.LongTensor(np.array([[int(spk_id)]])).to(self.dev)
+
+        return self.diff_model(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict, aug_shift=aug_shift, gt_spec=gt_spec, infer=True, infer_speedup=infer_speedup, method=method, k_step=k_step, use_tqdm=use_tqdm, spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)
+
+    @torch.no_grad()
+    def naive_model_call(self, units, f0, volume, spk_id=1, spk_mix_dict=None, aug_shift=0, spk_emb=None):
+        # spk_id
+        spk_emb_dict = None
+        if self.diff_args.model.use_speaker_encoder:  # with speaker encoder
+            spk_mix_dict, spk_emb = self.pre_spk_emb(spk_id, spk_mix_dict, len(units), spk_emb)
+        # without speaker encoder
+        else:
+
+        return out_spec
+
+    @torch.no_grad()
+    def mel2wav(self, mel, f0, start_frame=0):
+        if start_frame == 0:
+            return self.vocoder.infer(mel, f0)
+        else:  # for realtime speedup
+            mel = mel[:, start_frame:, :]
+            f0 = f0[:, start_frame:, :]
+            out_wav = self.vocoder.infer(mel, f0)
+            return torch.nn.functional.pad(out_wav, (start_frame * self.vocoder.vocoder_hop_size, 0))
+
+    @torch.no_grad()
+    def infer(
+        self,
+        feats: torch.Tensor,
+        pitch: torch.Tensor,
+        volume: torch.Tensor,
+        mask: torch.Tensor,
+        sid: torch.Tensor,
+        k_step: int,
+        infer_speedup: int,
+        silence_front: float,
+    ) -> torch.Tensor:
+
+        aug_shift = torch.LongTensor([0]).to(feats.device)
+        out_spec = self.naive_model(feats, pitch, volume, sid, spk_mix_dict=None,
+                                    aug_shift=aug_shift, infer=True,
+                                    spk_emb=None, spk_emb_dict=None)
+
+
+        gt_spec = self.naive_model_call(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, spk_emb=None)
+        out_mel = self.__call__(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, gt_spec=gt_spec, infer_speedup=infer_speedup, method='dpm-solver', k_step=k_step, use_tqdm=False, spk_emb=None)
+        start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
+        out_wav = self.mel2wav(out_mel, pitch, start_frame=start_frame)
+
+        out_wav *= mask
+        return out_wav.squeeze()
125 server/voice_changer/DiffusionSVC/onnxExporter/export2onnx.py (new file)
@@ -0,0 +1,125 @@
+import os
+import json
+import torch
+from onnxsim import simplify
+import onnx
+from const import TMP_DIR, EnumInferenceTypes
+from data.ModelSlot import DiffusionSVCModelSlot
+from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
+
+
+def export2onnx(gpu: int, modelSlot: DiffusionSVCModelSlot):
+    modelFile = modelSlot.modelFile
+
+    output_file = os.path.splitext(os.path.basename(modelFile))[0] + ".onnx"
+    output_file_simple = os.path.splitext(os.path.basename(modelFile))[0] + "_simple.onnx"
+    output_path = os.path.join(TMP_DIR, output_file)
+    output_path_simple = os.path.join(TMP_DIR, output_file_simple)
+    metadata = {
+        "application": "VC_CLIENT",
+        "version": "3",
+        "voiceChangerType": modelSlot.voiceChangerType,
+        "modelType": modelSlot.modelType,
+        "samplingRate": modelSlot.samplingRate,
+        "embChannels": modelSlot.embChannels,
+        "embedder": modelSlot.embedder
+    }
+    gpuMomory = DeviceManager.get_instance().getDeviceMemory(gpu)
+    print(f"[Voice Changer] exporting onnx... gpu_id:{gpu} gpu_mem:{gpuMomory}")
+
+    if gpuMomory > 0:
+        _export2onnx(modelFile, output_path, output_path_simple, True, metadata)
+    else:
+        print("[Voice Changer] Warning!!! onnx export with float32. maybe size is doubled.")
+        _export2onnx(modelFile, output_path, output_path_simple, False, metadata)
+    return output_file_simple
+
+
+def _export2onnx(input_model, output_model, output_model_simple, is_half, metadata):
+    cpt = torch.load(input_model, map_location="cpu")
+    if is_half:
+        dev = torch.device("cuda", index=0)
+    else:
+        dev = torch.device("cpu")
+
+
+
+
+    # EnumInferenceTypes cannot be serialized as-is, so it is stored as text
+    if metadata["modelType"] == EnumInferenceTypes.pyTorchRVC.value:
+        net_g_onnx = SynthesizerTrnMs256NSFsid_ONNX(*cpt["config"], is_half=is_half)
+    elif metadata["modelType"] == EnumInferenceTypes.pyTorchWebUI.value:
+        net_g_onnx = SynthesizerTrnMsNSFsid_webui_ONNX(**cpt["params"], is_half=is_half)
+    elif metadata["modelType"] == EnumInferenceTypes.pyTorchRVCNono.value:
+        net_g_onnx = SynthesizerTrnMs256NSFsid_nono_ONNX(*cpt["config"])
+    elif metadata["modelType"] == EnumInferenceTypes.pyTorchWebUINono.value:
+        net_g_onnx = SynthesizerTrnMsNSFsidNono_webui_ONNX(**cpt["params"])
+    elif metadata["modelType"] == EnumInferenceTypes.pyTorchRVCv2.value:
+        net_g_onnx = SynthesizerTrnMs768NSFsid_ONNX(*cpt["config"], is_half=is_half)
+    elif metadata["modelType"] == EnumInferenceTypes.pyTorchRVCv2Nono.value:
+        net_g_onnx = SynthesizerTrnMs768NSFsid_nono_ONNX(*cpt["config"])
+    else:
+        print(
+            "unknwon::::: ",
+            metadata["modelType"],
+            EnumInferenceTypes.pyTorchRVCv2.value,
+        )
+
+    net_g_onnx.eval().to(dev)
+    net_g_onnx.load_state_dict(cpt["weight"], strict=False)
+    if is_half:
+        net_g_onnx = net_g_onnx.half()
+
+    if is_half:
+        feats = torch.HalfTensor(1, 2192, metadata["embChannels"]).to(dev)
+    else:
+        feats = torch.FloatTensor(1, 2192, metadata["embChannels"]).to(dev)
+    p_len = torch.LongTensor([2192]).to(dev)
+    sid = torch.LongTensor([0]).to(dev)
+
+    if metadata["f0"] is True:
+        pitch = torch.zeros(1, 2192, dtype=torch.int64).to(dev)
+        pitchf = torch.FloatTensor(1, 2192).to(dev)
+        input_names = ["feats", "p_len", "pitch", "pitchf", "sid"]
+        inputs = (
+            feats,
+            p_len,
+            pitch,
+            pitchf,
+            sid,
+        )
+
+    else:
+        input_names = ["feats", "p_len", "sid"]
+        inputs = (
+            feats,
+            p_len,
+            sid,
+        )
+
+    output_names = [
+        "audio",
+    ]
+
+    torch.onnx.export(
+        net_g_onnx,
+        inputs,
+        output_model,
+        dynamic_axes={
+            "feats": [1],
+            "pitch": [1],
+            "pitchf": [1],
+        },
+        do_constant_folding=False,
+        opset_version=17,
+        verbose=False,
+        input_names=input_names,
+        output_names=output_names,
+    )
+
+    model_onnx2 = onnx.load(output_model)
+    model_simp, check = simplify(model_onnx2)
+    meta = model_simp.metadata_props.add()
+    meta.key = "metadata"
+    meta.value = json.dumps(metadata)
+    onnx.save(model_simp, output_model_simple)
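The exporter above embeds the slot metadata as a single metadata_props entry on the simplified model. Reading it back uses only the standard onnx API; the file name below is illustrative (export2onnx() returns the "<model>_simple.onnx" name under TMP_DIR):

import json
import onnx

model = onnx.load("model_simple.onnx")  # hypothetical path to an exported model
metadata = None
for prop in model.metadata_props:
    if prop.key == "metadata":
        metadata = json.loads(prop.value)
print(metadata)  # e.g. {"application": "VC_CLIENT", "version": "3", "samplingRate": ..., ...}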
@@ -16,6 +16,8 @@ from voice_changer.RVC.embedder.Embedder import Embedder
 from voice_changer.common.VolumeExtractor import VolumeExtractor
 from torchaudio.transforms import Resample

+from voice_changer.utils.Timer import Timer
+

 class Pipeline(object):
     embedder: Embedder
@@ -112,83 +114,95 @@ class Pipeline(object):
         useFinalProj,
         protect=0.5
     ):
-        audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
-        audio16k = self.resamplerIn(audio_t)
-        volume, mask = self.extract_volume_and_mask(audio16k, threshold=-60.0)
-        sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
-        n_frames = int(audio16k.size(-1) // self.hop_size + 1)
-
-        # pitch detection
-        try:
-            pitch = self.pitchExtractor.extract(
-                audio16k.squeeze(),
-                pitchf,
-                f0_up_key,
-                int(self.hop_size),  # processing window size (512 at 44100)
-                silence_front=silence_front,
-            )
-
-            pitch = torch.tensor(pitch[-n_frames:], device=self.device).unsqueeze(0).long()
-        except IndexError as e:  # NOQA
-            raise NotEnoughDataExtimateF0()
-
-        # adjust tensor types
-        feats = audio16k.squeeze()
-        if feats.dim() == 2:  # double channels
-            feats = feats.mean(-1)
-        feats = feats.view(1, -1)
-
-        # embedding
-        with autocast(enabled=self.isHalf):
-            try:
-                feats = self.embedder.extractFeatures(feats, embOutputLayer, useFinalProj)
-                if torch.isnan(feats).all():
-                    raise DeviceCannotSupportHalfPrecisionException()
-            except RuntimeError as e:
-                if "HALF" in e.__str__().upper():
-                    raise HalfPrecisionChangingException()
-                elif "same device" in e.__str__():
-                    raise DeviceChangingException()
-                else:
-                    raise e
-        feats = F.interpolate(feats.permute(0, 2, 1), size=int(n_frames), mode='nearest').permute(0, 2, 1)
-
-        # run inference
-        try:
-            with torch.no_grad():
-                with autocast(enabled=self.isHalf):
-                    print("[EMBEDDER EXTRACT:::]", feats.shape, pitch.unsqueeze(-1).shape, volume.shape, mask.shape)
-                    audio1 = (
-                        torch.clip(
-                            self.inferencer.infer(
-                                feats,
-                                pitch.unsqueeze(-1),
-                                volume,
-                                mask,
-                                sid,
-                                k_step,
-                                infer_speedup,
-                                silence_front=silence_front
-                            ).to(dtype=torch.float32),
-                            -1.0,
-                            1.0,
-                        )
-                        * 32767.5
-                    ).data.to(dtype=torch.int16)
-        except RuntimeError as e:
-            if "HALF" in e.__str__().upper():
-                print("11", e)
-                raise HalfPrecisionChangingException()
-            else:
-                raise e
-
-        feats_buffer = feats.squeeze(0).detach().cpu()
-        if pitch is not None:
-            pitch_buffer = pitch.squeeze(0).detach().cpu()
-        else:
-            pitch_buffer = None
-
-        del pitch, pitchf, feats, sid
-        torch.cuda.empty_cache()
-        audio1 = self.resamplerOut(audio1.float())
+        # print("---------- pipe line --------------------")
+        with Timer("pre-process") as t:
+            audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
+            audio16k = self.resamplerIn(audio_t)
+            volume, mask = self.extract_volume_and_mask(audio16k, threshold=-60.0)
+            sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
+            n_frames = int(audio16k.size(-1) // self.hop_size + 1)
+        # print("[Timer::1: ]", t.secs)
+
+        with Timer("pre-process") as t:
+            # pitch detection
+            try:
+                pitch = self.pitchExtractor.extract(
+                    audio16k.squeeze(),
+                    pitchf,
+                    f0_up_key,
+                    int(self.hop_size),  # processing window size (512 at 44100)
+                    silence_front=silence_front,
+                )
+
+                pitch = torch.tensor(pitch[-n_frames:], device=self.device).unsqueeze(0).long()
+            except IndexError as e:  # NOQA
+                raise NotEnoughDataExtimateF0()
+
+            # adjust tensor types
+            feats = audio16k.squeeze()
+            if feats.dim() == 2:  # double channels
+                feats = feats.mean(-1)
+            feats = feats.view(1, -1)
+        # print("[Timer::2: ]", t.secs)
+
+        with Timer("pre-process") as t:
+
+            # embedding
+            with autocast(enabled=self.isHalf):
+                try:
+                    feats = self.embedder.extractFeatures(feats, embOutputLayer, useFinalProj)
+                    if torch.isnan(feats).all():
+                        raise DeviceCannotSupportHalfPrecisionException()
+                except RuntimeError as e:
+                    if "HALF" in e.__str__().upper():
+                        raise HalfPrecisionChangingException()
+                    elif "same device" in e.__str__():
+                        raise DeviceChangingException()
+                    else:
+                        raise e
+            feats = F.interpolate(feats.permute(0, 2, 1), size=int(n_frames), mode='nearest').permute(0, 2, 1)
+        # print("[Timer::3: ]", t.secs)
+
+        with Timer("pre-process") as t:
+            # run inference
+            try:
+                with torch.no_grad():
+                    with autocast(enabled=self.isHalf):
+                        audio1 = (
+                            torch.clip(
+                                self.inferencer.infer(
+                                    audio16k,
+                                    feats,
+                                    pitch.unsqueeze(-1),
+                                    volume,
+                                    mask,
+                                    sid,
+                                    k_step,
+                                    infer_speedup,
+                                    silence_front=silence_front
+                                ).to(dtype=torch.float32),
+                                -1.0,
+                                1.0,
+                            )
+                            * 32767.5
+                        ).data.to(dtype=torch.int16)
+            except RuntimeError as e:
+                if "HALF" in e.__str__().upper():
+                    print("11", e)
+                    raise HalfPrecisionChangingException()
+                else:
+                    raise e
+        # print("[Timer::4: ]", t.secs)
+
+        with Timer("pre-process") as t:  # NOQA
+            feats_buffer = feats.squeeze(0).detach().cpu()
+            if pitch is not None:
+                pitch_buffer = pitch.squeeze(0).detach().cpu()
+            else:
+                pitch_buffer = None
+
+            del pitch, pitchf, feats, sid
+            torch.cuda.empty_cache()
+            audio1 = self.resamplerOut(audio1.float())
+        # print("[Timer::5: ]", t.secs)
+
         return audio1, pitch_buffer, feats_buffer
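The final conversion above clips the inferencer output to [-1.0, 1.0] and scales by 32767.5 before casting to int16 PCM. A tiny numeric check of that mapping, with made-up sample values:

import torch

x = torch.tensor([-1.2, -1.0, 0.0, 0.5, 1.0])  # raw model output (float)
pcm = (torch.clip(x, -1.0, 1.0) * 32767.5).to(dtype=torch.int16)
print(pcm)  # tensor([-32767, -32767, 0, 16383, 32767], dtype=torch.int16)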
@@ -18,6 +18,7 @@ class DioPitchExtractor(PitchExtractor):

     def extract(self, audio: torch.Tensor, pitch, f0_up_key, window, silence_front=0):
         audio = audio.detach().cpu().numpy()
+        silence_front = 0  # TODO: workaround for pitch not being detected when the chunk size is small
         start_frame = int(silence_front * self.sapmle_rate / window)
         real_silence_front = start_frame * window / self.sapmle_rate
         audio = audio[int(np.round(real_silence_front * self.sapmle_rate)):]
@@ -10,7 +10,7 @@ class VoiceChangerIF(Protocol):

     def get_info(self) -> dict[str, Any]:
         ...

     def get_performance(self) -> list[int]:
         ...

@@ -25,4 +25,3 @@ class VoiceChangerIF(Protocol):

     def export2onnx() -> Any:
         ...
-