commit 92cd384486
parent fd17bb59c8

    draft commit
@@ -109,6 +109,7 @@ class RVC(VoiceChangerModel):

         if convertSize % 128 != 0:  # Pad to compensate for truncation at the model's output hop size.
             convertSize = convertSize + (128 - (convertSize % 128))
+        outSize = convertSize - self.settings.extraConvertSize

         # Pad with zeros when the buffer has not accumulated enough audio
         if self.audio_buffer.shape[0] < convertSize:
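Note on the hop-size padding above: convertSize is rounded up to the next multiple of the 128-sample hop so the decoder never truncates the tail, and outSize then subtracts the extra conversion context (extraConvertSize) so it reflects the chunk the caller actually wants back. A minimal sketch of the same arithmetic, with a hypothetical helper name (the 128-sample hop is taken from the comment in the diff):

    def pad_to_hop(convert_size: int, hop: int = 128) -> int:
        # Round up to the next multiple of the hop size.
        if convert_size % hop != 0:
            convert_size += hop - (convert_size % hop)
        return convert_size

    assert pad_to_hop(1000) == 1024
    assert pad_to_hop(1024) == 1024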
@@ -132,15 +133,16 @@ class RVC(VoiceChangerModel):
         vol = max(vol, self.prevVol * 0.0)
         self.prevVol = vol

-        return (audio_buffer, convertSize, vol)
+        return (audio_buffer, convertSize, vol, outSize)

     def inference(self, data):
         audio = data[0]
         convertSize = data[1]
         vol = data[2]
+        outSize = data[3]

         if vol < self.settings.silentThreshold:
-            return np.zeros(convertSize).astype(np.int16)
+            return np.zeros(convertSize).astype(np.int16) * np.sqrt(vol)

         audio = torchaudio.functional.resample(audio, self.slotInfo.samplingRate, 16000, rolloff=0.99)
         repeat = 1 if self.settings.rvcQuality else 0
@@ -165,6 +167,7 @@ class RVC(VoiceChangerModel):
             useFinalProj,
             repeat,
             protect,
+            outSize
         )
         result = audio_out.detach().cpu().numpy() * np.sqrt(vol)

@@ -3,6 +3,7 @@ from const import EnumInferenceTypes
 from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
 from voice_changer.RVC.inferencer.Inferencer import Inferencer
 from .rvc_models.infer_pack.models import SynthesizerTrnMs768NSFsid
+from typing import Optional


 class RVCInferencerv2(Inferencer):
@@ -32,5 +33,6 @@ class RVCInferencerv2(Inferencer):
         pitch: torch.Tensor,
         pitchf: torch.Tensor,
         sid: torch.Tensor,
+        out_length: Optional[int] = None,
     ) -> torch.Tensor:
-        return self.model.infer(feats, pitch_length, pitch, pitchf, sid)
+        return self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=out_length)
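The new keyword defaults to None, so existing callers keep full-length decoding; when set, it is forwarded to the model as convert_length to bound the decoded waveform. Hypothetical usage (tensor setup elided; the 4096 is an arbitrary block size, not a value from this repo):

    # Bound the decode to the newest 4096 output samples; None means full-length decode.
    audio = inferencer.infer(feats, pitch_length, pitch, pitchf, sid, out_length=4096)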
@@ -129,12 +129,12 @@ class SynthesizerTrnMsNSFsid(nn.Module):
         o = self.dec(z_slice, pitchf, g=g)
         return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

-    def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
+    def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None, convert_length=None):
         g = self.emb_g(sid).unsqueeze(-1)
         m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
         z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
         z = self.flow(z_p, x_mask, g=g, reverse=True)
-        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
+        o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], nsff0, g=g, convert_length=convert_length)
         return o, x_mask, (z, z_p, m_p, logs_p)

@@ -208,10 +208,10 @@ class SynthesizerTrnMsNSFsidNono(nn.Module):
         o = self.dec(z_slice, g=g)
         return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

-    def infer(self, phone, phone_lengths, sid, max_len=None):
+    def infer(self, phone, phone_lengths, sid, max_len=None, out_length=None):
         g = self.emb_g(sid).unsqueeze(-1)
         m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
         z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
         z = self.flow(z_p, x_mask, g=g, reverse=True)
-        o = self.dec((z * x_mask)[:, :, :max_len], g=g)
+        o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], g=g, convert_length=out_length)
         return o, x_mask, (z, z_p, m_p, logs_p)

@@ -232,6 +232,25 @@ class Generator(torch.nn.Module):
         if gin_channels != 0:
             self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

+        # Compute the minimum size required for estimating real-time speech conversion.
+        self.realtime = False
+        if resblock != "1":
+            self.realtime = True
+            self.ups_size = [0 for _ in range(len(self.ups))]
+            # conv_post
+            self.ups_size[-1] += 3
+
+            for i in range(len(self.ups) - 1, -1, -1):
+                for k, d in zip(resblock_kernel_sizes[::-1], resblock_dilation_sizes[::-1]):
+                    # conv2
+                    self.ups_size[i] += (k - 1) // 2
+                    # conv1 (d is a list of dilations; use the largest, as in GeneratorNSF below)
+                    self.ups_size[i] += d[-1] * (k - 1) // 2
+                # upsampling
+                self.ups_size[i] = -(-self.ups_size[i] // upsample_rates[i]) + (upsample_kernel_sizes[i] - upsample_rates[i]) // 2
+                if i:
+                    self.ups_size[i - 1] = self.ups_size[i]
+
     def forward(self, x, g=None):
         x = self.conv_pre(x)
         if g is not None:
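A note on the arithmetic in the block above: each resblock conv with kernel k and dilation d needs d * (k - 1) // 2 frames of extra context, -(-n // m) is the standard Python idiom for ceiling division, and the backward loop converts the margin needed after stage i into the margin stage i - 1 must supply before its transposed conv. A standalone sketch with made-up HiFi-GAN-style hyperparameters (the values below are illustrative, not read from this repo's config):

    resblock_kernel_sizes = [3, 7, 11]
    resblock_dilation_sizes = [[1, 3, 5]] * 3
    upsample_rates = [10, 8, 2, 2]
    upsample_kernel_sizes = [20, 16, 4, 4]

    ups_size = [0] * len(upsample_rates)
    ups_size[-1] += 3  # conv_post (kernel 7) needs (7 - 1) // 2 = 3 frames
    for i in range(len(upsample_rates) - 1, -1, -1):
        for k, d in zip(resblock_kernel_sizes[::-1], resblock_dilation_sizes[::-1]):
            ups_size[i] += (k - 1) // 2          # conv2, dilation 1
            ups_size[i] += d[-1] * (k - 1) // 2  # conv1 at its largest dilation
        # ceil-divide by the rate, then add the transposed-conv overlap
        ups_size[i] = -(-ups_size[i] // upsample_rates[i]) + (upsample_kernel_sizes[i] - upsample_rates[i]) // 2
        if i:
            ups_size[i - 1] = ups_size[i]
    print(ups_size)  # per-stage margins, in input frames of each stage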
@@ -253,6 +272,35 @@ class Generator(torch.nn.Module):

         return x

+    def infer_realtime(self, x, g=None, convert_length=None):
+        out_length = x.shape[2] * np.prod(self.upsample_rates)
+        if convert_length is None:
+            convert_length = x.shape[2] * np.prod(self.upsample_rates)
+
+        x = self.conv_pre(x)
+        if g is not None:
+            x = x + self.cond(g)
+
+        for i in range(self.num_upsamples):
+            if self.realtime:
+                # Keep only the frames needed for convert_length output samples plus the receptive-field margin.
+                x = x[:, :, -self.ups_size[i] + (-convert_length // np.prod(self.upsample_rates[i:])):]
+            x = F.leaky_relu(x, LRELU_SLOPE)
+            x = self.ups[i](x)

+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels
+        x = F.leaky_relu(x)
+        x = self.conv_post(x)
+        x = torch.tanh(x)
+        out = torch.zeros([x.shape[0], 1, out_length], device=x.device, dtype=x.dtype)
+        out[:, :, -x.shape[2]:] = x[:, :, -out.shape[2]:]
+        return out
+
     def remove_weight_norm(self):
         for l in self.ups:
             remove_weight_norm(l)
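The output of infer_realtime is right-aligned: the freshly generated tail is what the realtime consumer plays, so the method zero-fills on the left whatever the cropped receptive field could not produce. A minimal sketch of that tail copy, reduced to 1-D tensors (the function name is illustrative):

    import torch

    def right_align(x: torch.Tensor, out_length: int) -> torch.Tensor:
        # Keep the newest samples; zero-pad on the left when x is shorter than out_length.
        out = torch.zeros(out_length, dtype=x.dtype, device=x.device)
        n = min(out_length, x.shape[-1])
        out[-n:] = x[-n:]
        return out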
@@ -404,6 +452,7 @@ class GeneratorNSF(torch.nn.Module):
         super(GeneratorNSF, self).__init__()
         self.num_kernels = len(resblock_kernel_sizes)
         self.num_upsamples = len(upsample_rates)
+        self.upsample_rates = upsample_rates

         self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
         self.m_source = SourceModuleHnNSF(sampling_rate=sr, harmonic_num=0, is_half=is_half)
@@ -453,6 +502,29 @@ class GeneratorNSF(torch.nn.Module):

         self.upp = np.prod(upsample_rates)

+        # Compute the minimum size required for estimating real-time speech conversion.
+        self.realtime = False
+        if resblock != "1":
+            self.realtime = True
+            self.ups_size = [0 for _ in range(len(self.ups))]
+            self.noise_conv_size = [0 for _ in range(len(self.ups))]
+            # conv_post
+            self.ups_size[-1] += 3
+
+            for i in range(len(self.ups) - 1, -1, -1):
+                for k, d in zip(resblock_kernel_sizes[::-1], resblock_dilation_sizes[::-1]):
+                    # conv2
+                    self.ups_size[i] += (k - 1) // 2
+                    # conv1
+                    self.ups_size[i] += d[-1] * (k - 1) // 2
+                # noise_conv
+                self.noise_conv_size[i] = self.ups_size[i] * np.prod(upsample_rates[i:])
+                # upsampling
+                self.ups_size[i] = -(-self.ups_size[i] // upsample_rates[i]) + (upsample_kernel_sizes[i] - upsample_rates[i]) // 2
+                if i:
+                    self.ups_size[i - 1] = self.ups_size[i]
+
     def forward(self, x, f0, g=None):
         har_source, noi_source, uv = self.m_source(f0, self.upp)
         har_source = har_source.transpose(1, 2)
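noise_conv_size converts the frame-level margin ups_size[i] back into samples: har_source lives at the output sample rate, while stage i of the generator works in frames that each span prod(upsample_rates[i:]) samples. A tiny worked example with the same illustrative rates as the sketch above (all numbers hypothetical):

    import numpy as np

    upsample_rates = [10, 8, 2, 2]  # illustrative
    i = 2
    ups_size_frames = 4  # hypothetical margin at stage 2, in frames
    # one frame at stage 2 spans prod([2, 2]) = 4 output samples
    noise_conv_size = ups_size_frames * int(np.prod(upsample_rates[i:]))
    assert noise_conv_size == 16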
@@ -477,6 +549,42 @@ class GeneratorNSF(torch.nn.Module):
         x = torch.tanh(x)
         return x

+    def infer_realtime(self, x, f0, g=None, convert_length=None):
+        out_length = x.shape[2] * np.prod(self.upsample_rates)
+        if convert_length is None:
+            convert_length = x.shape[2] * np.prod(self.upsample_rates)
+
+        har_source, noi_source, uv = self.m_source(f0, self.upp)
+        har_source = har_source.transpose(1, 2)
+        x = self.conv_pre(x)
+        if g is not None:
+            x = x + self.cond(g)
+
+        for i in range(self.num_upsamples):
+            if self.realtime:
+                x = x[:, :, -self.ups_size[i] + (-convert_length // np.prod(self.upsample_rates[i:])):]
+            x = F.leaky_relu(x, LRELU_SLOPE)
+            x_ = self.ups[i](x)
+            x_source = self.noise_convs[i](har_source[:, :, -convert_length - self.noise_conv_size[i]:])
+            # x_ and x_source may differ in length by a few samples; sum them right-aligned.
+            x = torch.zeros([x_.shape[0], x_.shape[1], max(x_.shape[2], x_source.shape[2])], device=x.device, dtype=x.dtype)
+            x[:, :, -x_.shape[2]:] += x_
+            x[:, :, -x_source.shape[2]:] += x_source
+
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels
+        x = F.leaky_relu(x)
+        x = self.conv_post(x)
+        x = torch.tanh(x)
+        out = torch.zeros([x.shape[0], 1, out_length], device=x.device, dtype=x.dtype)
+        out[:, :, -x.shape[2]:] = x[:, :, -out.shape[2]:]
+
+        return x  # out
+
     def remove_weight_norm(self):
         for l in self.ups:
             remove_weight_norm(l)
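Inside the loop above, x_ (the upsampled features) and x_source (the downsampled harmonic excitation) can come out a few samples apart, so the code sums them right-aligned in a buffer sized to the longer of the two. The same idea reduced to 1-D tensors (function name illustrative):

    import torch

    def add_right_aligned(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
        # Align both signals on their newest sample and sum; older samples stay zero.
        out = torch.zeros(max(a.shape[-1], b.shape[-1]), dtype=a.dtype, device=a.device)
        out[-a.shape[-1]:] += a
        out[-b.shape[-1]:] += b
        return out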
@@ -566,12 +674,12 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
         o = self.dec(z_slice, pitchf, g=g)
         return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

-    def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
+    def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None, convert_length=None):
         g = self.emb_g(sid).unsqueeze(-1)
         m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
         z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
         z = self.flow(z_p, x_mask, g=g, reverse=True)
-        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
+        o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], nsff0, g=g, convert_length=convert_length)
         return o, x_mask, (z, z_p, m_p, logs_p)

@@ -650,12 +758,12 @@ class SynthesizerTrnMs768NSFsid(nn.Module):
         o = self.dec(z_slice, pitchf, g=g)
         return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

-    def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
+    def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None, convert_length=None):
         g = self.emb_g(sid).unsqueeze(-1)
         m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
         z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
         z = self.flow(z_p, x_mask, g=g, reverse=True)
-        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
+        o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], nsff0, g=g, convert_length=convert_length)
         return o, x_mask, (z, z_p, m_p, logs_p)

@@ -727,12 +835,12 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
         o = self.dec(z_slice, g=g)
         return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

-    def infer(self, phone, phone_lengths, sid, max_len=None):
+    def infer(self, phone, phone_lengths, sid, max_len=None, convert_length=None):
         g = self.emb_g(sid).unsqueeze(-1)
         m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
         z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
         z = self.flow(z_p, x_mask, g=g, reverse=True)
-        o = self.dec((z * x_mask)[:, :, :max_len], g=g)
+        o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], g=g, convert_length=convert_length)
         return o, x_mask, (z, z_p, m_p, logs_p)

@@ -804,12 +912,12 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module):
         o = self.dec(z_slice, g=g)
         return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

-    def infer(self, phone, phone_lengths, sid, max_len=None):
+    def infer(self, phone, phone_lengths, sid, max_len=None, convert_length=None):
         g = self.emb_g(sid).unsqueeze(-1)
         m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
         z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
         z = self.flow(z_p, x_mask, g=g, reverse=True)
-        o = self.dec((z * x_mask)[:, :, :max_len], g=g)
+        o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], g=g, convert_length=convert_length)
         return o, x_mask, (z, z_p, m_p, logs_p)

@@ -79,6 +79,7 @@ class Pipeline(object):
         useFinalProj,
         repeat,
         protect=0.5,
+        out_size=None,
     ):
         # Audio arrives at a 16000 Hz sampling rate; from here on everything is processed at 16000 Hz.

@@ -206,7 +207,7 @@ class Pipeline(object):
             with autocast(enabled=self.isHalf):
                 audio1 = (
                     torch.clip(
-                        self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0].to(dtype=torch.float32),
+                        self.inferencer.infer(feats, p_len, pitch, pitchf, sid, out_size)[0][0, 0].to(dtype=torch.float32),
                         -1.0,
                         1.0,
                     )
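Taken together, the commit threads the requested output length down the whole stack; out_size is passed positionally here, landing in the inferencer's new out_length parameter. A sketch of the call chain as wired by this diff (bodies elided):

    # Pipeline.exec(..., out_size=None)
    #   -> RVCInferencerv2.infer(feats, pitch_length, pitch, pitchf, sid, out_length=out_size)
    #      -> SynthesizerTrnMs768NSFsid.infer(..., convert_length=out_length)
    #         -> GeneratorNSF.infer_realtime(..., convert_length=convert_length)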