draft commit

nadare 2023-07-01 12:06:14 +09:00
parent fd17bb59c8
commit 92cd384486
5 changed files with 130 additions and 16 deletions

View File

@@ -109,6 +109,7 @@ class RVC(VoiceChangerModel):
if convertSize % 128 != 0:  # pad up to the model's output hop size, since truncation occurs there
convertSize = convertSize + (128 - (convertSize % 128))
outSize = convertSize - self.settings.extraConvertSize
# if the buffer has not accumulated enough samples yet, pad with zeros
if self.audio_buffer.shape[0] < convertSize:
@@ -132,15 +133,16 @@ class RVC(VoiceChangerModel):
vol = max(vol, self.prevVol * 0.0)
self.prevVol = vol
return (audio_buffer, convertSize, vol)
return (audio_buffer, convertSize, vol, outSize)
def inference(self, data):
audio = data[0]
convertSize = data[1]
vol = data[2]
outSize = data[3]
if vol < self.settings.silentThreshold:
return np.zeros(convertSize).astype(np.int16)
return (np.zeros(convertSize) * np.sqrt(vol)).astype(np.int16)  # scale before casting so the caller still receives int16
audio = torchaudio.functional.resample(audio, self.slotInfo.samplingRate, 16000, rolloff=0.99)
repeat = 1 if self.settings.rvcQuality else 0
@@ -165,6 +167,7 @@ class RVC(VoiceChangerModel):
useFinalProj,
repeat,
protect,
outSize
)
result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
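As an aside, the rounding above can be checked in isolation: convertSize is padded up to the next multiple of the model's 128-sample hop size, and outSize is what remains once the extra left context is trimmed off. A minimal sketch (the hop size of 128 is taken from the diff; the sample values are made up):

HOP_SIZE = 128

def pad_convert_size(convert_size, extra_convert_size):
    # Round up so the generator's output is not truncated at hop-size boundaries.
    if convert_size % HOP_SIZE != 0:
        convert_size += HOP_SIZE - (convert_size % HOP_SIZE)
    # outSize is the portion that is actually emitted downstream.
    return convert_size, convert_size - extra_convert_size

assert pad_convert_size(1000, 256) == (1024, 768)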

View File

@@ -3,6 +3,7 @@ from const import EnumInferenceTypes
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
from voice_changer.RVC.inferencer.Inferencer import Inferencer
from .rvc_models.infer_pack.models import SynthesizerTrnMs768NSFsid
from typing import Optional
class RVCInferencerv2(Inferencer):
@@ -32,5 +33,6 @@ class RVCInferencerv2(Inferencer):
pitch: torch.Tensor,
pitchf: torch.Tensor,
sid: torch.Tensor,
out_length: Optional[int] = None,
) -> torch.Tensor:
return self.model.infer(feats, pitch_length, pitch, pitchf, sid)
return self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=out_length)
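Because out_length defaults to None, existing call sites of Inferencer.infer keep working unchanged; only callers that know the desired output length need to pass it. A toy illustration of this optional-parameter pattern (ToyInferencer is a stand-in, not the real class):

from typing import Optional
import torch

class ToyInferencer:
    # Stand-in mimicking the new signature: the extra kwarg is optional.
    def infer(self, feats: torch.Tensor, out_length: Optional[int] = None) -> torch.Tensor:
        length = feats.shape[-1] if out_length is None else out_length
        return feats[..., -length:]

inf = ToyInferencer()
x = torch.arange(10.0).unsqueeze(0)
print(inf.infer(x).shape)                # torch.Size([1, 10]) -- old behavior
print(inf.infer(x, out_length=4).shape)  # torch.Size([1, 4])  -- trimmed tail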

View File

@@ -129,12 +129,12 @@ class SynthesizerTrnMsNSFsid(nn.Module):
o = self.dec(z_slice, pitchf, g=g)
return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None, convert_length=None):
g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], nsff0, g=g, convert_length=convert_length)
return o, x_mask, (z, z_p, m_p, logs_p)
@@ -208,10 +208,10 @@ class SynthesizerTrnMsNSFsidNono(nn.Module):
o = self.dec(z_slice, g=g)
return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
def infer(self, phone, phone_lengths, sid, max_len=None):
def infer(self, phone, phone_lengths, sid, max_len=None, convert_length=None):
g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec((z * x_mask)[:, :, :max_len], g=g)
o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], g=g, convert_length=convert_length)
return o, x_mask, (z, z_p, m_p, logs_p)
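Note that two different length controls are now in play: max_len truncates the latent sequence in feature frames before decoding, while convert_length limits how many output samples the decoder actually synthesizes. A rough sketch of the relationship, assuming an illustrative overall upsample factor of 4 (the real factor is the product of the generator's upsample_rates):

UPSAMPLE = 4           # assumed product of upsample_rates, for illustration
latent_frames = 100
max_len = 80           # latent frames kept before decoding
convert_length = 240   # output samples actually synthesized

kept = min(latent_frames, max_len)
full_out = kept * UPSAMPLE                  # samples the decoder could produce
synthesized = min(full_out, convert_length)
print(kept, full_out, synthesized)          # 80 320 240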

View File

@@ -232,6 +232,25 @@ class Generator(torch.nn.Module):
if gin_channels != 0:
self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
# Compute the minimum context size required for real-time speech conversion.
self.upsample_rates = upsample_rates  # stored here because infer_realtime needs it (the base __init__ does not keep it)
self.realtime = False
if resblock != "1":
self.realtime = True
self.ups_size = [0 for _ in range(len(self.ups))]
# conv_post
self.ups_size[-1] += 3
for i in range(len(self.ups)-1, -1, -1):
for k, d in zip(resblock_kernel_sizes[::-1], resblock_dilation_sizes[::-1]):
# conv2
self.ups_size[i] += (k - 1)//2
# conv1
self.ups_size[i] += d[-1] * (k - 1)//2
# upsampling
self.ups_size[i] = -(-self.ups_size[i] // upsample_rates[i]) + (upsample_kernel_sizes[i] - upsample_rates[i]) // 2
if i:
self.ups_size[i-1] = self.ups_size[i]
def forward(self, x, g=None):
x = self.conv_pre(x)
if g is not None:
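For intuition, the ups_size bookkeeping added above walks the network backwards, accumulating how much left context each stage needs: (k - 1)//2 per ResBlock conv (scaled by the dilation for the dilated conv), ceiling-divided by the upsample rate when crossing an upsampling layer, plus the transposed convolution's edge term (kernel - stride)//2. A standalone worked example under assumed HiFi-GAN-style hyperparameters (not necessarily the shipped RVC config):

resblock_kernel_sizes = [3, 5, 7]
resblock_dilation_sizes = [[1, 2], [2, 6], [3, 12]]
upsample_rates = [10, 8, 2, 2]
upsample_kernel_sizes = [20, 16, 4, 4]

ups_size = [0] * len(upsample_rates)
ups_size[-1] += 3  # conv_post: kernel 7 -> (7 - 1)//2 = 3 samples of context
for i in range(len(upsample_rates) - 1, -1, -1):
    for k, d in zip(resblock_kernel_sizes[::-1], resblock_dilation_sizes[::-1]):
        ups_size[i] += (k - 1) // 2          # conv2 of the ResBlock
        ups_size[i] += d[-1] * (k - 1) // 2  # conv1, dilated
    # crossing the upsampling layer: ceiling division, plus the transposed-conv edge
    ups_size[i] = -(-ups_size[i] // upsample_rates[i]) + (upsample_kernel_sizes[i] - upsample_rates[i]) // 2
    if i:
        ups_size[i - 1] = ups_size[i]  # propagate to the previous stage
print(ups_size)  # [13, 17, 45, 31] with these assumed settings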
@@ -253,6 +272,35 @@ class Generator(torch.nn.Module):
return x
def infer_realtime(self, x, g=None, convert_length=None):
out_length = x.shape[2] * np.prod(self.upsample_rates)
if convert_length is None:
convert_length = out_length
x = self.conv_pre(x)
if g is not None:
x = x + self.cond(g)
for i in range(self.num_upsamples):
if self.realtime:
# keep only the tail that can still influence the last convert_length output samples
x = x[:, :, -self.ups_size[i] + (-convert_length // np.prod(self.upsample_rates[i:])):]
x = F.leaky_relu(x, LRELU_SLOPE)
x = self.ups[i](x)
xs = None
for j in range(self.num_kernels):
if xs is None:
xs = self.resblocks[i * self.num_kernels + j](x)
else:
xs += self.resblocks[i * self.num_kernels + j](x)
x = xs / self.num_kernels
x = F.leaky_relu(x)
x = self.conv_post(x)
x = torch.tanh(x)
out = torch.zeros([x.shape[0], 1, out_length], device=x.device, dtype=x.dtype)
out[:, :, -x.shape[2]:] = x[:, :, -out.shape[2]:]
return out
def remove_weight_norm(self):
for l in self.ups:
remove_weight_norm(l)
@@ -404,6 +452,7 @@ class GeneratorNSF(torch.nn.Module):
super(GeneratorNSF, self).__init__()
self.num_kernels = len(resblock_kernel_sizes)
self.num_upsamples = len(upsample_rates)
self.upsample_rates = upsample_rates
self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
self.m_source = SourceModuleHnNSF(sampling_rate=sr, harmonic_num=0, is_half=is_half)
@@ -453,6 +502,29 @@ class GeneratorNSF(torch.nn.Module):
self.upp = np.prod(upsample_rates)
# Compute the minimum context size required for real-time speech conversion.
self.realtime = False
if resblock != "1":
self.realtime = True
self.ups_size = [0 for _ in range(len(self.ups))]
self.noise_conv_size = [0 for _ in range(len(self.ups))]
# conv_post
self.ups_size[-1] += 3
for i in range(len(self.ups)-1, -1, -1):
for k, d in zip(resblock_kernel_sizes[::-1], resblock_dilation_sizes[::-1]):
# conv2
self.ups_size[i] += (k - 1)//2
# conv1
self.ups_size[i] += d[-1] * (k - 1)//2
# noise_conv
self.noise_conv_size[i] = self.ups_size[i] * np.prod(upsample_rates[i:])
# upsampling
self.ups_size[i] = -(-self.ups_size[i] // upsample_rates[i]) + (upsample_kernel_sizes[i] - upsample_rates[i]) // 2
if i:
self.ups_size[i-1] = self.ups_size[i]
def forward(self, x, f0, g=None):
har_source, noi_source, uv = self.m_source(f0, self.upp)
har_source = har_source.transpose(1, 2)
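GeneratorNSF additionally records noise_conv_size: the harmonic source already lives at the output sampling rate, so the context each noise_convs[i] needs is the pre-upsampling context multiplied back up by the remaining upsample factors. Continuing the assumed settings from the worked example above (the per-stage values are the accumulated ResBlock context before the upsampling step):

import numpy as np

upsample_rates = [10, 8, 2, 2]
pre_upsample_context = [73, 101, 87, 59]  # from the worked example above
noise_conv_size = [c * int(np.prod(upsample_rates[i:]))
                   for i, c in enumerate(pre_upsample_context)]
print(noise_conv_size)  # [23360, 3232, 348, 118]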
@@ -477,6 +549,42 @@ class GeneratorNSF(torch.nn.Module):
x = torch.tanh(x)
return x
def infer_realtime(self, x, f0, g=None, convert_length=None):
out_length = x.shape[2] * np.prod(self.upsample_rates)
if convert_length is None:
convert_length = out_length
har_source, noi_source, uv = self.m_source(f0, self.upp)
har_source = har_source.transpose(1, 2)
x = self.conv_pre(x)
if g is not None:
x = x + self.cond(g)
for i in range(self.num_upsamples):
if self.realtime:
# keep only the tail that can still influence the last convert_length output samples
x = x[:, :, -self.ups_size[i] + (-convert_length // np.prod(self.upsample_rates[i:])):]
x = F.leaky_relu(x, LRELU_SLOPE)
x_ = self.ups[i](x)
x_source = self.noise_convs[i](har_source[:, :, -convert_length - self.noise_conv_size[i]:])
# right-align the upsampled features and the noise branch before summing
x = torch.zeros([x_.shape[0], x_.shape[1], max(x_.shape[2], x_source.shape[2])], device=x.device, dtype=x.dtype)
x[:, :, -x_.shape[2]:] += x_
x[:, :, -x_source.shape[2]:] += x_source
xs = None
for j in range(self.num_kernels):
if xs is None:
xs = self.resblocks[i * self.num_kernels + j](x)
else:
xs += self.resblocks[i * self.num_kernels + j](x)
x = xs / self.num_kernels
x = F.leaky_relu(x)
x = self.conv_post(x)
x = torch.tanh(x)
out = torch.zeros([x.shape[0], 1, out_length], device=x.device, dtype=x.dtype)
out[:, :, -x.shape[2]:] = x[:, :, -out.shape[2]:]
return out  # return the padded tensor, matching Generator.infer_realtime
def remove_weight_norm(self):
for l in self.ups:
remove_weight_norm(l)
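Both infer_realtime variants finish by writing the synthesized tail into a zero tensor of the full expected length, so callers always receive out_length samples no matter how aggressively the intermediate activations were cropped. A minimal sketch of that contract:

import torch

def pad_to_out_length(x, out_length):
    # x: (batch, 1, n_synthesized) -- may be shorter or longer than out_length
    out = torch.zeros(x.shape[0], 1, out_length, device=x.device, dtype=x.dtype)
    out[:, :, -x.shape[2]:] = x[:, :, -out.shape[2]:]
    return out

tail = torch.ones(1, 1, 300)
padded = pad_to_out_length(tail, 480)
print(padded.shape)       # torch.Size([1, 1, 480])
print(int(padded.sum()))  # 300 -- the first 180 samples stay zero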
@@ -566,12 +674,12 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
o = self.dec(z_slice, pitchf, g=g)
return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None, convert_length=None):
g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], nsff0, g=g, convert_length=convert_length)
return o, x_mask, (z, z_p, m_p, logs_p)
@@ -650,12 +758,12 @@ class SynthesizerTrnMs768NSFsid(nn.Module):
o = self.dec(z_slice, pitchf, g=g)
return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None, convert_length=None):
g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], nsff0, g=g, convert_length=convert_length)
return o, x_mask, (z, z_p, m_p, logs_p)
@@ -727,12 +835,12 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
o = self.dec(z_slice, g=g)
return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
def infer(self, phone, phone_lengths, sid, max_len=None):
def infer(self, phone, phone_lengths, sid, max_len=None, convert_length=None):
g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec((z * x_mask)[:, :, :max_len], g=g)
o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], g=g, convert_length=convert_length)
return o, x_mask, (z, z_p, m_p, logs_p)
@@ -804,12 +912,12 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module):
o = self.dec(z_slice, g=g)
return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
def infer(self, phone, phone_lengths, sid, max_len=None):
def infer(self, phone, phone_lengths, sid, max_len=None, convert_length=None):
g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec((z * x_mask)[:, :, :max_len], g=g)
o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], g=g, convert_length=convert_length)
return o, x_mask, (z, z_p, m_p, logs_p)

View File

@@ -79,6 +79,7 @@ class Pipeline(object):
useFinalProj,
repeat,
protect=0.5,
out_size=None,
):
# audio comes in at a 16000 Hz sampling rate; everything from here on is processed at 16000 Hz
@@ -206,7 +207,7 @@ class Pipeline(object):
with autocast(enabled=self.isHalf):
audio1 = (
torch.clip(
self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0].to(dtype=torch.float32),
self.inferencer.infer(feats, p_len, pitch, pitchf, sid, out_size)[0][0, 0].to(dtype=torch.float32),
-1.0,
1.0,
)
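Putting the pieces together, the new length value threads through four layers. A hypothetical trace for orientation (argument lists abbreviated; names as they appear in the diff):

# RVC.generate_input: outSize = convertSize - extraConvertSize
# RVC.inference:      the pipeline receives out_size=outSize
#   Pipeline.exec:    inferencer.infer(feats, p_len, pitch, pitchf, sid, out_size)
#     RVCInferencerv2.infer:           model.infer(..., convert_length=out_length)
#       SynthesizerTrnMs768NSFsid.infer: dec.infer_realtime(..., convert_length=convert_length)
#         GeneratorNSF.infer_realtime:   crops activations, returns out_length samples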