WIP: first communication confirmation

parent fbabfed566
commit a1db94c0af
@@ -6,15 +6,15 @@ import torch
import torchaudio
from data.ModelSlot import DiffusionSVCModelSlot
from voice_changer.DiffusionSVC.DiffusionSVCSettings import DiffusionSVCSettings
from voice_changer.DiffusionSVC.pipeline.Pipeline import Pipeline
from voice_changer.DiffusionSVC.pipeline.PipelineGenerator import createPipeline
from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager

from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
from voice_changer.utils.VoiceChangerModel import AudioInOut, PitchfInOut, FeatureInOut, VoiceChangerModel
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
from voice_changer.RVC.onnxExporter.export2onnx import export2onnx
from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
from voice_changer.RVC.pipeline.Pipeline import Pipeline

from Exceptions import DeviceCannotSupportHalfPrecisionException
@@ -165,8 +165,6 @@ class DiffusionSVC(VoiceChangerModel):
            # result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
            result = audio_out.detach().cpu().numpy()

            print("RESULT", result)

            return result
        except DeviceCannotSupportHalfPrecisionException as e:  # NOQA
            print("[Device Manager] Device cannot support half precision. Fallback to float....")
@@ -0,0 +1,169 @@

from torchaudio.transforms import Resample
import pyworld as pw
import numpy as np
import torchcrepe
import torch
import torch.nn.functional as F

CREPE_RESAMPLE_KERNEL = {}


def median_pool_1d(x, kernel_size):
    x = x.unsqueeze(1)
    x = F.pad(x, ((kernel_size - 1) // 2, kernel_size // 2), mode="reflect")
    x = x.squeeze(1)
    x = x.unfold(1, kernel_size, 1)
    x, _ = torch.sort(x, dim=-1)
    return x[:, :, (kernel_size - 1) // 2]


def masked_avg_pool_1d(x, kernel_size):
    x = x.unsqueeze(1)
    x = F.pad(x, ((kernel_size - 1) // 2, kernel_size // 2), mode="reflect")
    mask = ~torch.isnan(x)
    masked_x = torch.where(mask, x, torch.zeros_like(x))
    ones_kernel = torch.ones(x.size(1), 1, kernel_size, device=x.device)

    # Perform sum pooling
    sum_pooled = F.conv1d(
        masked_x,
        ones_kernel,
        stride=1,
        padding=0,
        groups=x.size(1),
    )

    # Count the non-masked (valid) elements in each pooling window
    valid_count = F.conv1d(
        mask.float(),
        ones_kernel,
        stride=1,
        padding=0,
        groups=x.size(1),
    )
    valid_count = valid_count.clamp(min=1)  # Avoid division by zero

    # Perform masked average pooling
    avg_pooled = sum_pooled / valid_count

    return avg_pooled.squeeze(1)
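# A quick sanity sketch of the two helpers above (toy values, approximate outputs):
#
#   pd = torch.tensor([[0.9, 0.8, 0.1, 0.85, 0.9]])
#   median_pool_1d(pd, 3)      # -> [[0.80, 0.80, 0.80, 0.85, 0.85]]: the lone 0.1 dip is gone
#
#   f0 = torch.tensor([[100.0, float("nan"), 102.0, 103.0, 104.0]])
#   masked_avg_pool_1d(f0, 3)  # -> [[100.0, 101.0, 102.5, 103.0, 103.3]]: each window averages
#                              #    only its non-NaN entries (the NaNs come from thresholding below)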
class F0_Extractor:
    def __init__(self, f0_extractor, sample_rate=44100, hop_size=512, f0_min=65, f0_max=800,
                 block_size=None, model_sampling_rate=None):
        self.block_size = block_size
        self.model_sampling_rate = model_sampling_rate
        self.f0_extractor = f0_extractor
        self.sample_rate = sample_rate
        self.hop_size = hop_size
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.transformer_f0 = None
        if f0_extractor == 'crepe':
            key_str = str(sample_rate)
            if key_str not in CREPE_RESAMPLE_KERNEL:
                CREPE_RESAMPLE_KERNEL[key_str] = Resample(sample_rate, 16000, lowpass_filter_width=128)
            self.resample_kernel = CREPE_RESAMPLE_KERNEL[key_str]
        if (self.block_size is not None) or (self.model_sampling_rate is not None):
            assert (self.block_size is not None) and (self.model_sampling_rate is not None)
            self.hop_size_follow_input = True
        else:
            self.hop_size_follow_input = False

    @torch.no_grad()
    def extract_f0(self, audio, key=0, sr=44100, silence_front=0):
        f0 = self.extract(audio.cpu().numpy(), uv_interp=True, silence_front=silence_front, sr=sr)
        # f0 = torch.from_numpy(f0).float().to(self.device).unsqueeze(-1).unsqueeze(0)
        print("[PITCH_F0_ORG1]", f0)
        f0 = torch.from_numpy(f0).float().unsqueeze(-1).unsqueeze(0)
        f0 = f0 * 2 ** (float(key) / 12)
        print("[PITCH_F0_ORG2]", f0)
        return f0

    def extract(self, audio, uv_interp=False, device=None, silence_front=0, sr=None):  # audio: 1d numpy array
        if sr is not None:
            assert self.hop_size_follow_input
            self.hop_size = self.block_size * sr / self.model_sampling_rate
            if (self.f0_extractor == 'crepe') and (sr != self.sample_rate):
                key_str = str(sr)
                if key_str not in CREPE_RESAMPLE_KERNEL:
                    CREPE_RESAMPLE_KERNEL[key_str] = Resample(sr, 16000, lowpass_filter_width=128)
                self.resample_kernel = CREPE_RESAMPLE_KERNEL[key_str]
            self.sample_rate = sr

        # extractor start time
        raw_audio = audio
        n_frames = int(len(audio) // self.hop_size) + 1

        start_frame = int(silence_front * self.sample_rate / self.hop_size)
        real_silence_front = start_frame * self.hop_size / self.sample_rate
        audio = audio[int(np.round(real_silence_front * self.sample_rate)):]

        if self.f0_extractor == 'dio':
            _f0, t = pw.dio(
                audio.astype('double'),
                self.sample_rate,
                f0_floor=self.f0_min,
                f0_ceil=self.f0_max,
                channels_in_octave=2,
                frame_period=(1000 * self.hop_size / self.sample_rate))
            f0 = pw.stonemask(audio.astype('double'), _f0, t, self.sample_rate)
            f0 = np.pad(f0.astype('float'), (start_frame, n_frames - len(f0) - start_frame))

        # extract f0 using harvest
        elif self.f0_extractor == 'harvest':
            print("[SRC AUDIO2]", audio[:10])
            print("_____hopsize______", (1000 * self.hop_size / self.sample_rate), self.sample_rate)
            f0, _ = pw.harvest(
                audio.astype('double'),
                self.sample_rate,
                f0_floor=self.f0_min,
                f0_ceil=self.f0_max,
                frame_period=(1000 * self.hop_size / self.sample_rate))
            print("[HARVEST-----1111]", f0)
            f0 = np.pad(f0.astype('float'), (start_frame, n_frames - len(f0) - start_frame))
            print("[HARVEST-----1112]", f0)

        # extract f0 using crepe
        elif self.f0_extractor == 'crepe':
            if device is None:
                device = 'cuda' if torch.cuda.is_available() else 'cpu'
            resample_kernel = self.resample_kernel.to(device)
            wav16k_torch = resample_kernel(torch.FloatTensor(audio).unsqueeze(0).to(device))

            f0, pd = torchcrepe.predict(wav16k_torch, 16000, 80, self.f0_min, self.f0_max, pad=True, model='full',
                                        batch_size=512, device=device, return_periodicity=True)
            pd = median_pool_1d(pd, 4)
            f0 = torchcrepe.threshold.At(0.05)(f0, pd)
            f0 = masked_avg_pool_1d(f0, 4)

            f0 = f0.squeeze(0).cpu().numpy()
            f0 = np.array(
                [f0[int(min(int(np.round(n * self.hop_size / self.sample_rate / 0.005)), len(f0) - 1))] for n in
                 range(n_frames - start_frame)])
            f0 = np.pad(f0, (start_frame, 0))

        elif self.f0_extractor == "transformer_f0":
            if self.transformer_f0 is None:
                from transformer_f0.model import TransformerF0Infer
                self.transformer_f0 = TransformerF0Infer(model_path='exp/f0_test_genshin/model_540000.pt')
            # raw_audio = audio
            f0 = self.transformer_f0(audio=raw_audio, sr=self.sample_rate)
            # f0 = f0.transpose(1, 2)
            # f0 = torch.nn.functional.interpolate(f0, size=int(n_frames), mode='nearest')
            # f0 = f0.transpose(1, 2)
            f0 = f0.squeeze().cpu().numpy()
            # f0 = np.pad(f0.astype('float'), (start_frame, n_frames - len(f0) - start_frame))
        else:
            raise ValueError(f" [x] Unknown f0 extractor: {self.f0_extractor}")

        # interpolate the unvoiced f0
        if uv_interp:
            uv = f0 == 0
            if len(f0[~uv]) > 0:
                f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv])
            f0[f0 < self.f0_min] = self.f0_min

        print("[HARVEST-----1113]", f0)
        return f0
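The extractor above is driven through extract_f0, which wraps extract and returns a (1, n_frames, 1) tensor shifted by key semitones. A minimal usage sketch, assuming a mono 44.1 kHz input tensor (the constructor arguments mirror the load_f0_extractor helper added to Pipeline below):

    f0ex = F0_Extractor("harvest", sample_rate=44100, hop_size=512, f0_min=50, f0_max=1100,
                        block_size=512, model_sampling_rate=44100)
    audio = torch.zeros(44100)                    # one second of silence as a stand-in input
    f0 = f0ex.extract_f0(audio, key=4, sr=44100)  # (1, n_frames, 1), shifted up 4 semitones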
@@ -9,10 +9,11 @@ from Exceptions import (
    NotEnoughDataExtimateF0,
)
from voice_changer.DiffusionSVC.inferencer.Inferencer import Inferencer
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.F0Extractor import F0_Extractor
from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor

from voice_changer.RVC.embedder.Embedder import Embedder

from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
from voice_changer.common.VolumeExtractor import VolumeExtractor
@@ -48,6 +49,7 @@ class Pipeline(object):
        self.volumeExtractor = VolumeExtractor(self.hop_size)
        self.embedder = embedder
        self.pitchExtractor = pitchExtractor
        # self.f0ex = self.load_f0_extractor(f0_model="harvest", f0_min=50, f0_max=1100)

        print("VOLUME EXTRACTOR", self.volumeExtractor)
        print("GENERATE INFERENCER", self.inferencer)
@@ -58,6 +60,18 @@ class Pipeline(object):
        self.device = device
        self.isHalf = False

    def load_f0_extractor(self, f0_model, f0_min=None, f0_max=None):
        f0_extractor = F0_Extractor(
            f0_extractor=f0_model,
            sample_rate=44100,
            hop_size=512,
            f0_min=f0_min,
            f0_max=f0_max,
            block_size=512,
            model_sampling_rate=44100
        )
        return f0_extractor

    def getPipelineInfo(self):
        volumeExtractorInfo = self.volumeExtractor.getVolumeExtractorInfo()
        inferencerInfo = self.inferencer.getInferencerInfo() if self.inferencer else {}
@@ -95,8 +109,10 @@ class Pipeline(object):

        n_frames = int(audio_pad.size(-1) // self.hop_size + 1)
        volume, mask = self.extract_volume_and_mask(audio, threhold=-60.0)

        # pitch detection
        try:
            # print("[SRC AUDIO----]", audio_pad)
            pitch, pitchf = self.pitchExtractor.extract(
                audio_pad,
                pitchf,
@@ -106,7 +122,6 @@
                int(self.hop_size),  # processing window size (512 at 44100 Hz)
                silence_front=silence_front,
            )
            print("[Pitch]", pitch)

            pitch = torch.tensor(pitch[-n_frames:], device=self.device).unsqueeze(0).long()  # the buffer was built assuming a 160-sample window, so trim to n_frames
            pitchf = torch.tensor(pitchf[-n_frames:], device=self.device, dtype=torch.float).unsqueeze(0)  # the buffer was built assuming a 160-sample window, so trim to n_frames
@@ -114,6 +129,9 @@
            # print(e)
            raise NotEnoughDataExtimateF0()

        # f0 = self.f0ex.extract_f0(audio_pad, key=4, sr=44100)
        # print("[Pitch_f0]", f0)

        # adjust tensor shapes
        feats = audio_pad
        if feats.dim() == 2:  # double channels
@@ -155,13 +173,13 @@
        # when pitch estimation fails (pitchf=0), blend in the features from before retrieval
        # how pitchff is constructed is questionable, but it follows the original implementation, so use it as-is
        # https://github.com/w-okada/voice-changer/pull/276#issuecomment-1571336929
        if protect < 0.5:
            pitchff = pitchf.clone()
            pitchff[pitchf > 0] = 1
            pitchff[pitchf < 1] = protect
            pitchff = pitchff.unsqueeze(-1)
            feats = feats * pitchff + feats0 * (1 - pitchff)
            feats = feats.to(feats0.dtype)
        # if protect < 0.5:
        #     pitchff = pitchf.clone()
        #     pitchff[pitchf > 0] = 1
        #     pitchff[pitchf < 1] = protect
        #     pitchff = pitchff.unsqueeze(-1)
        #     feats = feats * pitchff + feats0 * (1 - pitchff)
        #     feats = feats.to(feats0.dtype)

        # # apply silent front for inference
        # if type(self.inferencer) in [OnnxRVCInferencer, OnnxRVCInferencerNono]:
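Note on the protect blend above: voiced frames (pitchf > 0) keep the retrieved features with weight 1, while unvoiced frames fall back toward the pre-retrieval feats0 with weight protect. A toy trace, with hypothetical values:

    protect = 0.3
    pitchf = torch.tensor([0.0, 220.0, 0.0])   # per-frame f0, 0 = unvoiced
    pitchff = pitchf.clone()
    pitchff[pitchf > 0] = 1
    pitchff[pitchf < 1] = protect
    # pitchff is now [0.3, 1.0, 0.3]; the blend is then
    # feats * pitchff.unsqueeze(-1) + feats0 * (1 - pitchff.unsqueeze(-1))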
@@ -176,7 +194,7 @@
                torch.clip(
                    self.inferencer.infer(
                        feats,
                        pitch.unsqueeze(-1),
                        pitchf.unsqueeze(-1),
                        volume,
                        mask,
                        sid,
@@ -2,10 +2,10 @@ import traceback
from data.ModelSlot import DiffusionSVCModelSlot
from voice_changer.DiffusionSVC.inferencer.InferencerManager import InferencerManager
from voice_changer.DiffusionSVC.pipeline.Pipeline import Pipeline
from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager

from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager


def createPipeline(modelSlot: DiffusionSVCModelSlot, gpu: int, f0Detector: str):
@@ -0,0 +1,66 @@
import numpy as np
from const import PitchExtractorType
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
import onnxruntime
from voice_changer.RVC.pitchExtractor import onnxcrepe


class CrepeOnnxPitchExtractor(PitchExtractor):

    def __init__(self, pitchExtractorType: PitchExtractorType, file: str, gpu: int):
        self.pitchExtractorType = pitchExtractorType
        super().__init__()
        (
            onnxProviders,
            onnxProviderOptions,
        ) = DeviceManager.get_instance().getOnnxExecutionProvider(gpu)

        self.onnx_session = onnxruntime.InferenceSession(
            file, providers=onnxProviders, provider_options=onnxProviderOptions
        )

    def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
        n_frames = int(len(audio) // window) + 1
        start_frame = int(silence_front * sr / window)
        real_silence_front = start_frame * window / sr

        silence_front_offset = int(np.round(real_silence_front * sr))
        audio = audio[silence_front_offset:]

        f0_min = 50
        f0_max = 1100
        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
        f0_mel_max = 1127 * np.log(1 + f0_max / 700)

        precision = 10.0

        audio_num = audio.cpu()
        onnx_f0, onnx_pd = onnxcrepe.predict(
            self.onnx_session,
            audio_num,
            sr,
            precision=precision,
            fmin=f0_min,
            fmax=f0_max,
            batch_size=256,
            return_periodicity=True,
            decoder=onnxcrepe.decode.weighted_argmax,
        )

        f0 = onnxcrepe.filter.median(onnx_f0, 3)
        pd = onnxcrepe.filter.median(onnx_pd, 3)

        f0[pd < 0.1] = 0
        f0 = f0.squeeze()

        f0 *= pow(2, f0_up_key / 12)
        pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]]
        f0bak = pitchf.copy()
        f0_mel = 1127.0 * np.log(1.0 + f0bak / 700.0)
        f0_mel = np.clip(
            (f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0
        )
        pitch_coarse = f0_mel.astype(int)

        return pitch_coarse, pitchf
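The closing block, repeated in each extractor below, quantizes f0 onto 255 coarse mel-scale bins between f0_min and f0_max. Worked numbers (approximate):

    mel = lambda f: 1127 * np.log(1 + f / 700)
    mel_min, mel_max = mel(50), mel(1100)      # ~77.8 and ~1064.4
    bins = np.clip((mel(f0) - mel_min) * 254 / (mel_max - mel_min) + 1, 1, 255).astype(int)
    # f0 = 0 Hz clips to bin 1 (unvoiced); 110 Hz lands near bin 23, 440 Hz near bin 122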
@@ -0,0 +1,59 @@
import torchcrepe
import torch
import numpy as np
from const import PitchExtractorType

from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor


class CrepePitchExtractor(PitchExtractor):

    def __init__(self):
        super().__init__()
        self.pitchExtractorType: PitchExtractorType = "crepe"
        if torch.cuda.is_available():
            self.device = torch.device("cuda:" + str(torch.cuda.current_device()))
        else:
            self.device = torch.device("cpu")

    def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
        n_frames = int(len(audio) // window) + 1
        start_frame = int(silence_front * sr / window)
        real_silence_front = start_frame * window / sr

        silence_front_offset = int(np.round(real_silence_front * sr))
        audio = audio[silence_front_offset:]

        f0_min = 50
        f0_max = 1100
        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
        f0_mel_max = 1127 * np.log(1 + f0_max / 700)

        f0, pd = torchcrepe.predict(
            audio.unsqueeze(0),
            sr,
            hop_length=window,
            fmin=f0_min,
            fmax=f0_max,
            # model="tiny",
            model="full",
            batch_size=256,
            decoder=torchcrepe.decode.weighted_argmax,
            device=self.device,
            return_periodicity=True,
        )
        f0 = torchcrepe.filter.median(f0, 3)  # the original uses mean; use a median filter to match harvest
        pd = torchcrepe.filter.median(pd, 3)
        f0[pd < 0.1] = 0
        f0 = f0.squeeze()

        f0 *= pow(2, f0_up_key / 12)
        pitchf[-f0.shape[0]:] = f0.detach().cpu().numpy()[:pitchf.shape[0]]
        f0bak = pitchf.copy()
        f0_mel = 1127.0 * np.log(1.0 + f0bak / 700.0)
        f0_mel = np.clip(
            (f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0
        )
        pitch_coarse = f0_mel.astype(int)

        return pitch_coarse, pitchf
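A subtlety shared by these extract implementations: the fresh estimate is written into the tail of the caller's rolling pitchf buffer, leaving older frames untouched. Toy shapes for illustration:

    pitchf = np.zeros(8)                     # caller's rolling f0 buffer
    f0 = np.array([100.0, 101.0, 102.0])     # estimate for the newest frames
    pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]]
    # pitchf -> [0, 0, 0, 0, 0, 100, 101, 102]; the [:pitchf.shape[0]] slice
    # guards the case where f0 is longer than the buffer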
@@ -0,0 +1,49 @@
import pyworld
import numpy as np
from const import PitchExtractorType

from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor


class DioPitchExtractor(PitchExtractor):

    def __init__(self):
        super().__init__()
        self.pitchExtractorType: PitchExtractorType = "dio"

    def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
        audio = audio.detach().cpu().numpy()
        n_frames = int(len(audio) // window) + 1  # NOQA
        start_frame = int(silence_front * sr / window)
        real_silence_front = start_frame * window / sr

        silence_front_offset = max(min(int(np.round(real_silence_front * sr)), len(audio) - 3000), 0)
        audio = audio[silence_front_offset:]

        f0_min = 50
        f0_max = 1100
        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
        f0_mel_max = 1127 * np.log(1 + f0_max / 700)

        _f0, t = pyworld.dio(
            audio.astype(np.double),
            sr,
            f0_floor=f0_min,
            f0_ceil=f0_max,
            channels_in_octave=2,
            frame_period=10,
        )
        f0 = pyworld.stonemask(audio.astype(np.double), _f0, t, sr)

        f0 *= pow(2, f0_up_key / 12)
        pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]]
        f0bak = pitchf.copy()
        f0_mel = 1127 * np.log(1 + f0bak / 700)
        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
            f0_mel_max - f0_mel_min
        ) + 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > 255] = 255
        pitch_coarse = np.rint(f0_mel).astype(int)

        return pitch_coarse, pitchf
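Unlike the crepe extractors, this one clamps the silence-skip offset so that at least 3000 samples remain, presumably because pyworld.dio needs a minimum amount of audio to analyze. With hypothetical numbers:

    # len(audio) = 16000, raw offset = 14000
    # max(min(14000, 16000 - 3000), 0) = 13000 -> 3000 samples are still analyzed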
@@ -0,0 +1,112 @@
import pyworld
import numpy as np
import scipy.signal as signal
from const import PitchExtractorType

from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor


class HarvestPitchExtractor(PitchExtractor):

    def __init__(self):
        super().__init__()
        self.pitchExtractorType: PitchExtractorType = "harvest"

    def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
        audio = audio.detach().cpu().numpy()
        n_frames = int(len(audio) // window) + 1  # NOQA
        start_frame = int(silence_front * sr / window)
        real_silence_front = start_frame * window / sr

        # silence_front_offset = int(np.round(real_silence_front * sr))
        # audio = audio[silence_front_offset:]

        f0_min = 50
        f0_max = 1100
        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
        f0_mel_max = 1127 * np.log(1 + f0_max / 700)

        f0 = self.extract2(audio, uv_interp=True, hop_size=window, silence_front=silence_front)
        f0 = f0 * 2 ** (float(f0_up_key) / 12)
        pitchf = f0

        # f0, t = pyworld.harvest(
        #     audio.astype(np.double),
        #     fs=sr,
        #     f0_ceil=f0_max,
        #     frame_period=10,
        # )
        # f0 = pyworld.stonemask(audio.astype(np.double), f0, t, sr)
        # f0 = signal.medfilt(f0, 3)

        # f0 *= pow(2, f0_up_key / 12)
        # pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]]
        f0bak = pitchf.copy()
        f0_mel = 1127 * np.log(1 + f0bak / 700)
        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
            f0_mel_max - f0_mel_min
        ) + 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > 255] = 255
        pitch_coarse = np.rint(f0_mel).astype(int)

        return pitch_coarse, pitchf

    def extract2(self, audio, uv_interp, hop_size: int, silence_front=0):  # audio: 1d numpy array
        n_frames = int(len(audio) // hop_size) + 1

        start_frame = int(silence_front * 16000 / hop_size)
        real_silence_front = start_frame * hop_size / 16000
        audio = audio[int(np.round(real_silence_front * 16000)):]

        f0, _ = pyworld.harvest(
            audio.astype('double'),
            16000,
            f0_floor=50,
            f0_ceil=1100,
            frame_period=(1000 * hop_size / 16000))
        f0 = np.pad(f0.astype('float'), (start_frame, n_frames - len(f0) - start_frame))

        if uv_interp:
            uv = f0 == 0
            if len(f0[~uv]) > 0:
                f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv])
            f0[f0 < 50] = 50

        return f0

    def extract_old(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
        audio = audio.detach().cpu().numpy()
        n_frames = int(len(audio) // window) + 1  # NOQA
        start_frame = int(silence_front * sr / window)
        real_silence_front = start_frame * window / sr

        silence_front_offset = int(np.round(real_silence_front * sr))
        audio = audio[silence_front_offset:]

        f0_min = 50
        f0_max = 1100
        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
        f0_mel_max = 1127 * np.log(1 + f0_max / 700)

        f0, t = pyworld.harvest(
            audio.astype(np.double),
            fs=sr,
            f0_ceil=f0_max,
            frame_period=10,
        )
        f0 = pyworld.stonemask(audio.astype(np.double), f0, t, sr)
        f0 = signal.medfilt(f0, 3)

        f0 *= pow(2, f0_up_key / 12)
        pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]]
        f0bak = pitchf.copy()
        f0_mel = 1127 * np.log(1 + f0bak / 700)
        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
            f0_mel_max - f0_mel_min
        ) + 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > 255] = 255
        pitch_coarse = np.rint(f0_mel).astype(int)

        return pitch_coarse, pitchf
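extract2 assumes 16 kHz audio (extract_old, kept for reference, uses the caller's sr instead), runs harvest only past the silent front, and left-pads the result back to the full frame count. Checking the arithmetic with hypothetical numbers:

    # hop_size = 160 at 16 kHz -> 10 ms frames; len(audio) = 16000 -> n_frames = 101
    # silence_front = 0.5 s -> start_frame = 50, so 8000 samples are skipped
    # harvest on the remaining 8000 samples yields 51 frames
    # np.pad(f0, (50, 101 - 51 - 50)) restores exactly 101 frames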
@@ -0,0 +1,12 @@
from typing import Protocol


class PitchExtractor(Protocol):

    def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
        ...

    def getPitchExtractorInfo(self):
        return {
            "pitchExtractorType": self.pitchExtractorType,
        }
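Since PitchExtractor is a typing.Protocol, conformance is structural; the extractors above subclass it anyway, which also inherits the default getPitchExtractorInfo. A minimal conforming stub with hypothetical names:

    class ConstantPitchExtractor(PitchExtractor):       # inherits getPitchExtractorInfo
        def __init__(self):
            self.pitchExtractorType = "constant"        # hypothetical type label

        def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
            pitchf[:] = 440.0 * pow(2, f0_up_key / 12)  # pretend every frame is A4
            return np.full(len(pitchf), 128, dtype=int), pitchf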
@@ -0,0 +1,41 @@
from typing import Protocol
from const import PitchExtractorType
from voice_changer.DiffusionSVC.pitchExtractor.HarvestPitchExtractor import HarvestPitchExtractor
from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams


class PitchExtractorManager(Protocol):
    currentPitchExtractor: PitchExtractor | None = None
    params: VoiceChangerParams

    @classmethod
    def initialize(cls, params: VoiceChangerParams):
        cls.params = params

    @classmethod
    def getPitchExtractor(
        cls, pitchExtractorType: PitchExtractorType, gpu: int
    ) -> PitchExtractor:
        cls.currentPitchExtractor = cls.loadPitchExtractor(pitchExtractorType, gpu)
        return cls.currentPitchExtractor

    @classmethod
    def loadPitchExtractor(
        cls, pitchExtractorType: PitchExtractorType, gpu: int
    ) -> PitchExtractor:
        if pitchExtractorType == "harvest":
            return HarvestPitchExtractor()
        # elif pitchExtractorType == "dio":
        #     return DioPitchExtractor()
        # elif pitchExtractorType == "crepe":
        #     return CrepePitchExtractor()
        # elif pitchExtractorType == "crepe_tiny":
        #     return CrepeOnnxPitchExtractor(pitchExtractorType, cls.params.crepe_onnx_tiny, gpu)
        # elif pitchExtractorType == "crepe_full":
        #     return CrepeOnnxPitchExtractor(pitchExtractorType, cls.params.crepe_onnx_full, gpu)
        else:
            raise RuntimeError(
                "[Voice Changer] PitchExtractor not found", pitchExtractorType
            )
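Wiring it together: createPipeline (in PipelineGenerator above) is the intended caller, but the manager can be exercised directly. A sketch, assuming params is a populated VoiceChangerParams and that gpu=-1 selects the CPU:

    PitchExtractorManager.initialize(params)
    extractor = PitchExtractorManager.getPitchExtractor("harvest", gpu=-1)
    print(extractor.getPitchExtractorInfo())   # {'pitchExtractorType': 'harvest'}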
@@ -255,6 +255,7 @@ class VoiceChanger:
            # post-processing
            with Timer("post-process") as t:
                result = result.astype(np.int16)

                if self.settings.outputSampleRate != processing_sampling_rate:
                    # print(
                    #     "output samplingrate",
@@ -291,6 +292,7 @@ class VoiceChanger:

            print_convert_processing(f" [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}")
            perf = [preprocess_time, mainprocess_time, postprocess_time]

            return outputData, perf

        except NoModeLoadedException as e: