WIP: first communication confirmation

commit a1db94c0af (parent fbabfed566)
@@ -6,15 +6,15 @@ import torch
 import torchaudio
 from data.ModelSlot import DiffusionSVCModelSlot
 from voice_changer.DiffusionSVC.DiffusionSVCSettings import DiffusionSVCSettings
+from voice_changer.DiffusionSVC.pipeline.Pipeline import Pipeline
 from voice_changer.DiffusionSVC.pipeline.PipelineGenerator import createPipeline
+from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager

-from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
 from voice_changer.utils.VoiceChangerModel import AudioInOut, PitchfInOut, FeatureInOut, VoiceChangerModel
 from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
+from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
 from voice_changer.RVC.onnxExporter.export2onnx import export2onnx
-from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager
 from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
-from voice_changer.RVC.pipeline.Pipeline import Pipeline

 from Exceptions import DeviceCannotSupportHalfPrecisionException

@@ -165,8 +165,6 @@ class DiffusionSVC(VoiceChangerModel):
             # result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
             result = audio_out.detach().cpu().numpy()

-            print("RESULT", result)
-
             return result
         except DeviceCannotSupportHalfPrecisionException as e:  # NOQA
             print("[Device Manager] Device cannot support half precision. Fallback to float....")
@@ -0,0 +1,169 @@
+
+from torchaudio.transforms import Resample
+import pyworld as pw
+import numpy as np
+import torchcrepe
+import torch
+import torch.nn.functional as F
+CREPE_RESAMPLE_KERNEL = {}
+
+
+def median_pool_1d(x, kernel_size):
+    x = x.unsqueeze(1)
+    x = F.pad(x, ((kernel_size - 1) // 2, kernel_size // 2), mode="reflect")
+    x = x.squeeze(1)
+    x = x.unfold(1, kernel_size, 1)
+    x, _ = torch.sort(x, dim=-1)
+    return x[:, :, (kernel_size - 1) // 2]
+
+
+def masked_avg_pool_1d(x, kernel_size):
+    x = x.unsqueeze(1)
+    x = F.pad(x, ((kernel_size - 1) // 2, kernel_size // 2), mode="reflect")
+    mask = ~torch.isnan(x)
+    masked_x = torch.where(mask, x, torch.zeros_like(x))
+    ones_kernel = torch.ones(x.size(1), 1, kernel_size, device=x.device)
+
+    # Perform sum pooling
+    sum_pooled = F.conv1d(
+        masked_x,
+        ones_kernel,
+        stride=1,
+        padding=0,
+        groups=x.size(1),
+    )
+
+    # Count the non-masked (valid) elements in each pooling window
+    valid_count = F.conv1d(
+        mask.float(),
+        ones_kernel,
+        stride=1,
+        padding=0,
+        groups=x.size(1),
+    )
+    valid_count = valid_count.clamp(min=1)  # Avoid division by zero
+
+    # Perform masked average pooling
+    avg_pooled = sum_pooled / valid_count
+
+    return avg_pooled.squeeze(1)
+
+
+class F0_Extractor:
+    def __init__(self, f0_extractor, sample_rate=44100, hop_size=512, f0_min=65, f0_max=800,
+                 block_size=None, model_sampling_rate=None):
+        self.block_size = block_size
+        self.model_sampling_rate = model_sampling_rate
+        self.f0_extractor = f0_extractor
+        self.sample_rate = sample_rate
+        self.hop_size = hop_size
+        self.f0_min = f0_min
+        self.f0_max = f0_max
+        self.transformer_f0 = None
+        if f0_extractor == 'crepe':
+            key_str = str(sample_rate)
+            if key_str not in CREPE_RESAMPLE_KERNEL:
+                CREPE_RESAMPLE_KERNEL[key_str] = Resample(sample_rate, 16000, lowpass_filter_width=128)
+            self.resample_kernel = CREPE_RESAMPLE_KERNEL[key_str]
+        if (self.block_size is not None) or (self.model_sampling_rate is not None):
+            assert (self.block_size is not None) and (self.model_sampling_rate is not None)
+            self.hop_size_follow_input = True
+        else:
+            self.hop_size_follow_input = False
+
+    @torch.no_grad()
+    def extract_f0(self, audio, key=0, sr=44100, silence_front=0):
+        f0 = self.extract(audio.cpu().numpy(), uv_interp=True, silence_front=silence_front, sr=sr)
+        # f0 = torch.from_numpy(f0).float().to(self.device).unsqueeze(-1).unsqueeze(0)
+        print("[PITCH_F0_ORG1]", f0)
+        f0 = torch.from_numpy(f0).float().unsqueeze(-1).unsqueeze(0)
+        f0 = f0 * 2 ** (float(key) / 12)
+        print("[PITCH_F0_ORG2]", f0)
+        return f0
+
+    def extract(self, audio, uv_interp=False, device=None, silence_front=0, sr=None):  # audio: 1d numpy array
+        if sr is not None:
+            assert self.hop_size_follow_input
+            self.hop_size = self.block_size * sr / self.model_sampling_rate
+            if (self.f0_extractor == 'crepe') and (sr != self.sample_rate):
+                key_str = str(sr)
+                if key_str not in CREPE_RESAMPLE_KERNEL:
+                    CREPE_RESAMPLE_KERNEL[key_str] = Resample(sr, 16000, lowpass_filter_width=128)
+                self.resample_kernel = CREPE_RESAMPLE_KERNEL[key_str]
+            self.sample_rate = sr
+
+        # extractor start time
+        raw_audio = audio
+        n_frames = int(len(audio) // self.hop_size) + 1
+
+        start_frame = int(silence_front * self.sample_rate / self.hop_size)
+        real_silence_front = start_frame * self.hop_size / self.sample_rate
+        audio = audio[int(np.round(real_silence_front * self.sample_rate)):]
+
+        if self.f0_extractor == 'dio':
+            _f0, t = pw.dio(
+                audio.astype('double'),
+                self.sample_rate,
+                f0_floor=self.f0_min,
+                f0_ceil=self.f0_max,
+                channels_in_octave=2,
+                frame_period=(1000 * self.hop_size / self.sample_rate))
+            f0 = pw.stonemask(audio.astype('double'), _f0, t, self.sample_rate)
+            f0 = np.pad(f0.astype('float'), (start_frame, n_frames - len(f0) - start_frame))
+
+        # extract f0 using harvest
+        elif self.f0_extractor == 'harvest':
+            print("[SRC AUDIO2]", audio[:10])
+            print("_____hopsize______", (1000 * self.hop_size / self.sample_rate), self.sample_rate)
+            f0, _ = pw.harvest(
+                audio.astype('double'),
+                self.sample_rate,
+                f0_floor=self.f0_min,
+                f0_ceil=self.f0_max,
+                frame_period=(1000 * self.hop_size / self.sample_rate))
+            print("[HARVEST-----1111]", f0)
+            f0 = np.pad(f0.astype('float'), (start_frame, n_frames - len(f0) - start_frame))
+            print("[HARVEST-----1112]", f0)
+
+        # extract f0 using crepe
+        elif self.f0_extractor == 'crepe':
+            if device is None:
+                device = 'cuda' if torch.cuda.is_available() else 'cpu'
+            resample_kernel = self.resample_kernel.to(device)
+            wav16k_torch = resample_kernel(torch.FloatTensor(audio).unsqueeze(0).to(device))
+
+            f0, pd = torchcrepe.predict(wav16k_torch, 16000, 80, self.f0_min, self.f0_max, pad=True, model='full',
+                                        batch_size=512, device=device, return_periodicity=True)
+            pd = median_pool_1d(pd, 4)
+            f0 = torchcrepe.threshold.At(0.05)(f0, pd)
+            f0 = masked_avg_pool_1d(f0, 4)
+
+            f0 = f0.squeeze(0).cpu().numpy()
+            f0 = np.array(
+                [f0[int(min(int(np.round(n * self.hop_size / self.sample_rate / 0.005)), len(f0) - 1))] for n in
+                 range(n_frames - start_frame)])
+            f0 = np.pad(f0, (start_frame, 0))
+
+        elif self.f0_extractor == "transformer_f0":
+            if self.transformer_f0 is None:
+                from transformer_f0.model import TransformerF0Infer
+                self.transformer_f0 = TransformerF0Infer(model_path='exp/f0_test_genshin/model_540000.pt')
+            # raw_audio = audio
+            f0 = self.transformer_f0(audio=raw_audio, sr=self.sample_rate)
+            # f0 = f0.transpose(1, 2)
+            # f0 = torch.nn.functional.interpolate(f0, size=int(n_frames), mode='nearest')
+            # f0 = f0.transpose(1, 2)
+            f0 = f0.squeeze().cpu().numpy()
+            # f0 = np.pad(f0.astype('float'), (start_frame, n_frames - len(f0) - start_frame))
+        else:
+            raise ValueError(f" [x] Unknown f0 extractor: {self.f0_extractor}")
+
+        # interpolate the unvoiced f0
+        if uv_interp:
+            uv = f0 == 0
+            if len(f0[~uv]) > 0:
+                f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv])
+            f0[f0 < self.f0_min] = self.f0_min
+
+        print("[HARVEST-----1113]", f0)
+        return f0
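
The F0_Extractor added above selects among dio, harvest, crepe, and transformer_f0 backends and returns a padded, optionally uv-interpolated f0 curve. A minimal, hypothetical usage sketch (illustrative values; the import path assumes the module is the F0Extractor referenced by the Pipeline hunk below, and pyworld must be installed):

    import torch
    from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.F0Extractor import F0_Extractor

    extractor = F0_Extractor("harvest", sample_rate=44100, hop_size=512,
                             f0_min=65, f0_max=800,
                             block_size=512, model_sampling_rate=44100)
    audio = torch.zeros(44100)                          # one second of silence at 44.1 kHz
    f0 = extractor.extract_f0(audio, key=0, sr=44100)   # tensor of shape (1, n_frames, 1)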
@@ -9,10 +9,11 @@ from Exceptions import (
     NotEnoughDataExtimateF0,
 )
 from voice_changer.DiffusionSVC.inferencer.Inferencer import Inferencer
+from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.F0Extractor import F0_Extractor
+from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor

 from voice_changer.RVC.embedder.Embedder import Embedder

-from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
 from voice_changer.common.VolumeExtractor import VolumeExtractor


@@ -48,6 +49,7 @@ class Pipeline(object):
         self.volumeExtractor = VolumeExtractor(self.hop_size)
         self.embedder = embedder
         self.pitchExtractor = pitchExtractor
+        # self.f0ex = self.load_f0_extractor(f0_model="harvest", f0_min=50, f0_max=1100)

         print("VOLUME EXTRACTOR", self.volumeExtractor)
         print("GENERATE INFERENCER", self.inferencer)
@@ -58,6 +60,18 @@ class Pipeline(object):
         self.device = device
         self.isHalf = False

+    def load_f0_extractor(self, f0_model, f0_min=None, f0_max=None):
+        f0_extractor = F0_Extractor(
+            f0_extractor=f0_model,
+            sample_rate=44100,
+            hop_size=512,
+            f0_min=f0_min,
+            f0_max=f0_max,
+            block_size=512,
+            model_sampling_rate=44100
+        )
+        return f0_extractor
+
     def getPipelineInfo(self):
         volumeExtractorInfo = self.volumeExtractor.getVolumeExtractorInfo()
         inferencerInfo = self.inferencer.getInferencerInfo() if self.inferencer else {}
@@ -95,8 +109,10 @@ class Pipeline(object):

         n_frames = int(audio_pad.size(-1) // self.hop_size + 1)
         volume, mask = self.extract_volume_and_mask(audio, threhold=-60.0)
+
         # pitch detection
         try:
+            # print("[SRC AUDIO----]", audio_pad)
             pitch, pitchf = self.pitchExtractor.extract(
                 audio_pad,
                 pitchf,
@@ -106,7 +122,6 @@ class Pipeline(object):
                 int(self.hop_size),  # processing window size (512 at 44100 Hz)
                 silence_front=silence_front,
             )
-            print("[Pitch]", pitch)

             pitch = torch.tensor(pitch[-n_frames:], device=self.device).unsqueeze(0).long()  # the buffer assumes a 160-sample window size, so trim here
             pitchf = torch.tensor(pitchf[-n_frames:], device=self.device, dtype=torch.float).unsqueeze(0)  # the buffer assumes a 160-sample window size, so trim here
@@ -114,6 +129,9 @@ class Pipeline(object):
             # print(e)
             raise NotEnoughDataExtimateF0()

+        # f0 = self.f0ex.extract_f0(audio_pad, key=4, sr=44100)
+        # print("[Pitch_f0]", f0)
+
         # adjust tensor shape/type
         feats = audio_pad
         if feats.dim() == 2:  # double channels
@@ -155,13 +173,13 @@ class Pipeline(object):
         # If pitch estimation fails (pitchf = 0), blend in the features from before the search.
         # How pitchff is built is questionable, but it matches the upstream implementation, so it is used as-is.
         # https://github.com/w-okada/voice-changer/pull/276#issuecomment-1571336929
-        if protect < 0.5:
-            pitchff = pitchf.clone()
-            pitchff[pitchf > 0] = 1
-            pitchff[pitchf < 1] = protect
-            pitchff = pitchff.unsqueeze(-1)
-            feats = feats * pitchff + feats0 * (1 - pitchff)
-            feats = feats.to(feats0.dtype)
+        # if protect < 0.5:
+        #     pitchff = pitchf.clone()
+        #     pitchff[pitchf > 0] = 1
+        #     pitchff[pitchf < 1] = protect
+        #     pitchff = pitchff.unsqueeze(-1)
+        #     feats = feats * pitchff + feats0 * (1 - pitchff)
+        #     feats = feats.to(feats0.dtype)

         # # apply silent front for inference
         # if type(self.inferencer) in [OnnxRVCInferencer, OnnxRVCInferencerNono]:
@@ -176,7 +194,7 @@ class Pipeline(object):
                 torch.clip(
                     self.inferencer.infer(
                         feats,
-                        pitch.unsqueeze(-1),
+                        pitchf.unsqueeze(-1),
                         volume,
                         mask,
                         sid,
@@ -2,10 +2,10 @@ import traceback
 from data.ModelSlot import DiffusionSVCModelSlot
 from voice_changer.DiffusionSVC.inferencer.InferencerManager import InferencerManager
 from voice_changer.DiffusionSVC.pipeline.Pipeline import Pipeline
+from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager

 from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
 from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
-from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager


 def createPipeline(modelSlot: DiffusionSVCModelSlot, gpu: int, f0Detector: str):
@@ -0,0 +1,66 @@
+import numpy as np
+from const import PitchExtractorType
+from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
+from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
+import onnxruntime
+from voice_changer.RVC.pitchExtractor import onnxcrepe
+
+
+class CrepeOnnxPitchExtractor(PitchExtractor):
+
+    def __init__(self, pitchExtractorType: PitchExtractorType, file: str, gpu: int):
+        self.pitchExtractorType = pitchExtractorType
+        super().__init__()
+        (
+            onnxProviders,
+            onnxProviderOptions,
+        ) = DeviceManager.get_instance().getOnnxExecutionProvider(gpu)
+
+        self.onnx_session = onnxruntime.InferenceSession(
+            file, providers=onnxProviders, provider_options=onnxProviderOptions
+        )
+
+    def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
+        n_frames = int(len(audio) // window) + 1
+        start_frame = int(silence_front * sr / window)
+        real_silence_front = start_frame * window / sr
+
+        silence_front_offset = int(np.round(real_silence_front * sr))
+        audio = audio[silence_front_offset:]
+
+        f0_min = 50
+        f0_max = 1100
+        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+
+        precision = 10.0
+
+        audio_num = audio.cpu()
+        onnx_f0, onnx_pd = onnxcrepe.predict(
+            self.onnx_session,
+            audio_num,
+            sr,
+            precision=precision,
+            fmin=f0_min,
+            fmax=f0_max,
+            batch_size=256,
+            return_periodicity=True,
+            decoder=onnxcrepe.decode.weighted_argmax,
+        )
+
+        f0 = onnxcrepe.filter.median(onnx_f0, 3)
+        pd = onnxcrepe.filter.median(onnx_pd, 3)
+
+        f0[pd < 0.1] = 0
+        f0 = f0.squeeze()
+
+        f0 *= pow(2, f0_up_key / 12)
+        pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]]
+        f0bak = pitchf.copy()
+        f0_mel = 1127.0 * np.log(1.0 + f0bak / 700.0)
+        f0_mel = np.clip(
+            (f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0
+        )
+        pitch_coarse = f0_mel.astype(int)
+
+        return pitch_coarse, pitchf
@@ -0,0 +1,59 @@
+import torchcrepe
+import torch
+import numpy as np
+from const import PitchExtractorType
+
+from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
+
+
+class CrepePitchExtractor(PitchExtractor):
+
+    def __init__(self):
+        super().__init__()
+        self.pitchExtractorType: PitchExtractorType = "crepe"
+        if torch.cuda.is_available():
+            self.device = torch.device("cuda:" + str(torch.cuda.current_device()))
+        else:
+            self.device = torch.device("cpu")
+
+    def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
+        n_frames = int(len(audio) // window) + 1
+        start_frame = int(silence_front * sr / window)
+        real_silence_front = start_frame * window / sr
+
+        silence_front_offset = int(np.round(real_silence_front * sr))
+        audio = audio[silence_front_offset:]
+
+        f0_min = 50
+        f0_max = 1100
+        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+
+        f0, pd = torchcrepe.predict(
+            audio.unsqueeze(0),
+            sr,
+            hop_length=window,
+            fmin=f0_min,
+            fmax=f0_max,
+            # model="tiny",
+            model="full",
+            batch_size=256,
+            decoder=torchcrepe.decode.weighted_argmax,
+            device=self.device,
+            return_periodicity=True,
+        )
+        f0 = torchcrepe.filter.median(f0, 3)  # upstream uses mean, but apply a median filter to match harvest
+        pd = torchcrepe.filter.median(pd, 3)
+        f0[pd < 0.1] = 0
+        f0 = f0.squeeze()
+
+        f0 *= pow(2, f0_up_key / 12)
+        pitchf[-f0.shape[0]:] = f0.detach().cpu().numpy()[:pitchf.shape[0]]
+        f0bak = pitchf.copy()
+        f0_mel = 1127.0 * np.log(1.0 + f0bak / 700.0)
+        f0_mel = np.clip(
+            (f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0
+        )
+        pitch_coarse = f0_mel.astype(int)
+
+        return pitch_coarse, pitchf
@@ -0,0 +1,49 @@
+import pyworld
+import numpy as np
+from const import PitchExtractorType
+
+from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
+
+
+class DioPitchExtractor(PitchExtractor):
+
+    def __init__(self):
+        super().__init__()
+        self.pitchExtractorType: PitchExtractorType = "dio"
+
+    def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
+        audio = audio.detach().cpu().numpy()
+        n_frames = int(len(audio) // window) + 1  # NOQA
+        start_frame = int(silence_front * sr / window)
+        real_silence_front = start_frame * window / sr
+
+        silence_front_offset = max(min(int(np.round(real_silence_front * sr)), len(audio) - 3000), 0)
+        audio = audio[silence_front_offset:]
+
+        f0_min = 50
+        f0_max = 1100
+        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+
+        _f0, t = pyworld.dio(
+            audio.astype(np.double),
+            sr,
+            f0_floor=f0_min,
+            f0_ceil=f0_max,
+            channels_in_octave=2,
+            frame_period=10,
+        )
+        f0 = pyworld.stonemask(audio.astype(np.double), _f0, t, sr)
+
+        f0 *= pow(2, f0_up_key / 12)
+        pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]]
+        f0bak = pitchf.copy()
+        f0_mel = 1127 * np.log(1 + f0bak / 700)
+        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
+            f0_mel_max - f0_mel_min
+        ) + 1
+        f0_mel[f0_mel <= 1] = 1
+        f0_mel[f0_mel > 255] = 255
+        pitch_coarse = np.rint(f0_mel).astype(int)
+
+        return pitch_coarse, pitchf
@@ -0,0 +1,112 @@
+import pyworld
+import numpy as np
+import scipy.signal as signal
+from const import PitchExtractorType
+
+from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
+
+
+class HarvestPitchExtractor(PitchExtractor):
+
+    def __init__(self):
+        super().__init__()
+        self.pitchExtractorType: PitchExtractorType = "harvest"
+
+    def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
+        audio = audio.detach().cpu().numpy()
+        n_frames = int(len(audio) // window) + 1  # NOQA
+        start_frame = int(silence_front * sr / window)
+        real_silence_front = start_frame * window / sr
+
+        # silence_front_offset = int(np.round(real_silence_front * sr))
+        # audio = audio[silence_front_offset:]
+
+        f0_min = 50
+        f0_max = 1100
+        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+
+        f0 = self.extract2(audio, uv_interp=True, hop_size=window, silence_front=silence_front)
+        f0 = f0 * 2 ** (float(f0_up_key) / 12)
+        pitchf = f0
+
+        # f0, t = pyworld.harvest(
+        #     audio.astype(np.double),
+        #     fs=sr,
+        #     f0_ceil=f0_max,
+        #     frame_period=10,
+        # )
+        # f0 = pyworld.stonemask(audio.astype(np.double), f0, t, sr)
+        # f0 = signal.medfilt(f0, 3)
+
+        # f0 *= pow(2, f0_up_key / 12)
+        # pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]]
+        f0bak = pitchf.copy()
+        f0_mel = 1127 * np.log(1 + f0bak / 700)
+        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
+            f0_mel_max - f0_mel_min
+        ) + 1
+        f0_mel[f0_mel <= 1] = 1
+        f0_mel[f0_mel > 255] = 255
+        pitch_coarse = np.rint(f0_mel).astype(int)
+
+        return pitch_coarse, pitchf
+
+    def extract2(self, audio, uv_interp, hop_size: int, silence_front=0):  # audio: 1d numpy array
+        n_frames = int(len(audio) // hop_size) + 1
+
+        start_frame = int(silence_front * 16000 / hop_size)
+        real_silence_front = start_frame * hop_size / 16000
+        audio = audio[int(np.round(real_silence_front * 16000)):]
+
+        f0, _ = pyworld.harvest(
+            audio.astype('double'),
+            16000,
+            f0_floor=50,
+            f0_ceil=1100,
+            frame_period=(1000 * hop_size / 16000))
+        f0 = np.pad(f0.astype('float'), (start_frame, n_frames - len(f0) - start_frame))
+
+        if uv_interp:
+            uv = f0 == 0
+            if len(f0[~uv]) > 0:
+                f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv])
+            f0[f0 < 50] = 50
+
+        return f0
+
+    def extract_old(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
+        audio = audio.detach().cpu().numpy()
+        n_frames = int(len(audio) // window) + 1  # NOQA
+        start_frame = int(silence_front * sr / window)
+        real_silence_front = start_frame * window / sr
+
+        silence_front_offset = int(np.round(real_silence_front * sr))
+        audio = audio[silence_front_offset:]
+
+        f0_min = 50
+        f0_max = 1100
+        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+
+        f0, t = pyworld.harvest(
+            audio.astype(np.double),
+            fs=sr,
+            f0_ceil=f0_max,
+            frame_period=10,
+        )
+        f0 = pyworld.stonemask(audio.astype(np.double), f0, t, sr)
+        f0 = signal.medfilt(f0, 3)
+
+        f0 *= pow(2, f0_up_key / 12)
+        pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]]
+        f0bak = pitchf.copy()
+        f0_mel = 1127 * np.log(1 + f0bak / 700)
+        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
+            f0_mel_max - f0_mel_min
+        ) + 1
+        f0_mel[f0_mel <= 1] = 1
+        f0_mel[f0_mel > 255] = 255
+        pitch_coarse = np.rint(f0_mel).astype(int)
+
+        return pitch_coarse, pitchf
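
All of these extractors share the same coarse-pitch mapping: f0 is warped onto a mel-like scale, 1127 * ln(1 + f0 / 700), and compressed into integer bins 1..255 between f0_min = 50 and f0_max = 1100. A small self-contained check of that mapping (illustrative only, not part of this commit):

    import numpy as np

    f0_min, f0_max = 50, 1100
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

    f0 = np.array([0.0, 440.0])          # an unvoiced frame and an A4 frame
    f0_mel = 1127 * np.log(1 + f0 / 700)
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > 255] = 255
    print(np.rint(f0_mel).astype(int))   # -> [  1 122]: unvoiced maps to bin 1, 440 Hz to about bin 122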
@@ -0,0 +1,12 @@
+from typing import Protocol
+
+
+class PitchExtractor(Protocol):
+
+    def extract(self, audio, f0_up_key, sr, window, silence_front=0):
+        ...
+
+    def getPitchExtractorInfo(self):
+        return {
+            "pitchExtractorType": self.pitchExtractorType,
+        }
@@ -0,0 +1,41 @@
+from typing import Protocol
+from const import PitchExtractorType
+from voice_changer.DiffusionSVC.pitchExtractor.HarvestPitchExtractor import HarvestPitchExtractor
+from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor
+from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
+
+
+class PitchExtractorManager(Protocol):
+    currentPitchExtractor: PitchExtractor | None = None
+    params: VoiceChangerParams
+
+    @classmethod
+    def initialize(cls, params: VoiceChangerParams):
+        cls.params = params
+
+    @classmethod
+    def getPitchExtractor(
+        cls, pitchExtractorType: PitchExtractorType, gpu: int
+    ) -> PitchExtractor:
+        cls.currentPitchExtractor = cls.loadPitchExtractor(pitchExtractorType, gpu)
+        return cls.currentPitchExtractor
+
+    @classmethod
+    def loadPitchExtractor(
+        cls, pitchExtractorType: PitchExtractorType, gpu: int
+    ) -> PitchExtractor:
+        if pitchExtractorType == "harvest":
+            return HarvestPitchExtractor()
+        # elif pitchExtractorType == "dio":
+        #     return DioPitchExtractor()
+        # elif pitchExtractorType == "crepe":
+        #     return CrepePitchExtractor()
+        # elif pitchExtractorType == "crepe_tiny":
+        #     return CrepeOnnxPitchExtractor(pitchExtractorType, cls.params.crepe_onnx_tiny, gpu)
+        # elif pitchExtractorType == "crepe_full":
+        #     return CrepeOnnxPitchExtractor(pitchExtractorType, cls.params.crepe_onnx_full, gpu)
+        else:
+            # return hubert as default
+            raise RuntimeError(
+                "[Voice Changer] PitchExctractor not found", pitchExtractorType
+            )
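
For orientation, a hypothetical sketch of how the new manager and the harvest extractor above fit together (values are illustrative and not taken from this commit; pyworld is required):

    import numpy as np
    import torch
    from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager

    # PitchExtractorManager.initialize(voice_changer_params)  # normally done once at startup
    extractor = PitchExtractorManager.getPitchExtractor("harvest", gpu=0)

    sr, window = 16000, 160                   # extract2() internally assumes 16 kHz
    audio = torch.zeros(sr)                   # one second of silence
    pitchf = np.zeros(sr // window + 1)       # pitch buffer; the harvest path reassigns it
    pitch_coarse, pitchf = extractor.extract(audio, pitchf, f0_up_key=0, sr=sr, window=window)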
@@ -255,6 +255,7 @@ class VoiceChanger:
             # post-processing
             with Timer("post-process") as t:
                 result = result.astype(np.int16)
+
                 if self.settings.outputSampleRate != processing_sampling_rate:
                     # print(
                     #     "output samplingrate",
@@ -291,6 +292,7 @@ class VoiceChanger:

             print_convert_processing(f" [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}")
             perf = [preprocess_time, mainprocess_time, postprocess_time]
+
             return outputData, perf

         except NoModeLoadedException as e: