silence front for RVC

This commit is contained in:
wataru 2023-04-19 07:57:19 +09:00
parent 3ad903fad6
commit 6f87ca7a39
8 changed files with 88 additions and 11 deletions

View File

@ -168,6 +168,10 @@
{
"name": "rvcQuality",
"options": {}
},
{
"name": "silenceFront",
"options": {}
}
]
},

File diff suppressed because one or more lines are too long

View File

@ -168,6 +168,10 @@
{
"name": "rvcQuality",
"options": {}
},
{
"name": "silenceFront",
"options": {}
}
]
},

View File

@ -40,6 +40,7 @@ import { ModelSamplingRateRow, ModelSamplingRateRowProps } from "./components/30
import { SolaEnableRow, SolaEnableRowProps } from "./components/811_SolaEnableRow"
import { EnableEnhancerRow, EnableEnhancerRowProps } from "./components/610_EnableEnhancerRow"
import { DstIdRow2, DstIdRow2Props } from "./components/602v2_DstIdRow2"
import { SilenceFrontRow, SilenceFrontRowProps } from "./components/812_SilenceFrontRow"
export const catalog: { [key: string]: (props: any) => JSX.Element } = {}
@ -107,6 +108,7 @@ const initialize = () => {
addToCatalog("trancateNumThreshold", (props: TrancateNumTresholdRowProps) => { return <TrancateNumTresholdRow {...props} /> })
addToCatalog("rvcQuality", (props: RVCQualityRowProps) => { return <RVCQualityRow {...props} /> })
addToCatalog("solaEnable", (props: SolaEnableRowProps) => { return <SolaEnableRow {...props} /> })
addToCatalog("silenceFront", (props: SilenceFrontRowProps) => { return <SilenceFrontRow {...props} /> })

View File

@ -0,0 +1,31 @@
import React, { useMemo } from "react"
import { useAppState } from "../../../001_provider/001_AppStateProvider"
/**
 * Props accepted by the SilenceFrontRow component.
 * Currently declares no properties.
 */
export type SilenceFrontRowProps = {
}
/**
 * Settings row rendering an off/on selector bound to the `silenceFront`
 * server setting.
 *
 * The selected option is converted to a number and written back through
 * `updateServerSettings`, together with a copy of all other current server
 * settings. The rendered element is memoized on the current server settings
 * and the update function.
 */
export const SilenceFrontRow = (_props: SilenceFrontRowProps) => {
    const appState = useAppState()

    const silenceFrontRow = useMemo(() => {
        // Current value shown by the selector (0: off, 1: on).
        const currentValue = appState.serverSetting.serverSetting.silenceFront

        // Push the newly selected value to the server settings, keeping every other field as is.
        const handleSelectionChange = (e: React.ChangeEvent<HTMLSelectElement>) => {
            const selected = Number(e.target.value)
            appState.serverSetting.updateServerSettings({
                ...appState.serverSetting.serverSetting,
                silenceFront: selected
            })
        }

        return (
            <div className="body-row split-3-7 left-padding-1 guided">
                <div className="body-item-title left-padding-1">Silence Front</div>
                <div className="body-input-container">
                    <select value={currentValue} onChange={handleSelectionChange}>
                        <option value="0" >off</option>
                        <option value="1" >on</option>
                    </select>
                </div>
            </div>
        )
    }, [appState.serverSetting.serverSetting, appState.serverSetting.updateServerSettings])

    return silenceFrontRow
}

View File

@ -93,6 +93,7 @@ export const ServerSettingKey = {
"indexRatio": "indexRatio",
"rvcQuality": "rvcQuality",
"modelSamplingRate": "modelSamplingRate",
"silenceFront": "silenceFront",
"enableEnhancer": "enableEnhancer",
"enhancerTune": "enhancerTune",
@ -128,6 +129,7 @@ export type VoiceChangerServerSetting = {
indexRatio: number // RVC
rvcQuality: number // 0:low, 1:high
silenceFront: number // 0:off, 1:on
modelSamplingRate: ModelSamplingRate // 32000,40000,48000
enableEnhancer: number // DDSP-SVC
@ -173,6 +175,7 @@ export const DefaultServerSetting_MMVCv15: ServerInfo = {
indexRatio: 0,
rvcQuality: 0,
modelSamplingRate: 48000,
silenceFront: 0,
enableEnhancer: 0,
enhancerTune: 0,
@ -213,6 +216,7 @@ export const DefaultServerSetting_MMVCv13: ServerInfo = {
indexRatio: 0,
rvcQuality: 0,
modelSamplingRate: 48000,
silenceFront: 0,
enableEnhancer: 0,
enhancerTune: 0,
@ -257,6 +261,7 @@ export const DefaultServerSetting_so_vits_svc_40: ServerInfo = {
indexRatio: 0,
rvcQuality: 0,
modelSamplingRate: 48000,
silenceFront: 0,
enableEnhancer: 0,
enhancerTune: 0,
@ -301,6 +306,7 @@ export const DefaultServerSetting_so_vits_svc_40_c: ServerInfo = {
indexRatio: 0,
rvcQuality: 0,
modelSamplingRate: 48000,
silenceFront: 0,
enableEnhancer: 0,
enhancerTune: 0,
@ -344,6 +350,7 @@ export const DefaultServerSetting_so_vits_svc_40v2: ServerInfo = {
indexRatio: 0,
rvcQuality: 0,
modelSamplingRate: 48000,
silenceFront: 0,
enableEnhancer: 0,
enhancerTune: 0,
@ -388,6 +395,7 @@ export const DefaultServerSetting_DDSP_SVC: ServerInfo = {
indexRatio: 0,
rvcQuality: 0,
modelSamplingRate: 48000,
silenceFront: 1,
enableEnhancer: 0,
enhancerTune: 0,
@ -433,6 +441,7 @@ export const DefaultServerSetting_RVC: ServerInfo = {
indexRatio: 0,
rvcQuality: 0,
modelSamplingRate: 48000,
silenceFront: 1,
enableEnhancer: 0,
enhancerTune: 0,

View File

@ -54,6 +54,7 @@ class RVCSettings():
indexRatio: float = 0
rvcQuality: int = 0
silenceFront: int = 1 # 0:off, 1:on
modelSamplingRate: int = 48000
speakers: dict[str, int] = field(
@ -61,7 +62,7 @@ class RVCSettings():
)
# ↓mutableな物だけ列挙
intData = ["gpu", "dstId", "tran", "predictF0", "extraConvertSize", "rvcQuality", "modelSamplingRate"]
intData = ["gpu", "dstId", "tran", "predictF0", "extraConvertSize", "rvcQuality", "modelSamplingRate", "silenceFront"]
floatData = ["noiceScale", "silentThreshold", "indexRatio"]
strData = ["framework", "f0Detector"]
@ -272,8 +273,13 @@ class RVC:
if_f0 = 1
f0_file = None
audio_out = vc.pipeline(self.hubert_model, self.net_g, sid, audio, times, f0_up_key, f0_method,
file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file)
if self.settings.silenceFront == 0:
audio_out = vc.pipeline(self.hubert_model, self.net_g, sid, audio, times, f0_up_key, f0_method,
file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file, silence_front=0)
else:
audio_out = vc.pipeline(self.hubert_model, self.net_g, sid, audio, times, f0_up_key, f0_method,
file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file, silence_front=self.settings.extraConvertSize / self.settings.modelSamplingRate)
result = audio_out * np.sqrt(vol)
return result

View File

@ -25,14 +25,21 @@ class VC(object):
self.device = device
self.is_half = is_half
def get_f0(self, x, p_len, f0_up_key, f0_method, inp_f0=None):
def get_f0(self, audio, p_len, f0_up_key, f0_method, inp_f0=None, silence_front=0):
n_frames = int(len(audio) // self.window) + 1
start_frame = int(silence_front * self.sr / self.window)
real_silence_front = start_frame * self.window / self.sr
audio = audio[int(np.round(real_silence_front * self.sr)):]
time_step = self.window / self.sr * 1000
f0_min = 50
f0_max = 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
if (f0_method == "pm"):
f0 = parselmouth.Sound(x, self.sr).to_pitch_ac(
f0 = parselmouth.Sound(audio, self.sr).to_pitch_ac(
time_step=time_step / 1000, voicing_threshold=0.6,
pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
pad_size = (p_len - len(f0) + 1) // 2
@ -40,13 +47,16 @@ class VC(object):
f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode='constant')
elif (f0_method == "harvest"):
f0, t = pyworld.harvest(
x.astype(np.double),
audio.astype(np.double),
fs=self.sr,
f0_ceil=f0_max,
frame_period=10,
)
f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
f0 = pyworld.stonemask(audio.astype(np.double), f0, t, self.sr)
f0 = signal.medfilt(f0, 3)
f0 = np.pad(f0.astype('float'), (start_frame, n_frames - len(f0) - start_frame))
f0 *= pow(2, f0_up_key / 12)
# with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
tf0 = self.sr // self.window # 每秒f0点数
@ -117,7 +127,7 @@ class VC(object):
times[2] += (t2 - t1)
return audio1
def pipeline(self, model, net_g, sid, audio, times, f0_up_key, f0_method, file_index, file_big_npy, index_rate, if_f0, f0_file=None):
def pipeline(self, model, net_g, sid, audio, times, f0_up_key, f0_method, file_index, file_big_npy, index_rate, if_f0, f0_file=None, silence_front=0):
if (file_big_npy != "" and file_index != "" and os.path.exists(file_big_npy) == True and os.path.exists(file_index) == True and index_rate != 0):
try:
index = faiss.read_index(file_index)
@ -138,7 +148,7 @@ class VC(object):
sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
pitch, pitchf = None, None
if (if_f0 == 1):
pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key, f0_method, inp_f0)
pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key, f0_method, inp_f0, silence_front=silence_front)
pitch = pitch[:p_len]
pitchf = pitchf[:p_len]
pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()