silence front for RVC
This commit is contained in:
parent
3ad903fad6
commit
6f87ca7a39
@ -168,6 +168,10 @@
|
||||
{
|
||||
"name": "rvcQuality",
|
||||
"options": {}
|
||||
},
|
||||
{
|
||||
"name": "silenceFront",
|
||||
"options": {}
|
||||
}
|
||||
]
|
||||
},
|
||||
|
15
client/demo/dist/index.js
vendored
15
client/demo/dist/index.js
vendored
File diff suppressed because one or more lines are too long
@ -168,6 +168,10 @@
|
||||
{
|
||||
"name": "rvcQuality",
|
||||
"options": {}
|
||||
},
|
||||
{
|
||||
"name": "silenceFront",
|
||||
"options": {}
|
||||
}
|
||||
]
|
||||
},
|
||||
|
@ -40,6 +40,7 @@ import { ModelSamplingRateRow, ModelSamplingRateRowProps } from "./components/30
|
||||
import { SolaEnableRow, SolaEnableRowProps } from "./components/811_SolaEnableRow"
|
||||
import { EnableEnhancerRow, EnableEnhancerRowProps } from "./components/610_EnableEnhancerRow"
|
||||
import { DstIdRow2, DstIdRow2Props } from "./components/602v2_DstIdRow2"
|
||||
import { SilenceFrontRow, SilenceFrontRowProps } from "./components/812_SilenceFrontRow"
|
||||
|
||||
export const catalog: { [key: string]: (props: any) => JSX.Element } = {}
|
||||
|
||||
@ -107,6 +108,7 @@ const initialize = () => {
|
||||
addToCatalog("trancateNumThreshold", (props: TrancateNumTresholdRowProps) => { return <TrancateNumTresholdRow {...props} /> })
|
||||
addToCatalog("rvcQuality", (props: RVCQualityRowProps) => { return <RVCQualityRow {...props} /> })
|
||||
addToCatalog("solaEnable", (props: SolaEnableRowProps) => { return <SolaEnableRow {...props} /> })
|
||||
addToCatalog("silenceFront", (props: SilenceFrontRowProps) => { return <SilenceFrontRow {...props} /> })
|
||||
|
||||
|
||||
|
||||
|
@ -0,0 +1,31 @@
|
||||
import React, { useMemo } from "react"
|
||||
import { useAppState } from "../../../001_provider/001_AppStateProvider"
|
||||
|
||||
/**
 * Props for {@link SilenceFrontRow}. The row currently declares no options.
 */
export type SilenceFrontRowProps = {
}

/**
 * Settings row exposing the `silenceFront` server setting as an off/on selector.
 *
 * Choosing an option re-sends the current server settings through
 * `updateServerSettings`, with `silenceFront` replaced by the numeric value
 * of the selected option (0 or 1).
 */
export const SilenceFrontRow = (_props: SilenceFrontRowProps) => {
    const { serverSetting: serverSettingState } = useAppState()

    // Rebuilt only when the settings object or the updater identity changes.
    const silenceFrontRow = useMemo(() => {
        const handleSelect = (event: React.ChangeEvent<HTMLSelectElement>) => {
            // The select yields a string; the setting is stored as a number.
            const selected = Number(event.target.value)
            serverSettingState.updateServerSettings({
                ...serverSettingState.serverSetting,
                silenceFront: selected
            })
        }

        return (
            <div className="body-row split-3-7 left-padding-1 guided">
                <div className="body-item-title left-padding-1">Silence Front</div>
                <div className="body-input-container">
                    <select value={serverSettingState.serverSetting.silenceFront} onChange={handleSelect}>
                        <option value="0">off</option>
                        <option value="1">on</option>
                    </select>
                </div>
            </div>
        )
    }, [serverSettingState.serverSetting, serverSettingState.updateServerSettings])

    return silenceFrontRow
}
|
@ -93,6 +93,7 @@ export const ServerSettingKey = {
|
||||
"indexRatio": "indexRatio",
|
||||
"rvcQuality": "rvcQuality",
|
||||
"modelSamplingRate": "modelSamplingRate",
|
||||
"silenceFront": "silenceFront",
|
||||
|
||||
"enableEnhancer": "enableEnhancer",
|
||||
"enhancerTune": "enhancerTune",
|
||||
@ -128,6 +129,7 @@ export type VoiceChangerServerSetting = {
|
||||
|
||||
indexRatio: number // RVC
|
||||
rvcQuality: number // 0:low, 1:high
|
||||
silenceFront: number // 0:off, 1:on
|
||||
modelSamplingRate: ModelSamplingRate // 32000,40000,48000
|
||||
|
||||
enableEnhancer: number // DDSP-SVC
|
||||
@ -173,6 +175,7 @@ export const DefaultServerSetting_MMVCv15: ServerInfo = {
|
||||
indexRatio: 0,
|
||||
rvcQuality: 0,
|
||||
modelSamplingRate: 48000,
|
||||
silenceFront: 0,
|
||||
|
||||
enableEnhancer: 0,
|
||||
enhancerTune: 0,
|
||||
@ -213,6 +216,7 @@ export const DefaultServerSetting_MMVCv13: ServerInfo = {
|
||||
indexRatio: 0,
|
||||
rvcQuality: 0,
|
||||
modelSamplingRate: 48000,
|
||||
silenceFront: 0,
|
||||
|
||||
enableEnhancer: 0,
|
||||
enhancerTune: 0,
|
||||
@ -257,6 +261,7 @@ export const DefaultServerSetting_so_vits_svc_40: ServerInfo = {
|
||||
indexRatio: 0,
|
||||
rvcQuality: 0,
|
||||
modelSamplingRate: 48000,
|
||||
silenceFront: 0,
|
||||
|
||||
enableEnhancer: 0,
|
||||
enhancerTune: 0,
|
||||
@ -301,6 +306,7 @@ export const DefaultServerSetting_so_vits_svc_40_c: ServerInfo = {
|
||||
indexRatio: 0,
|
||||
rvcQuality: 0,
|
||||
modelSamplingRate: 48000,
|
||||
silenceFront: 0,
|
||||
|
||||
enableEnhancer: 0,
|
||||
enhancerTune: 0,
|
||||
@ -344,6 +350,7 @@ export const DefaultServerSetting_so_vits_svc_40v2: ServerInfo = {
|
||||
indexRatio: 0,
|
||||
rvcQuality: 0,
|
||||
modelSamplingRate: 48000,
|
||||
silenceFront: 0,
|
||||
|
||||
enableEnhancer: 0,
|
||||
enhancerTune: 0,
|
||||
@ -388,6 +395,7 @@ export const DefaultServerSetting_DDSP_SVC: ServerInfo = {
|
||||
indexRatio: 0,
|
||||
rvcQuality: 0,
|
||||
modelSamplingRate: 48000,
|
||||
silenceFront: 1,
|
||||
|
||||
enableEnhancer: 0,
|
||||
enhancerTune: 0,
|
||||
@ -433,6 +441,7 @@ export const DefaultServerSetting_RVC: ServerInfo = {
|
||||
indexRatio: 0,
|
||||
rvcQuality: 0,
|
||||
modelSamplingRate: 48000,
|
||||
silenceFront: 1,
|
||||
|
||||
enableEnhancer: 0,
|
||||
enhancerTune: 0,
|
||||
|
@ -54,6 +54,7 @@ class RVCSettings():
|
||||
|
||||
indexRatio: float = 0
|
||||
rvcQuality: int = 0
|
||||
silenceFront: int = 1 # 0:off, 1:on
|
||||
modelSamplingRate: int = 48000
|
||||
|
||||
speakers: dict[str, int] = field(
|
||||
@ -61,7 +62,7 @@ class RVCSettings():
|
||||
)
|
||||
|
||||
# ↓mutableな物だけ列挙
|
||||
intData = ["gpu", "dstId", "tran", "predictF0", "extraConvertSize", "rvcQuality", "modelSamplingRate"]
|
||||
intData = ["gpu", "dstId", "tran", "predictF0", "extraConvertSize", "rvcQuality", "modelSamplingRate", "silenceFront"]
|
||||
floatData = ["noiceScale", "silentThreshold", "indexRatio"]
|
||||
strData = ["framework", "f0Detector"]
|
||||
|
||||
@ -272,8 +273,13 @@ class RVC:
|
||||
if_f0 = 1
|
||||
f0_file = None
|
||||
|
||||
audio_out = vc.pipeline(self.hubert_model, self.net_g, sid, audio, times, f0_up_key, f0_method,
|
||||
file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file)
|
||||
if self.settings.silenceFront == 0:
|
||||
audio_out = vc.pipeline(self.hubert_model, self.net_g, sid, audio, times, f0_up_key, f0_method,
|
||||
file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file, silence_front=0)
|
||||
else:
|
||||
audio_out = vc.pipeline(self.hubert_model, self.net_g, sid, audio, times, f0_up_key, f0_method,
|
||||
file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file, silence_front=self.settings.extraConvertSize / self.settings.modelSamplingRate)
|
||||
|
||||
result = audio_out * np.sqrt(vol)
|
||||
|
||||
return result
|
||||
|
@ -25,14 +25,21 @@ class VC(object):
|
||||
self.device = device
|
||||
self.is_half = is_half
|
||||
|
||||
def get_f0(self, x, p_len, f0_up_key, f0_method, inp_f0=None):
|
||||
def get_f0(self, audio, p_len, f0_up_key, f0_method, inp_f0=None, silence_front=0):
|
||||
|
||||
n_frames = int(len(audio) // self.window) + 1
|
||||
start_frame = int(silence_front * self.sr / self.window)
|
||||
real_silence_front = start_frame * self.window / self.sr
|
||||
|
||||
audio = audio[int(np.round(real_silence_front * self.sr)):]
|
||||
|
||||
time_step = self.window / self.sr * 1000
|
||||
f0_min = 50
|
||||
f0_max = 1100
|
||||
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
|
||||
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
|
||||
if (f0_method == "pm"):
|
||||
f0 = parselmouth.Sound(x, self.sr).to_pitch_ac(
|
||||
f0 = parselmouth.Sound(audio, self.sr).to_pitch_ac(
|
||||
time_step=time_step / 1000, voicing_threshold=0.6,
|
||||
pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
|
||||
pad_size = (p_len - len(f0) + 1) // 2
|
||||
@ -40,13 +47,16 @@ class VC(object):
|
||||
f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode='constant')
|
||||
elif (f0_method == "harvest"):
|
||||
f0, t = pyworld.harvest(
|
||||
x.astype(np.double),
|
||||
audio.astype(np.double),
|
||||
fs=self.sr,
|
||||
f0_ceil=f0_max,
|
||||
frame_period=10,
|
||||
)
|
||||
f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
|
||||
f0 = pyworld.stonemask(audio.astype(np.double), f0, t, self.sr)
|
||||
f0 = signal.medfilt(f0, 3)
|
||||
|
||||
f0 = np.pad(f0.astype('float'), (start_frame, n_frames - len(f0) - start_frame))
|
||||
|
||||
f0 *= pow(2, f0_up_key / 12)
|
||||
# with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
|
||||
tf0 = self.sr // self.window # 每秒f0点数
|
||||
@ -117,7 +127,7 @@ class VC(object):
|
||||
times[2] += (t2 - t1)
|
||||
return audio1
|
||||
|
||||
def pipeline(self, model, net_g, sid, audio, times, f0_up_key, f0_method, file_index, file_big_npy, index_rate, if_f0, f0_file=None):
|
||||
def pipeline(self, model, net_g, sid, audio, times, f0_up_key, f0_method, file_index, file_big_npy, index_rate, if_f0, f0_file=None, silence_front=0):
|
||||
if (file_big_npy != "" and file_index != "" and os.path.exists(file_big_npy) == True and os.path.exists(file_index) == True and index_rate != 0):
|
||||
try:
|
||||
index = faiss.read_index(file_index)
|
||||
@ -138,7 +148,7 @@ class VC(object):
|
||||
sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
|
||||
pitch, pitchf = None, None
|
||||
if (if_f0 == 1):
|
||||
pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key, f0_method, inp_f0)
|
||||
pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key, f0_method, inp_f0, silence_front=silence_front)
|
||||
pitch = pitch[:p_len]
|
||||
pitchf = pitchf[:p_len]
|
||||
pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
|
||||
|
Loading…
x
Reference in New Issue
Block a user