From 75d7ecb5b55ed302d25e5afc18be2bcad3c79c79 Mon Sep 17 00:00:00 2001 From: Whale and Dolphin Date: Tue, 3 Jun 2025 21:05:14 +0800 Subject: [PATCH] Optimize documents (#994) * [feature]add dataset classs * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [dev]combine agent and tts infer * [feature]:update inference * [feature]:update uv.lock * [Merge]:merge upstream/main * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [fix]:remove unused files * [fix]:remove unused files * [fix]:remove unused files * [fix]:fix infer bugs * [docs]:update introduction and optinize front appearence * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- docs/assets/openaudio.jpg | Bin 0 -> 40444 bytes docs/assets/openaudio.png | Bin 0 -> 956 bytes docs/en/index.md | 135 +++++++++++++++++++++++++++++++------- docs/en/inference.md | 7 +- docs/en/install.md | 31 +++++++++ docs/ja/index.md | 134 ++++++++++++++++++++++++++++++------- docs/ja/inference.md | 11 +--- docs/ja/install.md | 30 +++++++++ docs/ko/index.md | 134 ++++++++++++++++++++++++++++++------- docs/ko/inference.md | 11 +--- docs/ko/install.md | 30 +++++++++ docs/pt/index.md | 134 ++++++++++++++++++++++++++++++------- docs/pt/inference.md | 11 +--- docs/pt/install.md | 30 +++++++++ docs/zh/index.md | 134 ++++++++++++++++++++++++++++++------- docs/zh/inference.md | 25 +++---- docs/zh/install.md | 30 +++++++++ mkdocs.yml | 22 ++++--- 18 files changed, 727 insertions(+), 182 deletions(-) create mode 100644 docs/assets/openaudio.jpg create mode 100644 docs/assets/openaudio.png create mode 100644 docs/en/install.md create mode 100644 docs/ja/install.md create mode 100644 docs/ko/install.md create mode 100644 docs/pt/install.md create mode 100644 docs/zh/install.md diff 
--git a/docs/assets/openaudio.jpg b/docs/assets/openaudio.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f23b7aaf00ee83baff6cc466b6845cb890e89a20 GIT binary patch literal 40444 zcmeEu2Ut_xmi7s~BM^EOrHhDwbO<6{q=fQSYV0Rcq?q(}`BrT31C zNKcd^O*%*-Ekc0*c+1=}cfRkNdFHwQ%-k<<5}s4S*=N6d?X}+Zu6_9Fa0)nLtZ$?b zKp+4B0e^tQF+dlfq@eiu3%;npUursPYAPyf23lGgIwl4tCPoHEMrIcFBg`ypER2jt zxQ?)KKsh-%nOM1bxS%}jP)_L2FM&{k*HBT@Q&ZDJnHia(|LNCZ3&2K4l}q)C5+Vvv zut6xzbJYJE^Z!PJ~43#NhxVXr8CMZs%pCD z^z;o3jf^i^U9z^Zy=>=l?YgU*yN9QLK;W&Q;E>ShJ27`-@7+fyr#wncOMm<%BPTa6 z|7Ag8QSs|H6_r)hHMMnZ?H!$6-EZH$9~>GU!QnrB9-W?<{W>?lu(-5L{JycdwY@|7 zvHP=K5Pv*5*s#BA7aQ0v3Mwi}D%zjzf=~p38zmbR^-*~mb{z}atG*nfrz7d0 zXOmu(x6&U|uq1Fg`wcK~9ao$dBmQjKA6oX?8W#0mYT3Uv>@V#a1(+xyVCPY?0onk$ zI0r2W{J#DE41UMJ?-=+U1HWV7cMSZFf!{IkI|hEo!0#CN9Rt5(;CBrCj)C7X@UMn} zQ|>@cyagJN%!xOL=<(#B1S9pSPfAGwEx8nEbA^6B;7Ye@E+-~0*v1Nf-0F2j$o-{A zr}XJV;IqMu!Mz;z%kF-Lm_d|A@L=&m6=IoL_<(+dm}1^_2rwHz*}@PH0jdwA>gxG{ z%L>cxlwX;co(;6@U<)eEND^jUw*@=H6u*@$B0g%E?42U(5kjsO2P56jY)$pkK4KEs z&O63=+kDVF;PWW|V<%qjC?4mcD_AJ-$U+Z*{wr@+hGr9lJv zKEir_9t;yJS>Y(Pt_yE}BS1O#%mzcU-5n@_Xo?+8R#1*AYNmkkuAe$5QCts0}ZWGJ$ zPD4|2Byrk~Mz8+mU5%>pD%A<}{_Ao=W@3TMYF!xDr=__v8LP9L6h$g)FyY5e>|k@$ zmt$qSeUOR3$?z>xI2YJC3@zxAcUqi zA!xcFc8fG2SMm1bNZoyNzLlsoJ5M>M9F&$ZX#xDdM1pYVZKJ%3%2>miW2*P%dy#=* z!QdU<2=194$Oxwx^s?+dtt)K2|Ca9Z{dbguB~G74C);p4<|l~CmwX?(Q~p5VHIyT! 
zd)~<^@mp6tVWGQvS?_`w=id6>xq~(>MpEZ>e?`9VrYvMjs=K+I2L9_Iu*R{+9^5>Y zy4BI_n!I3YX z><7JIvkw(DBN-Y$EWO@AAtNNl1G+~uWSfF5whJicLzvx}@l^34r1 z9B>^9w6}WVXvoy_%;pMrN=E#nv%%S{?T5fv^tNC#NxRe$QK!gyCS8X6N?eP;1{`X_ zn{e+K&lG-hjwn3+EuARS7PKo~Kw-l9;X38%uSM84_?XT1mEi;G;CVk;_}!m=(PLcPgTC-W_S-YDA zQreniM*{9Y%EP7bjI+h_Eg)9RNRqLwx3PC4c(w%fW<;6of}Yq5u$YEDg)#qYxpCBh zNARsL+r%#PO~GSn?<0>b;vP>t87SWr zZO%=+YaMZg6!F4-R>-mVq&5SvY}06DVbDKZ0a)lkf750ubT8HO} zWr#fz9=~th%p5WGI8BPitw8%9 zd>&+17MvpU2`#lm$jJLcpg1+s9dV0VRPN>snBMS^S=$GHE8Jbp6nQ!_ek?}oo z)UBB7LA2vAoFF#Dm!o(G&DQk~5_qgYmlZF!3 z6In9(RbQ6L|_K_B$9Pt3w<69s{6kdWrCZM@7i6>@`q-688P z>t_wkmz_h`>zI(7CEJ;y{WNV3E{*5YAD1fCX>42+!#4t6;n**Um3ugY(Z(g~h0F>_ zrQjq_ulthPlZt1jRLZWN3}3!_3kGrk{i7^-8L7!?XACKo3I-bzrx(!3UXjQ#5hAY9 zS(U1gM<5!;PI&>^DR8md7_j@5?%~x3Ptc+Teg8tT*1;&rBM>X?eWu(?LQoAq^VA>S zk6l~IWwzh;BOYgX`aB)G0PlNQ;CW-c;Lah?b4CWnDVg(=DEYs?`NgAm)<0#LDr@rx z+`sp}tNjN)(x4wdlJzR{(Q#r2cg-F)zMpo^zv?8S_xdY?eWv}Sh)%AxbB?@T7bUuxKp(|1&mp`F#hejM`?q!`XbR8JFKxn za(tc1?Z>X;(LZ|S&L%SlWYQ3I-RrV{G+8S`wxpkJn3e8}?kh1$E}Uhki?)fJ>Zh>8 zh8ka5hq3&Ey9;48_EOn=*-x8>9H>*pBRLV)kwF|B0!W(X=W_SbCY^~*wKVc`<8`w{ z>{3KmUGQ+2Qc+9$(QKqPCA@c`8>6w5#T|BuR22~jrwPoV|8t1Zm*k}Oojpz+~BziGEpV1iUeC z3&IOSaYyR=LM;LeLV;cQRAK^h(({Hy%4rEo@y0l9*)aPlR`#ZZ85xggFUJ;T&5~8Y zW0w-GD$wqqj=Tq*?;I(cMltD~Y}#M)mSXeIJPW<(PQke1t6BQC{b+xYumv2IMKEu^ z+e_G+bvOj-r3ANsbPFO~;LYC!-&;4I0Ld>boSEy}a(9PsXD6m|~>+_x>Vi6Cmqmm-koLymvRu?3*eeRc~$& z#dN9T)Fz0yJo|>7Z05(_?dTt_l0qF zO7G+epnt_${M1K6;&U^8a2zmrObuWUdPBc13H``*Cw6Oh!kzdwvPNlJi)JTdMse{g z%^%{Mv{3!D8>Sk}UNZWV!2*1VoFido{nrSS30nzTR+9Y=YLROx4uz`oC_X(gs)1ki zSpK)ObL?Y@lOkJJ$t#=uQ}6b)qe_8Uc{F+dHGf%-@EUdl+kC)_oNXQT+4qA&7Aa;0 zm(SJAVq8uv;PNhm9^eg39|)70E`*?)R>fS_7u1=wue-x>0pSQvZ_tpOd>~2b>y5nF zr#M%4-C?Vg2iq_{Ti%ISMO=-5hd@p8E!pEb*H9l9WG4`%MRehd>)8v26Fq zQ|ckb$tOx1r<*#ji#8iHPnCV4y-E}sx{TJ)uUgKFT*5W0&eFxuoL*4VaWW2++7)ASlzNHuA(V({%VA?LY$HKm0F{5epKg!}O3ozQ6Du zvBFb!_O0p~T%OEc&5jUt2>-aX$v*M0Jp-8JZpcXTQ7NTvT(QKgLm>?*3&#f*(zGSs z*_MR0ToI~Vg)>CBnX!{$@~@(7WuG3{pPE6bHdB#I%!z6B2rm!sjV@aXjz(k_8A@{W 
zm=b0fZ6jbu(NRVM=!FBCgO6(3Z3(AETTfO}KsI1|m8*O1kWM|iQ>MU0R41Tm0Ii4{ zO#2IeCnfoIE1Nn8eI55hYYb$`-B!t2>(<_PtO%4(Du zs_7KM`ePG3OZ`J}I3wRde8v&tX=QzEmmEpPtX+9fOGXdVDl~Shn*TA20!%n7;9md; zp&$c6N3Os9gw=9nMSOv!0iPZw9Y5xAV?x;`z?<&+JnBr-@cTTMW4K2S73G||(fiBq zt7}8VT2v_uvBCS#wO#k!W?An|XBavOa`JG61!l8a?mvofbt7&*T&a6@{)P^f{_GZZ z0M0HXiLa3IXy}MGKswpe@aCO#p@Hpfi&p6^`Hnr21bY!nSje@0I zC}3Uz`I+$fb6iqt1`SRdY8!Rf*?B{EHQvqN8t($<02^R}JfR(}#RgwGgZDxdI<}oZ zU%*TvE$}?c1@08$kmx)ph5kEXSJ<0jI`Q zc538l@CKxjGMz8T=VIHH{U4_VZE%N7ZNT3M8fDX)f3=G1AC`{>Lo6WhA5mQ(O`gh@ z4S9+t2_G=@O+yOFyeXH?^&Br&K<|A@ig#@S9qFjJ#jG*-o2nOdjvdCV8`-Qae9|BD zZ+OXlJE{To)3(F>Uyh;T%_$0dPrkR}6h#BuW;DnZ!-T}QkA)KgM_HZ+&B$77YMY@= z?mr_~xz?;RAFs9x#5Cy2iH}&U!;750YjGT#*rwpk2n?~4s}YsXLss-LfZz%u|7v!O02j2hQt$bPHG!=P?esD3M( zE|qa#`ck}P!RAYIfaWKI5&ow>Wj;xL>Vu|-7;o>T>lzy{-b_WJS8Q~HdPvD~5XI>G z&p{e@W|~xAwZ&Ykuw?#~Eo%L>1ispJ9y z*XB@dm-SH>W767`j1+a-^9xkA6O3Wql$Z!jnPL23^Q)aTNFc#}COq$r!|A0IHjbk{ zkJv-|X*b!+pbtG^EqL{5;Ui3y zM1g)PF*?uh+N0_1vmIFsfhJ+z@*V4m+1RU^h)g0btuE_Mw#wM5jD#Wr+fzmNO`(g#x?J zgq>Kd(%vU8FU;4B%RJKe@Jfr#Z}6+LGw=roVpVSj)`aXMPSIauY~jUziujB_px>D~ z1Y*vd!DqxY1np+G-duV35~abqExI3bQX;CSqv=$YydDi7-DSVmtWJo{T!X>2ia1G( ztf=jk`)6|`TPR8|e`>$(CD_=2U~ae~t)B}pJQHUI3>&39XpQauj+RTQ1oYdZdxLk@ zWQmEUBvrzs)o^~TPFOhyRIk>%a`B5IQK=w0Os;T6d5~^xP4u(^A6aa_ve^^kGOPUn zTd{c3QSs?zhL^bMSVhS6BsWqFPQE~F!eX4x4>CS65pn5vts9gZ!e#37MKTXf`QLg1`zfFi^gs@l%=S@tP74~7m@UU6d2Y=m1!MawRRj?~ zGD(_518dUdnK4a-aqaDpqYvdj`6gxTZ|tXNsk9PTrcEC-mA-V}Zd=I=qW;=XTN7>O zl*+zcWRnlb{=HQe2lLA`oW-MgTF56sYzHb&B2IbEoza`%XSIL{qaq0AhXAE8b=b|? 
zm}8#>TAwk6W*q{KVdd~5VS+U_#<%sRhS#ff$0|~HPf8X6+Y>#AI`-!Y?ElOsO2MQG zOw~*I!ss1++Oe^9-5prA%Zt03C4Cyb@x=?A<%EK%>U&6W(%p8mp&Qp8jvE5b1@1z5 z_%-3xh&>MXNvbeS=7Sd=oVeQ4cOTU)T;0xQ&iYic?N&`fz51s17ka#zjh1%L{=X&n0RQWcxU=vFE#82x^HCbo}vl;w*?Vt zmON0pV+;c5QP>q>)}|v9OY*(19qkmjAPbF1&hhi>zB2MBE+0=BiK!1!S`>t}L0ADNiRp2v`>k zcIvhm`eq%FEOydrf%yTV{B1V9WZv6sfqsgCze{~uS`NU31wd!Rl*}G>oj@~pEH3QT z2Z1A+3h)^2@(^VXi2%RmB$J!2k#*@Y=1j@w@?&S6Fcy;E)T~wpQ54zn=D?pcNq-Z$8iw-O zUFbr|7?0`ev36-mNBQL)UMn9{#lF_|=!?JAOs6HVT)pw} z)ki{gQcBbX<_N7rfKwylHt|?jU0G$2!n^JJ;GC}63;5yj7d;gkKMZ=`S~hy8WOhrZ z&(1%a@!@Aq-oNeBk}r7|SPW=KbDg$C`)L;q$Q(i2)%67HicAi?&*cidi2nDg3r17^ z@bb{`8Ms}i#(6$f=&55hO61ny2zYJ3l8O4FKo@S~cs7=|6+6-QNqalqEJ2v5f+X<- zZe0y~O%Wn3Nx9rIZW*27&I2v^f;#T>=x?L9KMhlb%G(y|kAn}Q1z@}v1=yqIoaI<%A@7mr!Iji5U`#`(an+iyUuAhjwt&&)qJx(V8<&Rv z-4lNFv{3O8u!_zvMR3&#J=a_tk5UnJ{Lob(i)%6hd-Z;pt2VP1N5z>P{*yjBiFBV- zvRV5kNs^%O(!`&r+;uV7wd?D1t@jSu`m>ybB@Mpn;${Ek-PDo&;k%dT{W>3i-72Qq z%BD#fw*0#g9hk|xXfcohWQjJt*pJb6?@5e;bI9smu*U5t(^v*OWc+}OW0t=6`l$7A zKvU_Y+sm_Am$}pk5?uWuW$u+a1nwqH?*7uS zGTVLUC^5FZ*Y)EymSfd!=K3L^G_8fY@_U((M)(kVusTi#J%9^Hxjtj6KXI4oV%?GO z`vs$)zq?bag|jg0vnL*rz%?v%3NdG$f0Bnj?W`4@f7>ztrTQh@pmS4v=ZP!PU;_7t z@53k^9?=*F%8YGcfiS64v2|~p)TS7|KkZSKb<9mQe!YtZa`pI4d45@I_dk;!rBno* zEFEUHk)oGL{(bG^Js_N}AU@p7|o22bQS*4$!zEe0x4Z48GBM!wBfR7aMb6hv9+ zr1T<_4%#CKW_R%!_|DAJudg0CI&=3@D^*OpV=~ zyEZR9aJycmx2nqX%tZj{2amIv4gsdcv`oGs(ILzWwvK23VsFwq^ZZTbqp4Hwkd?u- zq|kp)TvD1{G%^kawZoE{%uu1&4B$nn=gq}XN8Ag)Y1y^e)9T7FY9wz&tDd&|Jn0Sb z^p5e+H>QkBxzh~FG))9>#KawpSRVo`WctADMf`H0Ooh96f_*Bj66wY5+p_x~@Eo>`$mDMf=5ulBdk**raG6UtNDiZ)YHj|t*i$~ z0o!07>pJEVyO;F0K*xGQWXkeeVfEf%Nnkd;dmcCm{Ecz=l5jcm*qgmuv4uMEKVH4I zj#N=BjWR@M>hOSK+{w|~t?g~XoRuc0>XCl?n!O>Z4YzGHS62E+l8KRdLkAK@CB@EC zRwrRIO_$I^%xjS_>eoqjX^UpK-0I48^l(*WHY3+VntP@1dD!~#ltjvDQG~fspgYQt z=4MW#n!4XA>yq8ASBh@mz6pY(E;cZnW#wF5u(yk(PPbCVN7linN)nb(CdcvVoj-!d z9;JX%$M@_`-j&;Aj72ZgI2M#iQi`G_LHt>mR3-m~nEVyF#Q%2{3#5}r@r446RX53+ zqlZmktGESVRhkStN-`-6vmr9^d8c}A&|YDro-(J=j%;8q3Dd!g3~t8R)Yc@SM3p{L 
zBL|V~@N%bALP1-(r;y0dObz}xq@%2MYz;lQA9n~`ea=5+oxqh}YS$iwRf}b>`dgxtuWE3vJex4kAtFOQjM+@VS^QX+w_0<{WhBXDLN zy@TXNFZSk-b|p(*Y(e_NgvH3Vq};MFGotLrf|3c1c>}RaPgUk?lC>OW@=j8^sph8z zUT+1h1@1qvcGOOr)2%PDp>}`+d1Nt*8Y#2f%s_@CifzZ=@s&g^V^I2tkLmfwi50g& z9wlT?k?dSaY9tJqj5t5JRx3$W%wnj5b-38i+JT>yBXcNhKlPy^y&_$-s?)ySB&woY z4d7Qh>*-~40@b4>Z}d~Y08I#T?Qhxbvy-8D@H&q>{$`YnFk$|PRs#*`>Sh*^=5sxz zEy>hKZ#~*??=%rxPq*=dhd{*v^zG9iFXxdT4zg$Zq?6jpp}?4ZSjRK#=cTloQn>Be z_f9hUI8d!_itwqY+QJx4Gc-%~fZS}}hMl>+yvl_NV1?iIEA37GedcQ`lZ*jAPy>Vil|Yyr5k3`j2;4d8T3Z#FS9v_^Q2^)H7PeHGD48wV zON6zTwO7Sb$7MucPqC}nfQoy;CN_s)b*IeRl9w^IU)P|J*}TA)d=GcF3g2c^{cnhn z!NBkARyMtr)zjdYSmT|(&nep`M1$^ij!_-z(j@neXxUE!5SWk`p?VsOFAO@YcW)&Y z^aRzm_!ZxzLF_1R>+L5uXcJ7@(v-bC1nPzhUn`c@%4Zp`Xi*&W2pu6Qx^Q-ry1(IK zkf*+OaQ9X=iz}TU(CJ|}<{F3a&A?SwjY>jfj~t`_oD`J6;Ip-*OJw%cw3w{AwS}K&|ZN;Y6QpisLHYDYTv~Za-AW^ zF#c4|DBj;L-2Ug6t(=X|kg~egPn9FWYNuz~fjWKLGSAzdA?E%j=O%o$bx$I=dS*Q? zU`hM+L;$8W)4+j3k}_iZ?0)=K^Ml>3%DIu!v^wZ{jg_bdzWs8NI>8M_(w%-9E}>WP zfQ=6rlt$SOhKZ1T^21ILOw*EFJRj0IGakLvH9MZMo~sBDS@Tt$J2hGK)KGjS=)nJ= zZ0BbHuiM{GNuM{Nlc&iHNId_9Z`CGPf=v4XCq5>h@bO8&(Gl0ZX|vu}_z`VBgh?M3yp8{8g)p80y`4p5S}jG=bOxq zOq-C{Yoc{5qHyVTDK15P?z(C=^RS=48Lbi@-@|~PY1n_u%Te$@*6U!`4?Vr6^do)G zl5}lJm;-yyw0J@VI}|p>N5XD@COqhQTWMvsRgi)Peo){hGw{{X4^jmin>A(ZzD&+- zWz&upsR))>eqj%_y0!g`(hm1(+7}HZ4{A0*>|W_cKD%_=BqF2g3hIbTHofdc&-qHP zW9wgXDk{Dh2r!I9jR_V)2>Pa;_Aid{^hK}pKFzR}y;7lheM7jT>9ufHjc zzs*fsfj`TXBY5Xetyv-j&rZw)I8(BLjD`$?`^m7}_<9C0Vw7>)d;lrsky6=U(HdTn zDql#I0g!Kv>9?b|*3r#xg5r=o6XW647t&YXPM(PL?`{myJvk6Xr?m;OL|gwX5d0sP zv}i35d0Rwr&72B$Y3)p#*V$!tgl+tg;Z06`qr%ly@#g1OqQsM}MSxtT-N-$Ph*s_C zVXqSn*Jl>DU$l>Ptn4gOrtja4n958EetFQ-*`!qIj`8|rLS4dC+CFZx+#85=V;Qk8oxW4pOre@FaTH_^cN8S zTax)7@4K`D;}|uSZCDajHG<9=&T{U8o(xsfev!n;j1N7+CL{@l2G(%R63Q$J!vR?P z{dA2$3Dm*q;E0QRI_NJ_(MHiIeRs-TtvmHxeG>;97hkk2VEmd{!p>`4nwC|!(jQNk zmyr@SUfPKAhp|ko!*1*p?oeL1;7;LGVJY{gybOXV+DT&1EZ7`zzz{+)_o62N>qV>7 zL)IO-n?EgRC*Up4)d?V-I^C)+2QPM9+B0mr$?PB97F!7-oh_t(vw%@A< 
zx{Apv7=*iKAqB!+VYK2(;yi;?)kDzpe_lRO(@*_W-7BZ!MbuLSm!8Mim|G>_5**#a zryN(BslLZkwxe(U;~q28?BFx}^+zmWw$q01<&!Jjx~XwKdV8@$LaBSgHy#Tx6wJV5 z>$E#G8CFb2){<@KEVExPKq2IX3op-HzbLvSG`@o9~qbhF$*jUMt&|B{BC>4 zValdM*_x*Hiz-{%HI?gEqnH7Uob=kfY>JG~j6lULFk<`nWX(T+pT3lpIpr+x+co%Y|M<|y@$H>cX!eI=fI?*SB0$F=*JPN zGq|_xiWVpU>}B@RqXO=1v=CTJ5o`vvEHKeJqar(@=9C0@HK*&~ z;^zy3YUmg-iqrDFXHHX;D?;|bdVH$I{Z}`_e5YmP=n%@ra-{=Ts>ef=cG|pdwyW-7 z=i@-Y)MLT0g)N}bn|D-l>Y)!XC4|`JA~6zA56!r>KbN?4_j;Pg73+u5pf+y6a8%y1 z(>ya((F!8=hlHW!t!yey=1Yy$+}49_lehFnJ^MBI;xi8(L4N!owzn0WBi#>sai%NO zM$E-$!$q)UF>5rpJyOnV3wq)D4Gt&H>s~PSF+1PE3fVPfp+}1%AE`a zvk2i<|Lmgd)$#BWR~(CGAw(9_Sc6eVM0m<#_DigBcpQpra7eQTV~8QOKM`>x60_=Z92VX5}u2kH=%eXQ9U!s%l-KftGGj zl=c&0v^G;^Rt!?QzJII4pZSACnTlx5?*7F7Ymf~H=b!bC3o2gPUgD}`Pe0qht+9pe zVnu@?I@y4b;#f0OSub&~YYqgr0H|`OG}SaUdLUKr4zg7+UJwEDe<|hu=dVo>4cpG9 z??>I?)Yw$PG}DqU>~v-9XQsO_zB~SnM17jehzC#(uJ1Y1o3uY(TwYpn!AZtI!Q=WDNxir+>y1twm`|q#)0C)9{-GqNw95dX#^h z#32QZ1`njqO3A~7l=h6=4J-*A>>@q-9rI0AmDQ4lT9rHS#ER6B2F*ZT-ZplbSyKbk zGa2y%Yh5@|l+0-R17Dk|Sl^t{NzD@LFGCL=0d{)!T>Vp4XVM;H;t)=1=2U4nzhT@( zX0j`zeLO$Csh$`%hah}9J?-tj&Zm7D8?9Ko@~#C6tk|wNk1C+CUS1mkm^gbMPe3+D z6%%Kx|0R{Wu}a+<)1YDpSY1d}d*IH94g{1?qkgAl7P3aowoM6>-KOkwVigb6X9{ja z!rozDynU-gn|8q$Cj4=7jdxFmlrp7!&)dsKb`&AfX=ywBP^Rsbn$2cDF?T99wn;A7 zb@sx@iCm8-beF5*(PZ!a674IXI8(6cspXJtLRZo;E;A_;PN7W*@iRn+HscDFKf2oA z{l>Gr_Jz&O;=Lal=2X|_(Z1`ZzugS~2b3Abe`S9R*p*-vS|PTe`C7!MJ-hvEvhIx@ zPlAWxzGNd zXZE2!Q&~yhg(uncO>|+jgq>C_lf6f%=_B6|cSwr+cm&_(T>MD@N_Is_LV#`^6Lc^y zwBDWK)FVQJ%4R|7#(1rfH$K1^jNZp5W27RMQ?OL2z=uWMoNAk}fF7=_ z{tl|<^jv%*YjAl-LSi*zelnSvBX1Bs-NqO%?n;Ec2n9+Q%2Og;uLnT>q!wXEo%$*5 zc|`MyY%wjU+?nBk<4b+*pd*@k6t99AjXE0wde0`vF=4`mcL9l2!Rc+c@>8A&@z|UQ z-NyEMT{Ny6SQo2oh+Hd(Pn8mB#V(7^yjLe(zdL{Y$#KWhGIg7Uh&4R;P!EB@z+UN{ z&L+;vBg+mANioxPn@|X8mlpb1siukM$qyia$!r38cqt#VA<5=6RdZV}^*kN9N-{@;5;>QMQfvC1VllWju&}rWFk%`px zasT)fm_XM323`LdS=Z&=W!>XLwLz|`@5EpCxV|WGLj$@-e~J#4C9};bpU`IioceVy z5nsOHdXi^n6zG4n>l0q3tbVGmWs)uI8a{idB2MbM!A*?x1869I)ou0u9<4i^YE5Lg 
z5@GnQ()mSnQZh3Tra;I#puaQ=7i{I6+wa@HasCDRe2vmts2Q1^VNQN~!kAv`F?RS4 zAlU+hG6Vuh_e9chFBH$76+B;#tahgT0GYEdvsr}($RMd7t8W|8Hw=F+Ojzt;{jnRn zfgOA=Nx5NmAjo_55YRS12*^F4=Ph_pZ6Vy%Y`mDw+G_z~F@EUwc;}A+&@jdiX)nyn zeQY)^2Wx3IY{1K2M)ZC7!}s+n93Y*+VxtnMGfG=20$f$ejZFThT-8lxS3jRXzjyxp zZKjduv_Eb0Cemnz~mu>!4 zZd1Xh*8v|FCP;h#*!3iR^34&DR}k`|ZDX_&m?`Zf)V)8pqeGVIm&|4n*N(L|Aya-Y z8fvjdfS+YeXkcEHa49eR$fZKEoOf1>w)8{}>U3`Z3!OIfbH$qeo7CV^R5G&3a-K<7 z89~OK8JxENLK9L-(SeH*iAo!Z7cZ*?@I8N{b5r@~s^`JTgf({<`GLIy7bdn_AyxN)* ztUjIJjP^M(=xDc?x?$`8pc#kKjM%o^PpdFPIkA@a7Mqovrab;4 zgE$F~-G66_5AX`twTNu_lPekZ6&h=ppmG@|}W2$*yS<1)3R5pqPCllQo5?XA8UL`sp{^Ma(W zw@$4385I_Sb>GGp2miUbZq87?>X&3ZZg!-5qd4vhW=U5q;ngAVGLF1Rxl6H+3Xsnw zb5z0VFTH;y920)JkvX_~HN{m{>xhR4bN!3>*#kop>~>~K_rd8J!ndaZcJJSPa6{Y-#eabl%p*qa zSnp)9dIyvsVz@`6cxFe(qkEA&r07ev_)5gny&|S+j#{_ld;K&&7mWM!T2kObW>A(# z<}`-@N>*!Z)|89MEqvVN5D?pMLF5gDofTN?XUj@qw2n166C)vCvxp*_C%ekzXh z<)xWdM{8E#ce4l=h2xXSx}-P$Iij?HhX98?e0Q<=W1*Z~W8NX~_QKdc z5he`Im82Zdnw z!AMce(%0$fOmk#i7s_!qX8umV6;-TO)4;ci9Vx{N@tS%dFA7ecKZQXm0XdO_XkbhD zGuFyfV&-FvOxBOkIN_a<2tzHQ2!(S)Pxo*GS>ynl4H?4w*;OY^0{KT*&!@x#P57W$ zo1P;kd_{pC~xejUr^nMyeM4nG_iMuZ18l3whPPd_~d*Vc-r+u zFbyz0QAnXMJfO4u8*Ift7!v=LVf5!~Xwl^^a;dk$^0mOo;^D6nKRhiFP+F|x7w!_~x$5f3XkrylQAFGd~yE*%V%xGqPqVI9s^^Ra#&U$|? z`qG|9EsBs2o~L+~Y0Os^L&|Kd3CPrl4G z&@Anoc$>#7lgvJwY5oF8XVwdqTb#oo?>et~av7<;x=u^TW=&og`X=Ih{C@hfbnU%e zHn_=TuyfngsbJ^mhDCn`Oozjz^>%9kF(x zG#2K4$<+(!gUfOJ^Hdj zmOX!TO1xYym%kUd;-L2oXF9Hl2%U0KO7&~V02WT>v?`?pap{zyw}mPbeT8XIsw;P< zaGJvrp>_O)uS~BwcH&T~C682%vYlTEARMMU-xV_c0^}I>wIFecpLNxZtz&EejlMT z_oGA3ce~BOga_|U-L}NfHMeGwWS%#q5e#Nq-j2&CVVeBmj%2z0x+Y$-rTdWau#iN= zj~_x#+G@b=8Fc3>;)A$CeIY@+_ppfin+3bLXUbJO^p6W3SS-gg1Exv(dIs~XcFJfV zMKm6r7|6d!aUN|KFEAAO%PZ3SXHtBzFJ3BvgQra;o5jDoE@v^5ZNw+e>qtv)#sN65 zdCGTn^U0kNuK0+TQsv*la2SEtXVOfBDtLEEo94YIG&-$ktd-wO5yc zgmwaOZ4jJR&-^#f6MvpBe(jUi<|1Klh6O`dajY7tsDC1o!OeW|^20V8ssx`I@dCnZSKY1rt2c?%Pr&5N+un$? 
zS#Q!i>9l0X%0rcfV(i}cISHX>RI^#F{}9NX~Ooi9c2uGZr(P9StxqUMc;=#Lk-5_qecbO+E zAd&uD{wf$YGeUI12Zd{%HqmKyDIT|~Z>e6r<9jA*4 zQM@3}Wi=?p{)iuwx4u-m%JC6}$P8c;6#gIw4pa%xa!gFUza|X@U=Ea455Q?YDm~1G zFeJKPtU4{I>xcLnHjotx?AwvRv>UZgd4#k!&$+2*G^fN7t66s@tGGdZLEOWxfJP&) z{4nJ5TvQd}$0!x0y&7v=r5W@1@!i1H?blLo<=^T75oTO=&-0fP-%J>&XwG_#c(le} z+r+l?fzCNttB4kY$Q!U^r@(4xd-)&8tgPQ@CRp!_n@P)YcPl18qj0_;Nx3<}qGzni zIWx%=;Tb_eK3{o%HPF!cNz49N4c4)u`N14zyXgDaT&cI&jGLBHZ+}L!#-~9whkFjF z(XFsBf-(MWF#0Q*TH_b8EfWZ+tjXq9I+f2r}B=@l==Sd>&cE3r?EXp^7)ASa9*WeBQe<<8}d4tY8gZ@ z6#he!_Sead|0+rHOV?3s6>;b5H+t%U36*H{Ui~)Xe$^aFBAhi=L+|FGyX6)A`Wj2b zG+VDRW{w4%nqDTxHghOO+}nk!Wv2KG^uJYzxoNnIx8t8vB3;^JMtRktCsvLA~tp{&`~Wg^Gg6rX@MeSpu^A9#Jo*yc=#WrFeE$ytgmzn z=GZgHiqkQoTUW&B(NeET;ARKkwnbBup6?1VFpqSRzo_JSjO;_*u1#idW!GG+LTGOfiPr<~8GKmPdHhIyL!!vPqL zDX-!m!9n9r3qnQEAf_P%6YMm1^8dAWr9n+?={AUhh=2lu%t#m<(Z+-^OjkhyLSz;M zf&xN_0wIht1Vv^9LBMNJ20=g>A_y|00RrenP>4jsoB%S&kjoSy!FTZ0eckoycE9_& zs&7|U zy@^3a@x^FCujtfY(Mm*t^}p7Oa^xzjb~K)S0(6ag2wKOIPDFy@mDO-do<5zJ5ALTi`DVFl_MZOpgU0BuNdEAErE^J_jh{0P+^*964zhcJ zHIR)q5j3}Z*&MrYYUyoy{&d&p1KMc@!YLWmiT-~~v;W@=z$G*c{Z*VLDo1aUfVy`{-~6_HQWO7Cb-28Ad`>1ox~ zX&mn}#U4`mMiwnDSNgZ{>JQ${kTgBaHB0_=0)JcDx@1R4afp+)`hbkklgPU`qSklN zb*WSVByp^Fzw?7@yZJzAmCKT>!EypWdU1R>%z}LRO1k4)>3tD^eic?pmT#_(rSS?I z9gVf)DN8twl-TN!duef#0dAG_Nx%h4-*{*uD&})>!~Qd9(ohyS!?az_LDAD}-;<|9 z{#pUO1qyWWK%0*zv)=Z=57Kv(yUMP%g`fGzO0tWQdI$txEh7)k$x-i5CUjMp&Yp5g zyJUUo0iTXkVI>B+CoXvqK~w9mzeSf=xHymHiEid|~raN>^B6(SO4Vbi|?NgCaJ0pEM~yf;qnvR?3=8yrCq?RKnS_Q)Ux~|m`!%w7%l+`9 zoowyT4%a9-?Uam=V!Ghd(#GBH)S@U|A<`@1uL0EoUBNR%0(hWB!DZI!R16-X`@*St zxO+Vvl3pLASCkJ*^#5l~O$RB7*K8^0)-U`Es)Tf{*oeR7Rn)ej$1%~e$>t!BYzrE^yNA;^1kG#hthRZMGxWQk>3Gr*8 z)Zs_>XQqpHhwzf&9(!&hL3&{*?bpN7AD(1|o;&%&KjEI&bwYgX<#;#*8Zs*J2#VDKBz@ zSsBVlJo6KRZWe5cY<;8#cM`Ym zR!v1IKfWuS-1q)RBuep9mlI`za3U#GBff&VHnZIIE?D;HD}jtNeRVn?ngCK|G42X@ z44r#Z{+T{D;LZrl6AW6=`_W1^NOgUKrUgU+zRGBfI8WuGJTLT>|D%wyq?#Tt?%cXs z=>g|a~zd&2ld(mzwe&gv_}Vp?qeq1NbQVb>MZMWrs&ajgKb%{r$bThr=T_5iYYi 
z*DJDM@eRH*b8V~Uk?Z#Lp+275$Fk6$4Z#o|x+s_OV#nQ_qq=0DJP`i;hd*u$)81~u zQZg(J_9&7}UWDcL( z{)s~@}fJZ>FZc{iWm2s(V`iCGW-}rqs>Jv z#=Q@U4$EtkKbA}}X-~o8LD@i$eCJ}Cu{j97-8eVOObg`1Mk#g`C=|STVvPr-CmQ}J z*pKL3-~w(q_0X~*N-oVfq)3dUyyV3tBmrGsoX0TMje&F*x>fqrn9|A;T-sYu&}59_ z%%IbI<68KcrpwkJYj)?Q?aMH}oQ_`VT+3q@t>&g~9cyT;t4r7Yi}W}3%{nkp8Fq$y z(98&;bL-Sh@*PEr@QZXrvnx5pzoX#)K8*RZ8s`{V;^7?zen$*fN~%z{&%L;3MnCnh zm3)lc$sFj@@_9HTqG|ur5Eq_JW?~t_ZM2iI#{w~G`e&F zUXi@!B2_!`kdP~pCq!tnrAG+0-fg0`lE{Tr-(_QfCWmyZCyi>);(*JEWL5hK+c$D%6?G;BvE`zLzQgUct$B`LdP7xBtZ4| z4?#9!hVH9&DmGrN@_LiUYi+JN>*K}6H}{*b;{PMf{Lj4rQSV3wF1(h;MnIlLe*Cnz z$%111q)f&}bYc|DN})m0WzlM@&nbeU6WGjV|LXDznHG^Lgs>;kiJi9tRBqcmW_IrSrSV6Ji)ng!ii*C0!n{^9VS90}rf609wH9`5Nq);6mzy76 z`a<)}yw=d9z7aik&>?E7CAHs|M!s%E6Nf8cGCf=k^Z4D;e(SN}p9a)l(FMS5V)N?7 zAz76rZ;qp<5XrbDOGKQU;8&Hc-jC(Lr))IEHY@Rvq98lid`-U`Vn*&}>{{KmHCv5ky!w7*NtcaiGG?%{Q@RjbB!w^w6C)9#yBzCD!wH z`VF<{7hMHL!UuYpD)#d)#A+|| z-iE;g#26US6X1vqwsLiLPqGQ`+Z&Qwmtjzv+2S7W_ui7>i#wO<0^K#`6WRXk1pTr6 ztQJ^pKMPDli3u1UaDxxm->ZdiZ77#_I-bYJ_P96JI=Edc*c)y2P;iAyhRFV~T2>^{ z<2~8_CC#Ze35G6UUVT#m+6d$>j3 zuf+;`OsSe0F{51UzU`AQ1_QC3PV`v&r5YGXIcn1)$ z(S2+!>7?M%uBH;#P{J*iJ;T0zM6Ds++5Tc(gCM-Ep5*^dKK!9OP|cwjQnb-{Jw}(SE5LLGuw`>A|BLqR?euXjLIE@P!g~++HC0)v zjh4MJ@xa^&FZ+25IKYS_+qv*l-pqA>w0zjWzYfjwAUfgRW)s3XT zg8-Y~-}H;CVLFe0eq1wxfnBR*bdt>8xaZo`Kac6%h*vsMU`M+85XZW|`aASs zZI`y~`97*o8q#u5lJ>ClPN)Dd2jaE!Ux~wQkpD;jGA&NSfuu6R&4FZcSjn(~@WO`G z)q-K;Sx^uP5E1^?5%muYMt@k-(^@&7+Wm$34*x1+~5Nc{_ddMMe!Z1v!RMuOF=lN(f zdUtn6({xc&TU%RiZ*N_N8X6i>sZ=aSC=?P0EEY>$g&+vZzb}V^|US4JxCcpP=Z*Q-zt_pXfR4S!X)$v?fTwE;79gD?a z7zRP`{{CL*)ZgD$e1VP8g$3mM-CSx|6g_)6&k!&`bZ@s;}wcG7Zr_*FI)!qg{kn{8NqH}a} zb5oq$-`_9j5(KfZu&}cDL&73W(+-CtolX~pDT;ErTOH@loJ%xh^g)#Ce*gwOZ}?__#u$`276jIIc5? z-EJQo91H{kCC}qZv0ANfZ*SsDPcRsi=mkO0*Vh-%^N~m-5C~8d6_3YRmgRXq6bc!Q z#wu+L217|3j^l=hhZPEiQmNe7*oZ_T(f^Z3B=X5@Hk-+0IF1u`pP!$vQV7R!u?^4j zJ3Bl10ES^D0}`+-+u7MEF{#w<{{8(eE;0;rb#>L(*9U@NGMW6VtN1aZGAjr$hOE?_X>-8p+slt-6+A!!U(P zGMRjMc+lx|P>zp>oJV e<@hfE0RR7hy@C12{34eC0000 + +
+ +OpenAudio + +
+ +Advanced Text-to-Speech Model Series -!!! warning - We assume no responsibility for any illegal use of the codebase. Please refer to the local laws regarding DMCA (Digital Millennium Copyright Act) and other relevant laws in your area.
- This codebase is released under Apache 2.0 license and all models are released under the CC-BY-NC-SA-4.0 license. +Try it now: Fish Audio Playground | Learn more: OpenAudio Website -## Requirements + -- GPU Memory: 12GB (Inference) -- System: Linux, Windows +--- -## Setup +!!! warning "Legal Notice" + We assume no responsibility for any illegal use of the codebase. Please refer to the local laws regarding DMCA (Digital Millennium Copyright Act) and other relevant laws in your area. + + **License:** This codebase is released under Apache 2.0 license and all models are released under the CC-BY-NC-SA-4.0 license. -First, we need to create a conda environment to install the packages. +## **Introduction** -```bash +We are excited to announce that we have rebranded to **OpenAudio** - introducing a brand new series of advanced Text-to-Speech models that builds upon the foundation of Fish-Speech with significant improvements and new capabilities. -conda create -n fish-speech python=3.12 -conda activate fish-speech +**Openaudio-S1-mini**: [Video](To Be Uploaded); [Hugging Face](https://huggingface.co/fishaudio/openaudio-s1-mini); -pip install sudo apt-get install portaudio19-dev # For pyaudio -pip install -e . # This will download all rest packages. +**Fish-Speech v1.5**: [Video](https://www.bilibili.com/video/BV1EKiDYBE4o/); [Hugging Face](https://huggingface.co/fishaudio/fish-speech-1.5); -apt install libsox-dev ffmpeg # If needed. +## **Highlights** ✨ + +### **Emotion Control** +OpenAudio S1 **supports a variety of emotional, tone, and special markers** to enhance speech synthesis: + +- **Basic emotions**: +``` +(angry) (sad) (excited) (surprised) (satisfied) (delighted) +(scared) (worried) (upset) (nervous) (frustrated) (depressed) +(empathetic) (embarrassed) (disgusted) (moved) (proud) (relaxed) +(grateful) (confident) (interested) (curious) (confused) (joyful) ``` -!!! 
warning - The `compile` option is not supported on windows and macOS, if you want to run with compile, you need to install trition by yourself. +- **Advanced emotions**: +``` +(disdainful) (unhappy) (anxious) (hysterical) (indifferent) +(impatient) (guilty) (scornful) (panicked) (furious) (reluctant) +(keen) (disapproving) (negative) (denying) (astonished) (serious) +(sarcastic) (conciliative) (comforting) (sincere) (sneering) +(hesitating) (yielding) (painful) (awkward) (amused) +``` -## Acknowledgements +- **Tone markers**: +``` +(in a hurry tone) (shouting) (screaming) (whispering) (soft tone) +``` -- [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2) -- [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2) -- [GPT VITS](https://github.com/innnky/gpt-vits) -- [MQTTS](https://github.com/b04901014/MQTTS) -- [GPT Fast](https://github.com/pytorch-labs/gpt-fast) -- [Transformers](https://github.com/huggingface/transformers) -- [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS) +- **Special audio effects**: +``` +(laughing) (chuckling) (sobbing) (crying loudly) (sighing) (panting) +(groaning) (crowd laughing) (background laughter) (audience laughing) +``` + +You can also use Ha,ha,ha to control, there's many other cases waiting to be explored by yourself. + +### **Excellent TTS quality** + +We use Seed TTS Eval Metrics to evaluate the model performance, and the results show that OpenAudio S1 achieves **0.008 WER** and **0.004 CER** on English text, which is significantly better than previous models. 
(English, auto eval, based on OpenAI gpt-4o-transcribe, speaker distance using Revai/pyannote-wespeaker-voxceleb-resnet34-LM) + +| Model | Word Error Rate (WER) | Character Error Rate (CER) | Speaker Distance | +|-------|----------------------|---------------------------|------------------| +| **S1** | **0.008** | **0.004** | **0.332** | +| **S1-mini** | **0.011** | **0.005** | **0.380** | + +### **Two Type of Models** + +| Model | Size | Availability | Features | +|-------|------|--------------|----------| +| **S1** | 4B parameters | Avaliable on [fish.audio](fish.audio) | Full-featured flagship model | +| **S1-mini** | 0.5B parameters | Avaliable on huggingface [hf space](https://huggingface.co/spaces/fishaudio/openaudio-s1-mini) | Distilled version with core capabilities | + +Both S1 and S1-mini incorporate online Reinforcement Learning from Human Feedback (RLHF). + +## **Features** + +1. **Zero-shot & Few-shot TTS:** Input a 10 to 30-second vocal sample to generate high-quality TTS output. **For detailed guidelines, see [Voice Cloning Best Practices](https://docs.fish.audio/text-to-speech/voice-clone-best-practices).** + +2. **Multilingual & Cross-lingual Support:** Simply copy and paste multilingual text into the input box—no need to worry about the language. Currently supports English, Japanese, Korean, Chinese, French, German, Arabic, and Spanish. + +3. **No Phoneme Dependency:** The model has strong generalization capabilities and does not rely on phonemes for TTS. It can handle text in any language script. + +4. **Highly Accurate:** Achieves a low CER (Character Error Rate) of around 0.4% and WER (Word Error Rate) of around 0.8% for Seed-TTS Eval. + +5. **Fast:** With fish-tech acceleration, the real-time factor is approximately 1:5 on an Nvidia RTX 4060 laptop and 1:15 on an Nvidia RTX 4090. + +6. **WebUI Inference:** Features an easy-to-use, Gradio-based web UI compatible with Chrome, Firefox, Edge, and other browsers. + +7. 
**GUI Inference:** Offers a PyQt6 graphical interface that works seamlessly with the API server. Supports Linux, Windows, and macOS. [See GUI](https://github.com/AnyaCoder/fish-speech-gui). + +8. **Deploy-Friendly:** Easily set up an inference server with native support for Linux, Windows (MacOS comming soon), minimizing speed loss. + +## **Disclaimer** + +We do not hold any responsibility for any illegal usage of the codebase. Please refer to your local laws about DMCA and other related laws. + +## **Media & Demos** + +#### 🚧 Coming Soon +Video demonstrations and tutorials are currently in development. + +## **Documentation** + +### Quick Start +- [Build Environment](en/install.md) - Set up your development environment +- [Inference Guide](en/inference.md) - Run the model and generate speech + + +## **Community & Support** + +- **Discord:** Join our [Discord community](https://discord.gg/Es5qTB9BcN) +- **Website:** Visit [OpenAudio.com](https://openaudio.com) for latest updates +- **Try Online:** [Fish Audio Playground](https://fish.audio) diff --git a/docs/en/inference.md b/docs/en/inference.md index a1eb81d..4d9cf17 100644 --- a/docs/en/inference.md +++ b/docs/en/inference.md @@ -34,9 +34,7 @@ python fish_speech/models/text2semantic/inference.py \ --text "The text you want to convert" \ --prompt-text "Your reference text" \ --prompt-tokens "fake.npy" \ - --checkpoint-path "checkpoints/openaudio-s1-mini" \ - --num-samples 2 \ - --compile # if you want a faster speed + --compile ``` This command will create a `codes_N` file in the working directory, where N is an integer starting from 0. @@ -50,15 +48,12 @@ This command will create a `codes_N` file in the working directory, where N is a ### 3. Generate vocals from semantic tokens: -#### VQGAN Decoder - !!! 
warning "Future Warning" We have kept the interface accessible from the original path (tools/vqgan/inference.py), but this interface may be removed in subsequent releases, so please change your code as soon as possible. ```bash python fish_speech/models/dac/inference.py \ -i "codes_0.npy" \ - --checkpoint-path "checkpoints/openaudiio-s1-mini/codec.pth" ``` ## HTTP API Inference diff --git a/docs/en/install.md b/docs/en/install.md new file mode 100644 index 0000000..6830156 --- /dev/null +++ b/docs/en/install.md @@ -0,0 +1,31 @@ +## Requirements + +- GPU Memory: 12GB (Inference) +- System: Linux, WSL + +## Setup + +First you need install pyaudio and sox, which is used for audio processing. + +``` bash +apt install portaudio19-dev libsox-dev ffmpeg +``` + +### Conda + +```bash +conda create -n fish-speech python=3.12 +conda activate fish-speech + +pip install -e . +``` + +### UV + +```bash + +uv sync --python 3.12 +``` + +!!! warning + The `compile` option is not supported on windows and macOS, if you want to run with compile, you need to install trition by yourself. diff --git a/docs/ja/index.md b/docs/ja/index.md index bd937d7..bbb66c7 100644 --- a/docs/ja/index.md +++ b/docs/ja/index.md @@ -1,4 +1,14 @@ -# 紹介 +# OpenAudio (旧 Fish-Speech) + +
+ +
+ +OpenAudio + +
+ +先進的なText-to-Speechモデルシリーズ -!!! warning - このコードベースの違法な使用について、当方は一切の責任を負いません。お住まいの地域のDMCA(デジタルミレニアム著作権法)およびその他の関連法規をご参照ください。
- このコードベースはApache 2.0ライセンスの下でリリースされ、すべてのモデルはCC-BY-NC-SA-4.0ライセンスの下でリリースされています。 +今すぐ試す: Fish Audio Playground | 詳細情報: OpenAudio ウェブサイト -## システム要件 +
-- GPU メモリ:12GB(推論) -- システム:Linux、Windows +--- -## セットアップ +!!! warning "法的通知" + このコードベースの違法な使用について、当方は一切の責任を負いません。お住まいの地域のDMCA(デジタルミレニアム著作権法)およびその他の関連法規をご参照ください。 + + **ライセンス:** このコードベースはApache 2.0ライセンスの下でリリースされ、すべてのモデルはCC-BY-NC-SA-4.0ライセンスの下でリリースされています。 -まず、パッケージをインストールするためのconda環境を作成する必要があります。 +## **紹介** -```bash +私たちは **OpenAudio** への改名を発表できることを嬉しく思います。Fish-Speechを基盤とし、大幅な改善と新機能を加えた、新しい先進的なText-to-Speechモデルシリーズを紹介します。 -conda create -n fish-speech python=3.12 -conda activate fish-speech +**Openaudio-S1-mini**: [動画](アップロード予定); [Hugging Face](https://huggingface.co/fishaudio/openaudio-s1-mini); -pip install sudo apt-get install portaudio19-dev # pyaudio用 -pip install -e . # これにより残りのパッケージがすべてダウンロードされます。 +**Fish-Speech v1.5**: [動画](https://www.bilibili.com/video/BV1EKiDYBE4o/); [Hugging Face](https://huggingface.co/fishaudio/fish-speech-1.5); -apt install libsox-dev ffmpeg # 必要に応じて。 +## **ハイライト** ✨ + +### **感情制御** +OpenAudio S1は**多様な感情、トーン、特殊マーカーをサポート**して音声合成を強化します: + +- **基本感情**: +``` +(angry) (sad) (excited) (surprised) (satisfied) (delighted) +(scared) (worried) (upset) (nervous) (frustrated) (depressed) +(empathetic) (embarrassed) (disgusted) (moved) (proud) (relaxed) +(grateful) (confident) (interested) (curious) (confused) (joyful) ``` -!!! 
warning - `compile`オプションはWindowsとmacOSでサポートされていません。compileで実行したい場合は、tritionを自分でインストールする必要があります。 +- **高度な感情**: +``` +(disdainful) (unhappy) (anxious) (hysterical) (indifferent) +(impatient) (guilty) (scornful) (panicked) (furious) (reluctant) +(keen) (disapproving) (negative) (denying) (astonished) (serious) +(sarcastic) (conciliative) (comforting) (sincere) (sneering) +(hesitating) (yielding) (painful) (awkward) (amused) +``` -## 謝辞 +- **トーンマーカー**: +``` +(in a hurry tone) (shouting) (screaming) (whispering) (soft tone) +``` -- [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2) -- [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2) -- [GPT VITS](https://github.com/innnky/gpt-vits) -- [MQTTS](https://github.com/b04901014/MQTTS) -- [GPT Fast](https://github.com/pytorch-labs/gpt-fast) -- [Transformers](https://github.com/huggingface/transformers) -- [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS) +- **特殊音響効果**: +``` +(laughing) (chuckling) (sobbing) (crying loudly) (sighing) (panting) +(groaning) (crowd laughing) (background laughter) (audience laughing) +``` + +Ha,ha,haを使用してコントロールすることもでき、他にも多くの使用法があなた自身の探索を待っています。 + +### **優秀なTTS品質** + +Seed TTS評価指標を使用してモデルのパフォーマンスを評価した結果、OpenAudio S1は英語テキストで**0.008 WER**と**0.004 CER**を達成し、以前のモデルより大幅に改善されました。(英語、自動評価、OpenAI gpt-4o-転写に基づく、話者距離はRevai/pyannote-wespeaker-voxceleb-resnet34-LM使用) + +| モデル | 単語誤り率 (WER) | 文字誤り率 (CER) | 話者距離 | +|-------|----------------------|---------------------------|------------------| +| **S1** | **0.008** | **0.004** | **0.332** | +| **S1-mini** | **0.011** | **0.005** | **0.380** | + +### **2つのモデルタイプ** + +| モデル | サイズ | 利用可能性 | 特徴 | +|-------|------|--------------|----------| +| **S1** | 40億パラメータ | [fish.audio](fish.audio) で利用可能 | 全機能搭載のフラッグシップモデル | +| **S1-mini** | 5億パラメータ | huggingface [hf space](https://huggingface.co/spaces/fishaudio/openaudio-s1-mini) で利用可能 | コア機能を備えた蒸留版 | + +S1とS1-miniの両方にオンライン人間フィードバック強化学習(RLHF)が組み込まれています。 + +## **機能** + +1. 
**ゼロショット・フューショットTTS:** 10〜30秒の音声サンプルを入力するだけで高品質なTTS出力を生成します。**詳細なガイドラインについては、[音声クローニングのベストプラクティス](https://docs.fish.audio/text-to-speech/voice-clone-best-practices)をご覧ください。** + +2. **多言語・言語横断サポート:** 多言語テキストを入力ボックスにコピー&ペーストするだけで、言語を気にする必要はありません。現在、英語、日本語、韓国語、中国語、フランス語、ドイツ語、アラビア語、スペイン語をサポートしています。 + +3. **音素依存なし:** このモデルは強力な汎化能力を持ち、TTSに音素に依存しません。あらゆる言語スクリプトのテキストを処理できます。 + +4. **高精度:** Seed-TTS Evalで低い文字誤り率(CER)約0.4%と単語誤り率(WER)約0.8%を達成します。 + +5. **高速:** fish-tech加速により、Nvidia RTX 4060ラップトップでリアルタイム係数約1:5、Nvidia RTX 4090で約1:15を実現します。 + +6. **WebUI推論:** Chrome、Firefox、Edge、その他のブラウザと互換性のあるGradioベースの使いやすいWebUIを備えています。 + +7. **GUI推論:** APIサーバーとシームレスに連携するPyQt6グラフィカルインターフェースを提供します。Linux、Windows、macOSをサポートします。[GUIを見る](https://github.com/AnyaCoder/fish-speech-gui)。 + +8. **デプロイフレンドリー:** Linux、Windows、MacOSの native サポートで推論サーバーを簡単にセットアップし、速度低下を最小化します。 + +## **免責事項** + +コードベースの違法な使用について、当方は一切の責任を負いません。お住まいの地域のDMCAやその他の関連法律をご参照ください。 + +## **メディア・デモ** + +#### 🚧 近日公開 +動画デモとチュートリアルは現在開発中です。 + +## **ドキュメント** + +### クイックスタート +- [環境構築](install.md) - 開発環境をセットアップ +- [推論ガイド](inference.md) - モデルを実行して音声を生成 + +## **コミュニティ・サポート** + +- **Discord:** [Discordコミュニティ](https://discord.gg/Es5qTB9BcN)に参加 +- **ウェブサイト:** 最新アップデートは[OpenAudio.com](https://openaudio.com)をご覧ください +- **オンライン試用:** [Fish Audio Playground](https://fish.audio) diff --git a/docs/ja/inference.md b/docs/ja/inference.md index db4132e..8cbde0d 100644 --- a/docs/ja/inference.md +++ b/docs/ja/inference.md @@ -34,9 +34,7 @@ python fish_speech/models/text2semantic/inference.py \ --text "変換したいテキスト" \ --prompt-text "参照テキスト" \ --prompt-tokens "fake.npy" \ - --checkpoint-path "checkpoints/openaudio-s1-mini" \ - --num-samples 2 \ - --compile # より高速化を求める場合 + --compile ``` このコマンドは、作業ディレクトリに `codes_N` ファイルを作成します(Nは0から始まる整数)。 @@ -50,15 +48,12 @@ python fish_speech/models/text2semantic/inference.py \ ### 3. セマンティックトークンから音声を生成: -#### VQGANデコーダー - !!! 
warning "将来の警告" 元のパス(tools/vqgan/inference.py)からアクセス可能なインターフェースを維持していますが、このインターフェースは後続のリリースで削除される可能性があるため、できるだけ早くコードを変更してください。 ```bash python fish_speech/models/dac/inference.py \ - -i "codes_0.npy" \ - --checkpoint-path "checkpoints/openaudiio-s1-mini/codec.pth" + -i "codes_0.npy" ``` ## HTTP API推論 @@ -103,5 +98,3 @@ python -m tools.run_webui !!! note `GRADIO_SHARE`、`GRADIO_SERVER_PORT`、`GRADIO_SERVER_NAME` などのGradio環境変数を使用してWebUIを設定できます。 - -お楽しみください! diff --git a/docs/ja/install.md b/docs/ja/install.md new file mode 100644 index 0000000..5d815ab --- /dev/null +++ b/docs/ja/install.md @@ -0,0 +1,30 @@ +## システム要件 + +- GPU メモリ:12GB(推論) +- システム:Linux、WSL + +## セットアップ + +まず、音声処理に使用される pyaudio と sox をインストールする必要があります。 + +``` bash +apt install portaudio19-dev libsox-dev ffmpeg +``` + +### Conda + +```bash +conda create -n fish-speech python=3.12 +conda activate fish-speech + +pip install -e . +``` + +### UV + +```bash +uv sync --python 3.12 +``` + +!!! warning + `compile` オプションは Windows と macOS でサポートされていません。compile で実行したい場合は、triton を自分でインストールする必要があります。 diff --git a/docs/ko/index.md b/docs/ko/index.md index 612d7b8..15cf280 100644 --- a/docs/ko/index.md +++ b/docs/ko/index.md @@ -1,4 +1,14 @@ -# 소개 +# OpenAudio (구 Fish-Speech) + +
+ +
+ +OpenAudio + +
+ +고급 텍스트-음성 변환 모델 시리즈 -!!! warning - 코드베이스의 불법적인 사용에 대해서는 일체 책임을 지지 않습니다. 귀하의 지역의 DMCA(디지털 밀레니엄 저작권법) 및 기타 관련 법률을 참고하시기 바랍니다.
- 이 코드베이스는 Apache 2.0 라이선스 하에 배포되며, 모든 모델은 CC-BY-NC-SA-4.0 라이선스 하에 배포됩니다. +지금 체험: Fish Audio Playground | 자세히 알아보기: OpenAudio 웹사이트 -## 시스템 요구사항 +
-- GPU 메모리: 12GB (추론) -- 시스템: Linux, Windows +--- -## 설치 +!!! warning "법적 고지" + 코드베이스의 불법적인 사용에 대해서는 일체 책임을 지지 않습니다. 귀하의 지역의 DMCA(디지털 밀레니엄 저작권법) 및 기타 관련 법률을 참고하시기 바랍니다. + + **라이선스:** 이 코드베이스는 Apache 2.0 라이선스 하에 배포되며, 모든 모델은 CC-BY-NC-SA-4.0 라이선스 하에 배포됩니다. -먼저 패키지를 설치하기 위한 conda 환경을 만들어야 합니다. +## **소개** -```bash +저희는 **OpenAudio**로의 브랜드 변경을 발표하게 되어 기쁩니다. Fish-Speech를 기반으로 하여 상당한 개선과 새로운 기능을 추가한 새로운 고급 텍스트-음성 변환 모델 시리즈를 소개합니다. -conda create -n fish-speech python=3.12 -conda activate fish-speech +**Openaudio-S1-mini**: [동영상](업로드 예정); [Hugging Face](https://huggingface.co/fishaudio/openaudio-s1-mini); -pip install sudo apt-get install portaudio19-dev # pyaudio용 -pip install -e . # 나머지 모든 패키지를 다운로드합니다. +**Fish-Speech v1.5**: [동영상](https://www.bilibili.com/video/BV1EKiDYBE4o/); [Hugging Face](https://huggingface.co/fishaudio/fish-speech-1.5); -apt install libsox-dev ffmpeg # 필요한 경우. +## **주요 특징** ✨ + +### **감정 제어** +OpenAudio S1은 **다양한 감정, 톤, 특수 마커를 지원**하여 음성 합성을 향상시킵니다: + +- **기본 감정**: +``` +(angry) (sad) (excited) (surprised) (satisfied) (delighted) +(scared) (worried) (upset) (nervous) (frustrated) (depressed) +(empathetic) (embarrassed) (disgusted) (moved) (proud) (relaxed) +(grateful) (confident) (interested) (curious) (confused) (joyful) ``` -!!! warning - `compile` 옵션은 Windows와 macOS에서 지원되지 않습니다. compile로 실행하려면 trition을 직접 설치해야 합니다. 
+- **고급 감정**: +``` +(disdainful) (unhappy) (anxious) (hysterical) (indifferent) +(impatient) (guilty) (scornful) (panicked) (furious) (reluctant) +(keen) (disapproving) (negative) (denying) (astonished) (serious) +(sarcastic) (conciliative) (comforting) (sincere) (sneering) +(hesitating) (yielding) (painful) (awkward) (amused) +``` -## 감사의 말 +- **톤 마커**: +``` +(in a hurry tone) (shouting) (screaming) (whispering) (soft tone) +``` -- [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2) -- [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2) -- [GPT VITS](https://github.com/innnky/gpt-vits) -- [MQTTS](https://github.com/b04901014/MQTTS) -- [GPT Fast](https://github.com/pytorch-labs/gpt-fast) -- [Transformers](https://github.com/huggingface/transformers) -- [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS) +- **특수 음향 효과**: +``` +(laughing) (chuckling) (sobbing) (crying loudly) (sighing) (panting) +(groaning) (crowd laughing) (background laughter) (audience laughing) +``` + +Ha,ha,ha를 사용하여 제어할 수도 있으며, 여러분 스스로 탐구할 수 있는 다른 많은 사용법이 있습니다. + +### **뛰어난 TTS 품질** + +Seed TTS 평가 지표를 사용하여 모델 성능을 평가한 결과, OpenAudio S1은 영어 텍스트에서 **0.008 WER**과 **0.004 CER**을 달성하여 이전 모델보다 현저히 향상되었습니다. (영어, 자동 평가, OpenAI gpt-4o-전사 기반, 화자 거리는 Revai/pyannote-wespeaker-voxceleb-resnet34-LM 사용) + +| 모델 | 단어 오류율 (WER) | 문자 오류율 (CER) | 화자 거리 | +|-------|----------------------|---------------------------|------------------| +| **S1** | **0.008** | **0.004** | **0.332** | +| **S1-mini** | **0.011** | **0.005** | **0.380** | + +### **두 가지 모델 유형** + +| 모델 | 크기 | 가용성 | 특징 | +|-------|------|--------------|----------| +| **S1** | 40억 매개변수 | [fish.audio](https://fish.audio)에서 이용 가능 | 모든 기능을 갖춘 플래그십 모델 | +| **S1-mini** | 5억 매개변수 | huggingface [hf space](https://huggingface.co/spaces/fishaudio/openaudio-s1-mini)에서 이용 가능 | 핵심 기능을 갖춘 경량화 버전 | + +S1과 S1-mini 모두 온라인 인간 피드백 강화 학습(RLHF)이 통합되어 있습니다. + +## **기능** + +1. **제로샷 및 퓨샷 TTS:** 10~30초의 음성 샘플을 입력하여 고품질 TTS 출력을 생성합니다. 
**자세한 가이드라인은 [음성 복제 모범 사례](https://docs.fish.audio/text-to-speech/voice-clone-best-practices)를 참조하세요.** + +2. **다국어 및 교차 언어 지원:** 다국어 텍스트를 입력 상자에 복사하여 붙여넣기만 하면 됩니다. 언어에 대해 걱정할 필요가 없습니다. 현재 영어, 일본어, 한국어, 중국어, 프랑스어, 독일어, 아랍어, 스페인어를 지원합니다. + +3. **음소 의존성 없음:** 이 모델은 강력한 일반화 능력을 가지고 있으며 TTS에 음소에 의존하지 않습니다. 어떤 언어 스크립트의 텍스트도 처리할 수 있습니다. + +4. **높은 정확도:** Seed-TTS Eval에서 약 0.4%의 낮은 문자 오류율(CER)과 약 0.8%의 단어 오류율(WER)을 달성합니다. + +5. **빠른 속도:** fish-tech 가속을 통해 Nvidia RTX 4060 노트북에서 실시간 계수 약 1:5, Nvidia RTX 4090에서 약 1:15를 달성합니다. + +6. **WebUI 추론:** Chrome, Firefox, Edge 및 기타 브라우저와 호환되는 사용하기 쉬운 Gradio 기반 웹 UI를 제공합니다. + +7. **GUI 추론:** API 서버와 원활하게 작동하는 PyQt6 그래픽 인터페이스를 제공합니다. Linux, Windows, macOS를 지원합니다. [GUI 보기](https://github.com/AnyaCoder/fish-speech-gui). + +8. **배포 친화적:** Linux, Windows, MacOS의 네이티브 지원으로 추론 서버를 쉽게 설정하여 속도 손실을 최소화합니다. + +## **면책 조항** + +코드베이스의 불법적인 사용에 대해서는 일체 책임을 지지 않습니다. 귀하 지역의 DMCA 및 기타 관련 법률을 참고하시기 바랍니다. + +## **미디어 및 데모** + +#### 🚧 곧 출시 예정 +동영상 데모와 튜토리얼이 현재 개발 중입니다. + +## **문서** + +### 빠른 시작 +- [환경 구축](install.md) - 개발 환경 설정 +- [추론 가이드](inference.md) - 모델 실행 및 음성 생성 + +## **커뮤니티 및 지원** + +- **Discord:** [Discord 커뮤니티](https://discord.gg/Es5qTB9BcN)에 참여하세요 +- **웹사이트:** 최신 업데이트는 [OpenAudio.com](https://openaudio.com)을 방문하세요 +- **온라인 체험:** [Fish Audio Playground](https://fish.audio) diff --git a/docs/ko/inference.md b/docs/ko/inference.md index b32eaad..268f107 100644 --- a/docs/ko/inference.md +++ b/docs/ko/inference.md @@ -34,9 +34,7 @@ python fish_speech/models/text2semantic/inference.py \ --text "변환하고 싶은 텍스트" \ --prompt-text "참조 텍스트" \ --prompt-tokens "fake.npy" \ - --checkpoint-path "checkpoints/openaudio-s1-mini" \ - --num-samples 2 \ - --compile # 더 빠른 속도를 원한다면 + --compile ``` 이 명령은 작업 디렉토리에 `codes_N` 파일을 생성합니다. 여기서 N은 0부터 시작하는 정수입니다. @@ -50,15 +48,12 @@ python fish_speech/models/text2semantic/inference.py \ ### 3. 의미 토큰에서 음성 생성: -#### VQGAN 디코더 - !!! 
warning "향후 경고" 원래 경로(tools/vqgan/inference.py)에서 액세스 가능한 인터페이스를 유지하고 있지만, 이 인터페이스는 향후 릴리스에서 제거될 수 있으므로 가능한 한 빨리 코드를 변경해 주세요. ```bash python fish_speech/models/dac/inference.py \ - -i "codes_0.npy" \ - --checkpoint-path "checkpoints/openaudiio-s1-mini/codec.pth" + -i "codes_0.npy" ``` ## HTTP API 추론 @@ -103,5 +98,3 @@ python -m tools.run_webui !!! note `GRADIO_SHARE`, `GRADIO_SERVER_PORT`, `GRADIO_SERVER_NAME`과 같은 Gradio 환경 변수를 사용하여 WebUI를 구성할 수 있습니다. - -즐기세요! diff --git a/docs/ko/install.md b/docs/ko/install.md new file mode 100644 index 0000000..6cddc5f --- /dev/null +++ b/docs/ko/install.md @@ -0,0 +1,30 @@ +## 시스템 요구사항 + +- GPU 메모리: 12GB (추론) +- 시스템: Linux, WSL + +## 설정 + +먼저 오디오 처리에 사용되는 pyaudio와 sox를 설치해야 합니다. + +``` bash +apt install portaudio19-dev libsox-dev ffmpeg +``` + +### Conda + +```bash +conda create -n fish-speech python=3.12 +conda activate fish-speech + +pip install -e . +``` + +### UV + +```bash +uv sync --python 3.12 +``` + +!!! warning + `compile` 옵션은 Windows와 macOS에서 지원되지 않습니다. compile로 실행하려면 triton을 직접 설치해야 합니다. diff --git a/docs/pt/index.md b/docs/pt/index.md index 5477c4d..2f611ba 100644 --- a/docs/pt/index.md +++ b/docs/pt/index.md @@ -1,4 +1,14 @@ -# Introdução +# OpenAudio (anteriormente Fish-Speech) + +
+ +
+ +OpenAudio + +
+ +Série Avançada de Modelos Text-to-Speech -!!! warning - Não assumimos nenhuma responsabilidade pelo uso ilegal da base de código. Consulte as leis locais sobre DMCA (Digital Millennium Copyright Act) e outras leis relevantes em sua área.
- Esta base de código é lançada sob a licença Apache 2.0 e todos os modelos são lançados sob a licença CC-BY-NC-SA-4.0. +Experimente agora: Fish Audio Playground | Saiba mais: Site OpenAudio -## Requisitos +
-- Memória GPU: 12GB (Inferência) -- Sistema: Linux, Windows +--- -## Configuração +!!! warning "Aviso Legal" + Não assumimos nenhuma responsabilidade pelo uso ilegal da base de código. Consulte as leis locais sobre DMCA (Digital Millennium Copyright Act) e outras leis relevantes em sua área. + + **Licença:** Esta base de código é lançada sob a licença Apache 2.0 e todos os modelos são lançados sob a licença CC-BY-NC-SA-4.0. -Primeiro, precisamos criar um ambiente conda para instalar os pacotes. +## **Introdução** -```bash +Estamos empolgados em anunciar que mudamos nossa marca para **OpenAudio** - introduzindo uma nova série de modelos avançados de Text-to-Speech que se baseia na fundação do Fish-Speech com melhorias significativas e novas capacidades. -conda create -n fish-speech python=3.12 -conda activate fish-speech +**Openaudio-S1-mini**: [Vídeo](A ser carregado); [Hugging Face](https://huggingface.co/fishaudio/openaudio-s1-mini); -pip install sudo apt-get install portaudio19-dev # Para pyaudio -pip install -e . # Isso baixará todos os pacotes restantes. +**Fish-Speech v1.5**: [Vídeo](https://www.bilibili.com/video/BV1EKiDYBE4o/); [Hugging Face](https://huggingface.co/fishaudio/fish-speech-1.5); -apt install libsox-dev ffmpeg # Se necessário. +## **Destaques** ✨ + +### **Controle Emocional** +O OpenAudio S1 **suporta uma variedade de marcadores emocionais, de tom e especiais** para aprimorar a síntese de fala: + +- **Emoções básicas**: +``` +(angry) (sad) (excited) (surprised) (satisfied) (delighted) +(scared) (worried) (upset) (nervous) (frustrated) (depressed) +(empathetic) (embarrassed) (disgusted) (moved) (proud) (relaxed) +(grateful) (confident) (interested) (curious) (confused) (joyful) ``` -!!! warning - A opção `compile` não é suportada no Windows e macOS, se você quiser executar com compile, precisa instalar o trition por conta própria. 
+- **Emoções avançadas**: +``` +(disdainful) (unhappy) (anxious) (hysterical) (indifferent) +(impatient) (guilty) (scornful) (panicked) (furious) (reluctant) +(keen) (disapproving) (negative) (denying) (astonished) (serious) +(sarcastic) (conciliative) (comforting) (sincere) (sneering) +(hesitating) (yielding) (painful) (awkward) (amused) +``` -## Agradecimentos +- **Marcadores de tom**: +``` +(in a hurry tone) (shouting) (screaming) (whispering) (soft tone) +``` -- [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2) -- [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2) -- [GPT VITS](https://github.com/innnky/gpt-vits) -- [MQTTS](https://github.com/b04901014/MQTTS) -- [GPT Fast](https://github.com/pytorch-labs/gpt-fast) -- [Transformers](https://github.com/huggingface/transformers) -- [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS) +- **Efeitos sonoros especiais**: +``` +(laughing) (chuckling) (sobbing) (crying loudly) (sighing) (panting) +(groaning) (crowd laughing) (background laughter) (audience laughing) +``` + +Você também pode usar Ha,ha,ha para controlar, há muitos outros casos esperando para serem explorados por você mesmo. + +### **Qualidade TTS Excelente** + +Utilizamos as métricas Seed TTS Eval para avaliar o desempenho do modelo, e os resultados mostram que o OpenAudio S1 alcança **0.008 WER** e **0.004 CER** em texto inglês, que é significativamente melhor que modelos anteriores. 
(Inglês, avaliação automática, baseada na transcrição OpenAI gpt-4o, distância do falante usando Revai/pyannote-wespeaker-voxceleb-resnet34-LM) + +| Modelo | Taxa de Erro de Palavras (WER) | Taxa de Erro de Caracteres (CER) | Distância do Falante | +|-------|----------------------|---------------------------|------------------| +| **S1** | **0.008** | **0.004** | **0.332** | +| **S1-mini** | **0.011** | **0.005** | **0.380** | + +### **Dois Tipos de Modelos** + +| Modelo | Tamanho | Disponibilidade | Características | +|-------|------|--------------|----------| +| **S1** | 4B parâmetros | Disponível em [fish.audio](https://fish.audio) | Modelo principal com todas as funcionalidades | +| **S1-mini** | 0.5B parâmetros | Disponível no huggingface [hf space](https://huggingface.co/spaces/fishaudio/openaudio-s1-mini) | Versão destilada com capacidades principais | + +Tanto o S1 quanto o S1-mini incorporam Aprendizado por Reforço Online com Feedback Humano (RLHF). + +## **Características** + +1. **TTS Zero-shot e Few-shot:** Insira uma amostra vocal de 10 a 30 segundos para gerar saída TTS de alta qualidade. **Para diretrizes detalhadas, veja [Melhores Práticas de Clonagem de Voz](https://docs.fish.audio/text-to-speech/voice-clone-best-practices).** + +2. **Suporte Multilíngue e Cross-lingual:** Simplesmente copie e cole texto multilíngue na caixa de entrada—não precisa se preocupar com o idioma. Atualmente suporta inglês, japonês, coreano, chinês, francês, alemão, árabe e espanhol. + +3. **Sem Dependência de Fonemas:** O modelo tem fortes capacidades de generalização e não depende de fonemas para TTS. Pode lidar com texto em qualquer script de idioma. + +4. **Altamente Preciso:** Alcança uma baixa Taxa de Erro de Caracteres (CER) de cerca de 0,4% e Taxa de Erro de Palavras (WER) de cerca de 0,8% para Seed-TTS Eval. + +5. **Rápido:** Com aceleração fish-tech, o fator de tempo real é aproximadamente 1:5 em um laptop Nvidia RTX 4060 e 1:15 em um Nvidia RTX 4090. + +6. 
**Inferência WebUI:** Apresenta uma interface web fácil de usar baseada em Gradio, compatível com Chrome, Firefox, Edge e outros navegadores. + +7. **Inferência GUI:** Oferece uma interface gráfica PyQt6 que funciona perfeitamente com o servidor API. Suporta Linux, Windows e macOS. [Ver GUI](https://github.com/AnyaCoder/fish-speech-gui). + +8. **Amigável para Deploy:** Configure facilmente um servidor de inferência com suporte nativo para Linux, Windows e MacOS, minimizando a perda de velocidade. + +## **Isenção de Responsabilidade** + +Não assumimos nenhuma responsabilidade pelo uso ilegal da base de código. Consulte suas leis locais sobre DMCA e outras leis relacionadas. + +## **Mídia e Demos** + +#### 🚧 Em Breve +Demonstrações em vídeo e tutoriais estão atualmente em desenvolvimento. + +## **Documentação** + +### Início Rápido +- [Configurar Ambiente](install.md) - Configure seu ambiente de desenvolvimento +- [Guia de Inferência](inference.md) - Execute o modelo e gere fala + +## **Comunidade e Suporte** + +- **Discord:** Junte-se à nossa [comunidade Discord](https://discord.gg/Es5qTB9BcN) +- **Site:** Visite [OpenAudio.com](https://openaudio.com) para as últimas atualizações +- **Experimente Online:** [Fish Audio Playground](https://fish.audio) diff --git a/docs/pt/inference.md b/docs/pt/inference.md index d8b9b7f..10b129d 100644 --- a/docs/pt/inference.md +++ b/docs/pt/inference.md @@ -34,9 +34,7 @@ python fish_speech/models/text2semantic/inference.py \ --text "O texto que você quer converter" \ --prompt-text "Seu texto de referência" \ --prompt-tokens "fake.npy" \ - --checkpoint-path "checkpoints/openaudio-s1-mini" \ - --num-samples 2 \ - --compile # se você quiser uma velocidade mais rápida + --compile ``` Este comando criará um arquivo `codes_N` no diretório de trabalho, onde N é um inteiro começando de 0. @@ -50,15 +48,12 @@ Este comando criará um arquivo `codes_N` no diretório de trabalho, onde N é u ### 3. 
Gerar vocais a partir de tokens semânticos: -#### Decodificador VQGAN - !!! warning "Aviso Futuro" Mantivemos a interface acessível do caminho original (tools/vqgan/inference.py), mas esta interface pode ser removida em versões subsequentes, então por favor altere seu código o mais breve possível. ```bash python fish_speech/models/dac/inference.py \ - -i "codes_0.npy" \ - --checkpoint-path "checkpoints/openaudiio-s1-mini/codec.pth" + -i "codes_0.npy" ``` ## Inferência com API HTTP @@ -103,5 +98,3 @@ python -m tools.run_webui !!! note Você pode usar variáveis de ambiente do Gradio, como `GRADIO_SHARE`, `GRADIO_SERVER_PORT`, `GRADIO_SERVER_NAME` para configurar o WebUI. - -Divirta-se! diff --git a/docs/pt/install.md b/docs/pt/install.md new file mode 100644 index 0000000..005237a --- /dev/null +++ b/docs/pt/install.md @@ -0,0 +1,30 @@ +## Requisitos + +- Memória GPU: 12GB (Inferência) +- Sistema: Linux, WSL + +## Configuração + +Primeiro você precisa instalar pyaudio e sox, que são usados para processamento de áudio. + +``` bash +apt install portaudio19-dev libsox-dev ffmpeg +``` + +### Conda + +```bash +conda create -n fish-speech python=3.12 +conda activate fish-speech + +pip install -e . +``` + +### UV + +```bash +uv sync --python 3.12 +``` + +!!! warning + A opção `compile` não é suportada no Windows e macOS, se você quiser executar com compile, precisa instalar o triton por conta própria. diff --git a/docs/zh/index.md b/docs/zh/index.md index 64e373b..bde91b5 100644 --- a/docs/zh/index.md +++ b/docs/zh/index.md @@ -1,4 +1,14 @@ -# 简介 +# OpenAudio (原 Fish-Speech) + +
+ +
+ +OpenAudio + +
+ +先进的文字转语音模型系列 -!!! warning - 我们不对代码库的任何非法使用承担责任。请参考您所在地区有关 DMCA(数字千年版权法)和其他相关法律的规定。
- 此代码库在 Apache 2.0 许可证下发布,所有模型在 CC-BY-NC-SA-4.0 许可证下发布。 +立即试用: Fish Audio Playground | 了解更多: OpenAudio 网站 -## 系统要求 +
-- GPU 内存:12GB(推理) -- 系统:Linux、Windows +--- -## 安装 +!!! warning "法律声明" + 我们不对代码库的任何非法使用承担责任。请参考您所在地区有关 DMCA(数字千年版权法)和其他相关法律的规定。 + + **许可证:** 此代码库在 Apache 2.0 许可证下发布,所有模型在 CC-BY-NC-SA-4.0 许可证下发布。 -首先,我们需要创建一个 conda 环境来安装包。 +## **介绍** -```bash +我们很高兴地宣布,我们已经更名为 **OpenAudio** - 推出全新的先进文字转语音模型系列,在 Fish-Speech 的基础上进行了重大改进并增加了新功能。 -conda create -n fish-speech python=3.12 -conda activate fish-speech +**Openaudio-S1-mini**: [视频](即将上传); [Hugging Face](https://huggingface.co/fishaudio/openaudio-s1-mini); -pip install sudo apt-get install portaudio19-dev # 用于 pyaudio -pip install -e . # 这将下载所有其余的包。 +**Fish-Speech v1.5**: [视频](https://www.bilibili.com/video/BV1EKiDYBE4o/); [Hugging Face](https://huggingface.co/fishaudio/fish-speech-1.5); -apt install libsox-dev ffmpeg # 如果需要的话。 +## **亮点** ✨ + +### **情感控制** +OpenAudio S1 **支持多种情感、语调和特殊标记**来增强语音合成效果: + +- **基础情感**: +``` +(angry) (sad) (excited) (surprised) (satisfied) (delighted) +(scared) (worried) (upset) (nervous) (frustrated) (depressed) +(empathetic) (embarrassed) (disgusted) (moved) (proud) (relaxed) +(grateful) (confident) (interested) (curious) (confused) (joyful) ``` -!!! 
warning - `compile` 选项在 Windows 和 macOS 上不受支持,如果您想使用 compile 运行,需要自己安装 trition。 +- **高级情感**: +``` +(disdainful) (unhappy) (anxious) (hysterical) (indifferent) +(impatient) (guilty) (scornful) (panicked) (furious) (reluctant) +(keen) (disapproving) (negative) (denying) (astonished) (serious) +(sarcastic) (conciliative) (comforting) (sincere) (sneering) +(hesitating) (yielding) (painful) (awkward) (amused) +``` -## 致谢 +- **语调标记**: +``` +(in a hurry tone) (shouting) (screaming) (whispering) (soft tone) +``` -- [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2) -- [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2) -- [GPT VITS](https://github.com/innnky/gpt-vits) -- [MQTTS](https://github.com/b04901014/MQTTS) -- [GPT Fast](https://github.com/pytorch-labs/gpt-fast) -- [Transformers](https://github.com/huggingface/transformers) -- [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS) +- **特殊音效**: +``` +(laughing) (chuckling) (sobbing) (crying loudly) (sighing) (panting) +(groaning) (crowd laughing) (background laughter) (audience laughing) +``` + +您还可以使用 Ha,ha,ha 来控制,还有许多其他用法等待您自己探索。 + +### **卓越的 TTS 质量** + +我们使用 Seed TTS 评估指标来评估模型性能,结果显示 OpenAudio S1 在英文文本上达到了 **0.008 WER** 和 **0.004 CER**,明显优于以前的模型。(英语,自动评估,基于 OpenAI gpt-4o-转录,说话人距离使用 Revai/pyannote-wespeaker-voxceleb-resnet34-LM) + +| 模型 | 词错误率 (WER) | 字符错误率 (CER) | 说话人距离 | +|-------|----------------------|---------------------------|------------------| +| **S1** | **0.008** | **0.004** | **0.332** | +| **S1-mini** | **0.011** | **0.005** | **0.380** | + +### **两种模型类型** + +| 模型 | 规模 | 可用性 | 特性 | +|-------|------|--------------|----------| +| **S1** | 40亿参数 | 在 [fish.audio](https://fish.audio) 上可用 | 功能齐全的旗舰模型 | +| **S1-mini** | 5亿参数 | 在 huggingface [hf space](https://huggingface.co/spaces/fishaudio/openaudio-s1-mini) 上可用 | 具有核心功能的蒸馏版本 | + +S1 和 S1-mini 都集成了在线人类反馈强化学习 (RLHF)。 + +## **功能特性** + +1. 
**零样本和少样本 TTS:** 输入 10 到 30 秒的语音样本即可生成高质量的 TTS 输出。**详细指南请参见 [语音克隆最佳实践](https://docs.fish.audio/text-to-speech/voice-clone-best-practices)。** + +2. **多语言和跨语言支持:** 只需复制粘贴多语言文本到输入框即可——无需担心语言问题。目前支持英语、日语、韩语、中文、法语、德语、阿拉伯语和西班牙语。 + +3. **无音素依赖:** 该模型具有强大的泛化能力,不依赖音素进行 TTS。它可以处理任何语言文字的文本。 + +4. **高度准确:** 在 Seed-TTS Eval 中实现低字符错误率 (CER) 约 0.4% 和词错误率 (WER) 约 0.8%。 + +5. **快速:** 通过 fish-tech 加速,在 Nvidia RTX 4060 笔记本电脑上实时因子约为 1:5,在 Nvidia RTX 4090 上约为 1:15。 + +6. **WebUI 推理:** 具有易于使用的基于 Gradio 的网络界面,兼容 Chrome、Firefox、Edge 和其他浏览器。 + +7. **GUI 推理:** 提供与 API 服务器无缝配合的 PyQt6 图形界面。支持 Linux、Windows 和 macOS。[查看 GUI](https://github.com/AnyaCoder/fish-speech-gui)。 + +8. **部署友好:** 轻松设置推理服务器,原生支持 Linux、Windows 和 MacOS,最小化速度损失。 + +## **免责声明** + +我们不对代码库的任何非法使用承担责任。请参考您当地关于 DMCA 和其他相关法律的规定。 + +## **媒体和演示** + +#### 🚧 即将推出 +视频演示和教程正在开发中。 + +## **文档** + +### 快速开始 +- [构建环境](install.md) - 设置您的开发环境 +- [推理指南](inference.md) - 运行模型并生成语音 + +## **社区和支持** + +- **Discord:** 加入我们的 [Discord 社区](https://discord.gg/Es5qTB9BcN) +- **网站:** 访问 [OpenAudio.com](https://openaudio.com) 获取最新更新 +- **在线试用:** [Fish Audio Playground](https://fish.audio) diff --git a/docs/zh/inference.md b/docs/zh/inference.md index 50ac2a7..de821ad 100644 --- a/docs/zh/inference.md +++ b/docs/zh/inference.md @@ -1,6 +1,6 @@ # 推理 -由于声码器模型已更改,您需要比以前更多的显存,建议使用12GB显存以便流畅推理。 +由于声码器模型已更改,您需要比以前更多的 VRAM,建议使用 12GB 进行流畅推理。 我们支持命令行、HTTP API 和 WebUI 进行推理,您可以选择任何您喜欢的方法。 @@ -17,7 +17,7 @@ huggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/ope !!! note 如果您计划让模型随机选择音色,可以跳过此步骤。 -### 1. 从参考音频获取VQ tokens +### 1. 从参考音频获取 VQ 令牌 ```bash python fish_speech/models/dac/inference.py \ @@ -27,38 +27,33 @@ python fish_speech/models/dac/inference.py \ 您应该会得到一个 `fake.npy` 和一个 `fake.wav`。 -### 2. 从文本生成语义tokens: +### 2. 
从文本生成语义令牌: ```bash python fish_speech/models/text2semantic/inference.py \ --text "您想要转换的文本" \ --prompt-text "您的参考文本" \ --prompt-tokens "fake.npy" \ - --checkpoint-path "checkpoints/openaudio-s1-mini" \ - --num-samples 2 \ - --compile # 如果您想要更快的速度 + --compile ``` -此命令将在工作目录中创建一个 `codes_N` 文件,其中N是从0开始的整数。 +此命令将在工作目录中创建一个 `codes_N` 文件,其中 N 是从 0 开始的整数。 !!! note - 您可能想要使用 `--compile` 来融合CUDA内核以获得更快的推理速度(约30 tokens/秒 -> 约500 tokens/秒)。 - 相应地,如果您不打算使用加速,可以删除 `--compile` 参数的注释。 + 您可能希望使用 `--compile` 来融合 CUDA 内核以实现更快的推理(~30 令牌/秒 -> ~500 令牌/秒)。 + 相应地,如果您不计划使用加速,可以注释掉 `--compile` 参数。 !!! info - 对于不支持bf16的GPU,您可能需要使用 `--half` 参数。 + 对于不支持 bf16 的 GPU,您可能需要使用 `--half` 参数。 -### 3. 从语义tokens生成人声: - -#### VQGAN 解码器 +### 3. 从语义令牌生成声音: !!! warning "未来警告" - 我们保留了从原始路径(tools/vqgan/inference.py)访问的接口,但此接口可能在后续版本中被移除,请尽快更改您的代码。 + 我们保留了从原始路径(tools/vqgan/inference.py)访问接口的能力,但此接口可能在后续版本中被删除,因此请尽快更改您的代码。 ```bash python fish_speech/models/dac/inference.py \ -i "codes_0.npy" \ - --checkpoint-path "checkpoints/openaudiio-s1-mini/codec.pth" ``` ## HTTP API 推理 diff --git a/docs/zh/install.md b/docs/zh/install.md new file mode 100644 index 0000000..be82665 --- /dev/null +++ b/docs/zh/install.md @@ -0,0 +1,30 @@ +## 系统要求 + +- GPU 内存:12GB(推理) +- 系统:Linux、WSL + +## 安装 + +首先需要安装 pyaudio 和 sox,用于音频处理。 + +``` bash +apt install portaudio19-dev libsox-dev ffmpeg +``` + +### Conda + +```bash +conda create -n fish-speech python=3.12 +conda activate fish-speech + +pip install -e . +``` + +### UV + +```bash +uv sync --python 3.12 +``` + +!!! warning + `compile` 选项在 Windows 和 macOS 上不受支持,如果您想使用 compile 运行,需要自己安装 triton。 diff --git a/mkdocs.yml b/mkdocs.yml index 214c4e3..f2f62f9 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,4 +1,4 @@ -site_name: Fish Speech +site_name: OpenAudio site_description: Targeting SOTA TTS solutions. 
site_url: https://speech.fish.audio @@ -12,7 +12,7 @@ copyright: Copyright © 2023-2025 by Fish Audio theme: name: material - favicon: assets/figs/logo-circle.png + favicon: assets/openaudio.png language: en features: - content.action.edit @@ -25,8 +25,7 @@ theme: - search.highlight - search.share - content.code.copy - icon: - logo: fontawesome/solid/fish + logo: assets/openaudio.png palette: # Palette toggle for automatic mode @@ -56,7 +55,8 @@ theme: code: Roboto Mono nav: - - Installation: en/index.md + - Introduction: en/index.md + - Installation: en/install.md - Inference: en/inference.md # Plugins @@ -80,25 +80,29 @@ plugins: name: 简体中文 build: true nav: - - 安装: zh/index.md + - 介绍: zh/index.md + - 安装: zh/install.md - 推理: zh/inference.md - locale: ja name: 日本語 build: true nav: - - インストール: ja/index.md + - はじめに: ja/index.md + - インストール: ja/install.md - 推論: ja/inference.md - locale: pt name: Português (Brasil) build: true nav: - - Instalação: pt/index.md + - Introdução: pt/index.md + - Instalação: pt/install.md - Inferência: pt/inference.md - locale: ko name: 한국어 build: true nav: - - 설치: ko/index.md + - 소개: ko/index.md + - 설치: ko/install.md - 추론: ko/inference.md markdown_extensions: