Fix timbre problem (#1009)

* [feature]add dataset class

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [dev]combine agent and tts infer

* [feature]:update inference

* [feature]:update uv.lock

* [Merge]:merge upstream/main

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [fix]:remove unused files

* [fix]:remove unused files

* [fix]:remove unused files

* [fix]:fix infer bugs

* [docs]:update introduction and optimize front appearance

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [docs]:update README for OpenAudio-S1

* [docs]:update docs

* [docs]:Update video

* [docs]:fix video

* [docs]:fix video

* [fix]:fix timbre problem

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
Whale and Dolphin 2025-06-05 19:21:06 +08:00 committed by GitHub
parent 89ea53dedc
commit 9021a57dce
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 128 additions and 27 deletions

130
.gitignore vendored
View File

@ -1,33 +1,113 @@
# =============================================================================
# Fish Speech - .gitignore
# =============================================================================
# Operating System Files
# -----------------------
.DS_Store
.pgx.*
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db
# IDEs and Editors
# ----------------
.vscode/
.idea/
*.swp
*.swo
*~
# Python
# ------
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# Virtual Environments
# --------------------
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
/fishenv/
# Project Dependencies
# --------------------
.pdm-python
/fish_speech.egg-info
__pycache__
/results
/data
/*.test.sh
# Data and Model Files
# --------------------
/data/
/results/
/checkpoints/
/references/
/demo-audios/
/example/
filelists/
*.filelist
filelists
# Audio Files
# -----------
*.wav
*.mp3
*.flac
*.ogg
*.m4a
# Data Files
# ----------
*.npy
*.npz
*.pkl
*.pickle
*.lab
/fish_speech/text/cmudict_cache.pickle
/checkpoints
/.vscode
/data_server/target
/*.npy
/*.wav
/*.mp3
/*.lab
/results
/data
/.idea
# Cache and Temporary Files
# --------------------------
/.cache/
/.gradio/
/.locale/
.pgx.*
*log
*.log
# External Tools
# --------------
ffmpeg.exe
ffprobe.exe
/faster_whisper/
# Server Related
# --------------
/data_server/target/
# Test Files
# ----------
/*.test.sh
asr-label*
/.cache
/fishenv
/.locale
/demo-audios
/references
/example
/faster_whisper
/.gradio
*log

1
data Symbolic link
View File

@ -0,0 +1 @@
/mnt/users/whaledolphin/data

View File

@ -339,7 +339,7 @@ def generate_long(
temperature: float = 0.8,
compile: bool = False,
iterative_prompt: bool = True,
chunk_length: int = 150,
chunk_length: int = 512,
prompt_text: Optional[str | list[str]] = None,
prompt_tokens: Optional[torch.Tensor | list[torch.Tensor]] = None,
):
@ -365,6 +365,24 @@ def generate_long(
texts = split_text(text, chunk_length) if iterative_prompt else [text]
max_length = model.config.max_seq_len
# if use_prompt:
# base_content_sequence.append(
# [
# TextPart(text=prompt_text[0]),
# VQPart(codes=prompt_tokens[0]),
# ],
# add_end=True,
# )
# for text in texts:
# content_sequence = ContentSequence(modality=None)
# base_content_sequence.append(
# [
# TextPart(text=text),
# ],
# add_end=True,
# )
if use_prompt:
for t, c in zip(prompt_text, prompt_tokens):
base_content_sequence.append(
@ -385,7 +403,7 @@ def generate_long(
encoded = []
for text in texts:
content_sequence = ContentSequence(modality=None)
content_sequence = ContentSequence(modality="text")
content_sequence.append(TextPart(text=text))
encoded.append(
content_sequence.encode_for_inference(

2
uv.lock generated
View File

@ -942,6 +942,7 @@ dependencies = [
{ name = "cachetools" },
{ name = "datasets" },
{ name = "descript-audio-codec" },
{ name = "descript-audiotools" },
{ name = "einops" },
{ name = "einx", extra = ["torch"] },
{ name = "faster-whisper" },
@ -985,6 +986,7 @@ requires-dist = [
{ name = "cachetools" },
{ name = "datasets", specifier = "==2.18.0" },
{ name = "descript-audio-codec" },
{ name = "descript-audiotools" },
{ name = "einops", specifier = ">=0.7.0" },
{ name = "einx", extras = ["torch"], specifier = "==0.2.2" },
{ name = "faster-whisper" },