Fix timbre problem (#1009)

* [feature]add dataset class * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [dev]combine agent and tts infer * [feature]:update inference * [feature]:update uv.lock * [Merge]:merge upstream/main * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [fix]:remove unused files * [fix]:remove unused files * [fix]:remove unused files * [fix]:fix infer bugs * [docs]:update introduction and optimize front appearance * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [docs]:update README for OpenAudio-S1 * [docs]:update docs * [docs]:Update video * [docs]:fix video * [docs]:fix video * [fix]:fix timbre problem * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-06-05 19:21:06 +08:00 · 2025-06-05 19:21:06 +08:00 · 9021a57dce
commit 9021a57dce
parent 89ea53dedc
4 changed files with 128 additions and 27 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,33 +1,113 @@
+# =============================================================================
+# Fish Speech - .gitignore
+# =============================================================================
+
+# Operating System Files
+# -----------------------
 .DS_Store
-.pgx.*
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
+
+# IDEs and Editors
+# ----------------
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# Python
+# ------
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# Virtual Environments
+# --------------------
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+/fishenv/
+
+# Project Dependencies
+# --------------------
 .pdm-python
 /fish_speech.egg-info
-__pycache__
-/results
-/data
-/*.test.sh
+
+# Data and Model Files
+# --------------------
+/data/
+/results/
+/checkpoints/
+/references/
+/demo-audios/
+/example/
+filelists/
 *.filelist
-filelists
+
+# Audio Files
+# -----------
+*.wav
+*.mp3
+*.flac
+*.ogg
+*.m4a
+
+# Data Files
+# ----------
+*.npy
+*.npz
+*.pkl
+*.pickle
+*.lab
 /fish_speech/text/cmudict_cache.pickle
-/checkpoints
-/.vscode
-/data_server/target
-/*.npy
-/*.wav
-/*.mp3
-/*.lab
-/results
-/data
-/.idea
+
+# Cache and Temporary Files
+# --------------------------
+/.cache/
+/.gradio/
+/.locale/
+.pgx.*
+*log
+*.log
+
+# External Tools
+# --------------
 ffmpeg.exe
 ffprobe.exe
+/faster_whisper/
+
+# Server Related
+# --------------
+/data_server/target/
+
+# Test Files
+# ----------
+/*.test.sh
 asr-label*
-/.cache
-/fishenv
-/.locale
-/demo-audios
-/references
-/example
-/faster_whisper
-/.gradio
-*log
--- a/1
+++ b/1
@ -0,0 +1 @@
+/mnt/users/whaledolphin/data
--- a/fish_speech/models/text2semantic/inference.py
+++ b/fish_speech/models/text2semantic/inference.py
@ -339,7 +339,7 @@ def generate_long(
    temperature: float = 0.8,
    compile: bool = False,
    iterative_prompt: bool = True,
-    chunk_length: int = 150,
+    chunk_length: int = 512,
    prompt_text: Optional[str | list[str]] = None,
    prompt_tokens: Optional[torch.Tensor | list[torch.Tensor]] = None,
 ):
@ -365,6 +365,24 @@ def generate_long(
    texts = split_text(text, chunk_length) if iterative_prompt else [text]
    max_length = model.config.max_seq_len

+    # if use_prompt:
+    #     base_content_sequence.append(
+    #         [
+    #             TextPart(text=prompt_text[0]),
+    #             VQPart(codes=prompt_tokens[0]),
+    #         ],
+    #         add_end=True,
+    #     )
+
+    # for text in texts:
+    #     content_sequence = ContentSequence(modality=None)
+    #     base_content_sequence.append(
+    #         [
+    #             TextPart(text=text),
+    #         ],
+    #         add_end=True,
+    #     )
+
    if use_prompt:
        for t, c in zip(prompt_text, prompt_tokens):
            base_content_sequence.append(
@ -385,7 +403,7 @@ def generate_long(

    encoded = []
    for text in texts:
-        content_sequence = ContentSequence(modality=None)
+        content_sequence = ContentSequence(modality="text")
        content_sequence.append(TextPart(text=text))
        encoded.append(
            content_sequence.encode_for_inference(
--- a/uv.lock
+++ b/uv.lock
@ -942,6 +942,7 @@ dependencies = [
    { name = "cachetools" },
    { name = "datasets" },
    { name = "descript-audio-codec" },
+    { name = "descript-audiotools" },
    { name = "einops" },
    { name = "einx", extra = ["torch"] },
    { name = "faster-whisper" },
@ -985,6 +986,7 @@ requires-dist = [
    { name = "cachetools" },
    { name = "datasets", specifier = "==2.18.0" },
    { name = "descript-audio-codec" },
+    { name = "descript-audiotools" },
    { name = "einops", specifier = ">=0.7.0" },
    { name = "einx", extras = ["torch"], specifier = "==0.2.2" },
    { name = "faster-whisper" },