diff --git a/Dockerfile b/Dockerfile index b7a31e3c..4e5a9dbb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,7 +17,7 @@ RUN apt-get update && \ # Copy only requirements.txt initially to leverage Docker cache WORKDIR /workspace COPY requirements.txt /workspace/ -RUN pip install --no-cache-dir -r requirements.txt +RUN CMAKE_POLICY_VERSION_MINIMUM=3.5 pip install --no-cache-dir -r requirements.txt # Define a build-time argument for image type ARG IMAGE_TYPE=full @@ -39,4 +39,4 @@ COPY . /workspace EXPOSE 9871 9872 9873 9874 9880 -CMD ["python", "api.py"] \ No newline at end of file +CMD ["python", "api.py"] diff --git a/api.py b/api.py index f34d86af..fe404c67 100644 --- a/api.py +++ b/api.py @@ -173,6 +173,9 @@ import config as global_config import logging import subprocess +import nltk +nltk.download('averaged_perceptron_tagger_eng') + class DefaultRefer: def __init__(self, path, text, language): @@ -1086,9 +1089,10 @@ async def tts_endpoint(request: Request): @app.get("/") async def tts_endpoint( - refer_wav_path: str = None, - prompt_text: str = None, - prompt_language: str = None, + refer_wav_path: str = "saotome/saotome-6s.wav", + prompt_text: str = "今日は友達と一緒に映画を見に行く予定ですが、天気が悪くて少し心配です。", + prompt_language: str = "all_ja", + character: str = "saotome", text: str = None, text_language: str = None, cut_punc: str = None, @@ -1096,10 +1100,11 @@ async def tts_endpoint( top_p: float = 1.0, temperature: float = 1.0, speed: float = 1.0, - inp_refs: list = Query(default=[]), + inp_refs: list = Query(default=["saotome/saotome-10s.wav", "saotome/refs/ref1.wav", "saotome/refs/ref2.wav", "saotome/refs/ref3.wav", "saotome/refs/ref4.wav"]), sample_steps: int = 32, if_sr: bool = False ): + print(f"the base path is {refer_wav_path}") return handle(refer_wav_path, prompt_text, prompt_language, text, text_language, cut_punc, top_k, top_p, temperature, speed, inp_refs, sample_steps, if_sr) diff --git a/requirements.txt b/requirements.txt index 16f8c8d4..5c7d80dc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,42 +1,40 @@ -numpy==1.23.4 -scipy -tensorboard -librosa==0.9.2 -numba==0.56.4 -pytorch-lightning -gradio>=4.0,<=4.24.0 -ffmpeg-python -onnxruntime; sys_platform == 'darwin' -onnxruntime-gpu; sys_platform != 'darwin' -tqdm -funasr==1.0.27 -torch<2.4 -cn2an -pypinyin -pyopenjtalk>=0.3.4 -g2p_en -torchaudio -modelscope==1.10.0 -sentencepiece -transformers>=4.43 -peft -chardet -PyYAML -psutil -jieba_fast -jieba -split-lang -fast_langdetect>=0.3.0 -Faster_Whisper -wordsegment -rotary_embedding_torch -ToJyutping -g2pk2 -ko_pron -opencc; sys_platform != 'linux' -opencc==1.1.1; sys_platform == 'linux' -python_mecab_ko; sys_platform != 'win32' -fastapi<0.112.2 -x_transformers -torchmetrics<=1.5 -attrdict +numpy==1.23.4 +scipy==1.13.1 +tensorboard==2.19.0 +librosa==0.9.2 +numba==0.56.4 +pytorch-lightning==2.5.1 +gradio==4.24.0 +ffmpeg-python==0.2.0 +onnxruntime==1.19.2 #(for macOS) +onnxruntime-gpu==1.19.2 #(for non-macOS) +tqdm==4.67.1 +funasr==1.0.27 +cn2an==0.5.23 +pypinyin==0.53.0 +pyopenjtalk==0.4.0 +g2p-en==2.1.0 +torchaudio==2.6.0 +modelscope==1.10.0 +sentencepiece==0.2.0 +transformers==4.50.0 +peft==0.15.0 +chardet==5.2.0 +PyYAML==6.0.2 +psutil==7.0.0 +jieba_fast==0.53 +jieba==0.42.1 +split-lang==2.1.0 +fast-langdetect==0.3.1 +faster-whisper==1.1.1 +wordsegment==1.3.1 +rotary-embedding-torch==0.8.6 +ToJyutping==3.2.0 +g2pk2==0.0.3 +ko-pron==1.3 +opencc==1.1.1 #(for Linux) +python-mecab-ko==1.3.7 #(not for Windows) +fastapi==0.112.1 # (since <0.112.2) +x-transformers==2.1.37 +torchmetrics==1.5.0 +attrdict==2.0.1 diff --git a/saotome/refs/ref1.wav b/saotome/refs/ref1.wav new file mode 100644 index 00000000..8e145909 Binary files /dev/null and b/saotome/refs/ref1.wav differ diff --git a/saotome/refs/ref2.wav b/saotome/refs/ref2.wav new file mode 100644 index 00000000..cea4e013 Binary files /dev/null and b/saotome/refs/ref2.wav differ diff --git a/saotome/refs/ref3.wav b/saotome/refs/ref3.wav new file mode 100644 index 00000000..63d933ea Binary files /dev/null and b/saotome/refs/ref3.wav differ diff --git a/saotome/refs/ref4.wav b/saotome/refs/ref4.wav new file mode 100644 index 00000000..a1d35c3f Binary files /dev/null and b/saotome/refs/ref4.wav differ diff --git a/saotome/saotome-10s.wav b/saotome/saotome-10s.wav new file mode 100644 index 00000000..c5dfdf53 Binary files /dev/null and b/saotome/saotome-10s.wav differ diff --git a/saotome/saotome-6s.wav b/saotome/saotome-6s.wav new file mode 100644 index 00000000..6154f29a Binary files /dev/null and b/saotome/saotome-6s.wav differ