From 134098cc4babc9af2f428fb138c8b010e9601130 Mon Sep 17 00:00:00 2001
From: Yuan-Man <68322456+Yuan-ManX@users.noreply.github.com>
Date: Mon, 22 Jan 2024 01:09:10 +0800
Subject: [PATCH] Add process_dataset.py

Support third-party dataset import.
---
 GPT_SoVITS/process_dataset.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 GPT_SoVITS/process_dataset.py

diff --git a/GPT_SoVITS/process_dataset.py b/GPT_SoVITS/process_dataset.py
new file mode 100644
index 0000000..cb5856a
--- /dev/null
+++ b/GPT_SoVITS/process_dataset.py
@@ -0,0 +1,25 @@
+import os
+
+
+def convert_dataset(input_folder, output_file, language='zh'):
+    """Write one ``audio_path|speaker|language|text`` line per .wav in input_folder.
+
+    Each audio file needs a same-stem transcript (voice01.wav -> voice01.txt);
+    the stem is also used as the speaker name. All transcripts are read before
+    output_file is opened, so a missing one (FileNotFoundError) leaves it untouched.
+    """
+    lines = []
+    # os.listdir order is arbitrary; sort so the generated list is reproducible.
+    for audio_file in sorted(f for f in os.listdir(input_folder) if f.endswith('.wav')):
+        audio_path = os.path.join(input_folder, audio_file)
+        # splitext swaps only the real extension; str.replace('.wav', '.txt')
+        # also rewrote a '.wav' inside the stem (e.g. 'a.wav.take2.wav').
+        speaker_name = os.path.splitext(audio_file)[0]
+        text_file = os.path.join(input_folder, speaker_name + '.txt')
+        # utf-8-sig also accepts plain UTF-8 but drops a leading BOM if present.
+        with open(text_file, 'r', encoding='utf-8-sig') as text_content:
+            text = text_content.read().replace('\n', '')
+        lines.append(f'{audio_path}|{speaker_name}|{language}|{text}\n')
+    with open(output_file, 'w', encoding='utf-8') as output:
+        output.writelines(lines)
+