# preprocessing.py
from datasets import load_dataset, load_from_disk
import soundfile as sf, os, re, neologdn, librosa
from tqdm import tqdm
import shutil
# Load the "train" split of the Japanese ("ja") Common Voice 17.0 dataset and
# keep only the samples that actually carry a transcript.
dataset = load_dataset("mozilla-foundation/common_voice_17_0", "ja")["train"]
dataset = dataset.filter(lambda sample: bool(sample["sentence"]))

name = "cv_17"
ouput_dir = "./datasets/"
out_file = 'metadata.csv'  # metadata file listing file names and transcripts

# Folder that stores the audio and transcription files.
folder_path = ouput_dir + name
os.makedirs(folder_path, exist_ok=True)
## Removes silence from each audio sample (top_db=30) and writes the trimmed sample to the "trimmed" subfolder.
## Samples that are still entirely silent after trimming (threshold=0.025) are moved to the "removed"
## subfolder for review. File names are logged to two CSV files: removed.csv and not_removed.csv.
# Default `top_db` value handed to librosa.effects.split() by remove_silence().
top_db = 30
def is_silent(mp3_file, threshold=0.025):
    """Tell whether an audio file contains nothing but silence.

    The file is loaded with librosa (sr=None, i.e. no resampling is
    requested) and its frame-wise RMS is computed.

    mp3_file: path of the audio file to inspect.
    threshold: RMS level every frame has to stay below.

    Returns True only if every RMS frame is below ``threshold``.
    """
    samples, _ = librosa.load(mp3_file, sr=None)
    frame_levels = librosa.feature.rms(y=samples)[0]
    for level in frame_levels:
        # Same comparison as `all(value < threshold ...)`: one frame that is
        # not strictly below the threshold makes the file non-silent.
        if not level < threshold:
            return False
    return True
def remove_silence(input_file, output_file, top_db=top_db):
    """Write a copy of ``input_file`` that keeps only its non-silent parts.

    The non-silent intervals reported by ``librosa.effects.split`` are
    concatenated and written to ``output_file``; the base name of the written
    file is then appended to the CSV at the module-level path ``csv_file2``.

    input_file: path of the audio file to read.
    output_file: path of the trimmed file to create.
    top_db: threshold forwarded to ``librosa.effects.split``.

    If ``output_file`` already exists nothing is read or written.
    """
    import numpy as np  # local import: numpy is not imported at file level

    # Check first: decoding and splitting the audio is wasted work when the
    # trimmed file is already there and the result would be thrown away.
    if os.path.exists(output_file):
        return

    y, sr = sf.read(input_file)
    intervals = librosa.effects.split(y, top_db=top_db)

    if len(intervals):
        # One array-level concatenation instead of growing a Python list
        # sample by sample.
        y_trimmed = np.concatenate([y[start:end] for start, end in intervals])
    else:
        # No non-silent interval found: keep an empty signal, as before.
        y_trimmed = y[:0]

    sf.write(output_file, y_trimmed, sr)
    with open(csv_file2, "a", encoding="utf-8") as f:
        f.write(os.path.basename(output_file) + "\n")
def process_directory(input_dir, output_dir, top_db=top_db):
    """Trim every ``.mp3`` file of ``input_dir`` and set aside the silent ones.

    Each ``.mp3`` directly inside ``input_dir`` is passed to
    ``remove_silence`` which writes the trimmed copy into ``output_dir``. If
    ``is_silent`` reports the trimmed copy as silent, its path is appended to
    the CSV at the module-level path ``csv_file`` and the file is moved into
    the module-level directory ``removed_dir``.

    input_dir: directory that is scanned for ``.mp3`` files.
    output_dir: directory receiving the trimmed files (created if missing).
    top_db: threshold forwarded to ``remove_silence``.
    """
    # exist_ok=True avoids the check-then-create race and matches how the
    # dataset folder itself is created.
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(removed_dir, exist_ok=True)

    for filename in os.listdir(input_dir):
        if not filename.endswith(".mp3"):
            continue

        input_file = os.path.join(input_dir, filename)
        output_file = os.path.join(output_dir, filename)
        remove_silence(input_file, output_file, top_db)

        if is_silent(output_file):
            # Log the trimmed path, then move the file out for review.
            with open(csv_file, "a", encoding="utf-8") as f:
                f.write(output_file + "\n")
            shutil.move(output_file, os.path.join(removed_dir, filename))
# Module-level locations read by remove_silence() and process_directory().
input_dir = folder_path
output_dir = f"{folder_path}/trimmed/"        # trimmed audio
removed_dir = f"{folder_path}/removed/"       # samples detected as silent
csv_file = f"{folder_path}/removed.csv"       # log of removed samples
csv_file2 = f"{folder_path}/not_removed.csv"  # log of written trimmed samples

# Trim whatever .mp3 files are already present in the dataset folder.
process_directory(input_dir=input_dir, output_dir=output_dir)
## Downloads audio samples and transcripts from Hugging Face datasets, creating audio files with a matching CSV
## containing the transcripts and a file name of your choice. Also does a bit of transcript pre-processing (https://github.com/ikegami-yukino/neologdn),
## which is Japanese-specific.
# Audio length bounds used by the sample filter: a sample is kept only when
# min < audio_length < max (audio_length = samples / sampling rate).
min = 1.0
max = 20.0

# A cleaned sentence must be longer than this many characters to be kept.
min_char = 4

# Character class of symbols that disqualify a sentence when found in it.
char = '[ 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890♬♪♩♫]'

# Character class of symbols that are stripped from every sentence.
special_characters = '[“%‘”~゛#$%&()*+:;〈=〉@^_{|}~"█』『.;:<>_()*&^$#@`, ]' #「」
# Export loop: clean each transcript, filter the sample, write its audio,
# trim it through process_directory() and append a row to the metadata file.

# Loop-invariant values, built once instead of on every iteration.
transcription_path = os.path.join(folder_path, out_file)  # Path to save transcription file
patterns = [(r"…",'。'), (r"!!",'!'), (special_characters,""), (r"\s+", "")]  # (r"(.)\1{2}")

# tqdm wraps the dataset itself (not the enumerate object) so it can read the
# dataset length and show the total; `i` is the same index as before.
for i, sample in enumerate(tqdm(dataset)):  # Process each sample in the filtered dataset
    sentence = sample["sentence"]
    if sentence == "":
        continue

    audio_sample = name + f'_{i}.mp3'  # or wav
    audio_path = os.path.join(folder_path, audio_sample)
    if os.path.exists(audio_path):
        continue

    # Transcript clean-up.
    for pattern, replace in patterns:
        sentence = re.sub(pattern, replace, sentence)
    # For Japanese only; the repeat number reduces repeated characters.
    sentence = neologdn.normalize(sentence, repeat=1)

    # Cleaning can strip a transcript down to nothing (e.g. one made only of
    # removed symbols). Skip it here: indexing sentence[-1] on an empty
    # string would raise IndexError.
    if not sentence:
        continue

    if sentence[-1] not in ("!", "?", "。"):
        sentence += "。"  # Add a period if it doesn't end with ! or ?

    audio = sample["audio"]
    audio_length = len(audio["array"]) / audio["sampling_rate"]

    # Filters: audio length strictly between min and max, no disallowed
    # character in the sentence, and a sentence longer than min_char.
    if not (max > audio_length > min):
        continue
    if re.search(char, sentence) or not len(sentence) > min_char:
        continue

    sf.write(audio_path, audio["array"], audio["sampling_rate"])  # Get files
    process_directory(folder_path, (folder_path + "/trimmed/"))  # silence removal
    # The untrimmed file is no longer needed once the trimmed copy exists.
    if os.path.isfile(audio_path):
        os.remove(audio_path)

    # One "<file name>,<transcript>" row per exported sample.
    with open(transcription_path, 'a', encoding='utf-8') as transcription_file:
        transcription_file.write(f"{audio_sample},{sentence}\n")