Simonlob committed on
Commit
78c6bc3
1 Parent(s): 3df3706

Delete Create_dataset

Browse files
Create_dataset/__init__.py DELETED
@@ -1 +0,0 @@
1
-
 
 
Create_dataset/cr_dataset_script.py DELETED
@@ -1,97 +0,0 @@
1
- import pandas as pd
2
- import numpy as np
3
- from datasets import load_dataset
4
- from datasets import Dataset, DatasetDict
5
- from IPython.display import Audio
6
- import scipy
7
- import librosa
8
- from tqdm import tqdm
9
- import re
10
- import os
11
-
12
-
13
def load_audio(audio_dict: dict) -> None:
    """Resample one audio record to 22050 Hz and write it as a 16-bit PCM WAV.

    Args:
        audio_dict: Record providing ``array`` (the samples),
            ``sampling_rate`` (the rate of those samples) and ``path``
            (where the WAV file is written; a relative path resolves
            against the current working directory).
    """
    # Import the submodule explicitly: a bare `import scipy` does not
    # guarantee that `scipy.io.wavfile` is loaded on every SciPy version.
    from scipy.io import wavfile

    target_sr = 22050
    audio_resampled = librosa.resample(
        np.asarray(audio_dict['array']),
        orig_sr=audio_dict['sampling_rate'],
        target_sr=target_sr,
    )
    # The 32767 scale assumes float samples in [-1.0, 1.0] - TODO confirm
    # against the dataset. Resampling can overshoot that range slightly, and
    # an unclipped value would wrap around in the int16 cast (audible click).
    pcm = (np.clip(audio_resampled, -1.0, 1.0) * 32767).astype(np.int16)
    wavfile.write(audio_dict['path'], rate=target_sr, data=pcm)
21
-
22
def remove_outer_quotes_regex(sen: str) -> str:
    """Strip one pair of matching quotes that wraps the whole sentence.

    Only a sentence that both starts and ends with the *same* quote
    character (``"`` or ``'``) is unwrapped; anything else is returned
    unchanged, so a leading quote paired with a trailing apostrophe is left
    alone. Inner quotes are never touched.

    Args:
        sen: Sentence to clean.

    Returns:
        The sentence without its enclosing quote pair, or ``sen`` itself
        when it is not wrapped in a matching pair.
    """
    # \1 forces the closing quote to equal the opening one; DOTALL lets the
    # body span line breaks so multi-line sentences are unwrapped as well.
    return re.sub(r'^(["\'])(.*)\1$', r'\2', sen, flags=re.DOTALL)
24
-
25
def _export_split(split, wav_dir, filelist_path, num_shards=1):
    """Write every clip of ``split`` as a WAV and save a ``path|text`` filelist.

    Args:
        split: Dataset split whose rows provide ``audio`` and
            ``raw_transcription``.
        wav_dir: Absolute directory that receives the WAV files.
        filelist_path: Destination of the pipe-separated filelist.
        num_shards: Number of shards the split is walked in; ``1`` iterates
            the split directly.
    """
    if num_shards > 1:
        shards = (split.shard(num_shards=num_shards, index=ind)
                  for ind in range(num_shards))
    else:
        shards = (split,)

    paths = []
    texts = []
    start_dir = os.getcwd()
    # load_audio writes to the record's own 'path', so a relative file name
    # lands in the current working directory.
    os.chdir(wav_dir)
    try:
        with tqdm(total=len(split), leave=False) as pbar:
            for shard in shards:
                for row in shard:
                    load_audio(row['audio'])
                    paths.append(os.path.join(wav_dir, row['audio']['path']))
                    texts.append(row['raw_transcription'])
                    pbar.update(1)
    finally:
        # Always hand the caller back its working directory, even on failure.
        os.chdir(start_dir)

    df = pd.DataFrame({'path': paths, 'text': texts})
    df['text'] = df['text'].map(remove_outer_quotes_regex)
    df.to_csv(filelist_path, sep='|', header=False, index=False)


def main() -> None:
    """Download the Kany dataset and export it as WAV files plus filelists.

    Creates ``kany_dataset/wavs`` under the current working directory,
    writes every train and test clip there through ``load_audio``, and
    saves ``kany_filelist_train.txt`` / ``kany_filelist_test.txt`` (rows of
    ``absolute_wav_path|text``) in ``kany_dataset``. The working directory
    is the same on return as it was on entry.
    """
    dataset_root = os.path.abspath('kany_dataset')
    wav_dir = os.path.join(dataset_root, 'wavs')
    # exist_ok lets the script be re-run without crashing on the directories
    # left behind by a previous run.
    os.makedirs(wav_dir, exist_ok=True)

    # Backslashes are escaped explicitly: in a plain literal `\_` is an
    # invalid escape and a backslash that ends a line swallows the newline.
    # NOTE(review): banner spacing was reconstructed - adjust if it looks off.
    art = (
        "\n"
        " /\\_/\\\n"
        "( o.o )\n"
        " > ^ <\n"
        "\n"
        "V O I C E\n"
    )
    print(art)

    print('--- LOADING DATASET ---')
    dataset_kany = load_dataset("Simonlob/Kany_dataset_mk4")

    # mk TRAIN
    print()
    print('--- CONVERTIND AND SAVING THE TRAIN DATASET ---')
    _export_split(
        dataset_kany['train'],
        wav_dir,
        os.path.join(dataset_root, 'kany_filelist_train.txt'),
        num_shards=20,
    )

    # mk TEST
    print()
    print('--- CONVERTIND AND SAVING THE TEST DATASET ---')
    _export_split(
        dataset_kany['test'],
        wav_dir,
        os.path.join(dataset_root, 'kany_filelist_test.txt'),
    )

    print()
    print('--- THE DATASET IS READY ---')
    print(f'Dir of data is "{dataset_root}"')


# Run the export only when executed as a script, not when imported.
if __name__ == "__main__":
    main()