-
Notifications
You must be signed in to change notification settings - Fork 2
/
msc-admin.py
126 lines (115 loc) · 4.1 KB
/
msc-admin.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
from firebase_admin import storage
import argparse
import sys
import csv
import shutil
import os
def parse_args(args):
    """Parse the command-line options of the corpus admin tool.

    Args:
        args: Sequence of argument strings, or ``None`` to let argparse
            read ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with two attributes: ``infile`` (``None``
        when the option is absent or given without a value) and
        ``category`` (``"default"`` when absent or given without a value).
    """
    parser = argparse.ArgumentParser(
        description="Backend data processor for Malayalam speech corpus app"
    )
    parser.add_argument(
        "-i",
        "--infile",
        metavar="INPUT.txt",
        help="Input text file containing sentences per line",
        nargs="?",
    )
    parser.add_argument(
        "-c",
        "--category",
        help="Category of the input text corpus.",
        default="default",
        # With nargs="?" a bare "-c" produces `const`, which argparse
        # defaults to None; pin it so the category is never None.
        const="default",
        nargs="?",
    )
    return parser.parse_args(args)
class MSCFirestore:
    """Handle to the Firestore collections used by the speech corpus app.

    After construction the instance exposes three Firestore collection
    references: ``sentences``, ``speech`` and ``users``.
    """

    # Service-account key file used when no explicit path is supplied.
    DEFAULT_CREDENTIALS = (
        "malayalam-speech-corpora-firebase-adminsdk-2dav1-46f57ee2b2.json"
    )

    def __init__(self, cred_path=None):
        """Initialise the Firebase app and bind the collection references.

        Args:
            cred_path: Optional path to the service-account JSON key.
                Falls back to ``DEFAULT_CREDENTIALS`` when omitted.
        """
        cred = credentials.Certificate(cred_path or self.DEFAULT_CREDENTIALS)
        firebase_admin.initialize_app(
            cred,
            {
                "databaseURL": "https://malayalam-speech-corpora.firebaseio.com",
                "storageBucket": "malayalam-speech-corpora.appspot.com",
            },
        )
        db = firestore.client()
        self.sentences = db.collection("sentences")
        self.speech = db.collection("speech")
        self.users = db.collection("users")

    def add_sentence(self, sentence, category):
        """Store *sentence*, or re-categorise it when it already exists.

        Every stored document whose ``sentence`` field equals *sentence* is
        inspected: a differing category is overwritten ("[Edit]"), a
        matching one is left alone ("[Skip]"). When no document matches, a
        new one is added ("[Add]"). Progress is printed to stdout.

        Args:
            sentence: Sentence text to store.
            category: Category label to associate with the sentence.
        """
        found = False
        for doc in self.sentences.where("sentence", "==", sentence).stream():
            found = True
            # .get() so a stored document without a category is treated as
            # a mismatch and repaired instead of raising KeyError.
            if doc.to_dict().get("category") != category:
                print("[Edit] %s %s %s" % (doc.id, sentence, category))
                self.sentences.document(doc.id).set(
                    {"sentence": sentence, "category": category}
                )
            else:
                print("[Skip] %s %s %s" % (doc.id, sentence, category))
        if not found:
            self.sentences.add({"sentence": sentence, "category": category})
            print("[Add] %s %s" % (sentence, category))

    def remove_sentence(self, key):
        """Delete the sentence document whose id is *key*."""
        # Firestore collections address documents with document();
        # child() belongs to the Realtime Database API and does not exist here.
        self.sentences.document(key).delete()
def process_input_file(infile, category, dbref):
    """Add every non-blank line of *infile* to the sentence store.

    Args:
        infile: Path of a text file holding one sentence per line.
        category: Category label passed along with each sentence.
        dbref: Object exposing ``add_sentence(sentence, category)``.
    """
    # Decode explicitly as UTF-8: the corpus is Malayalam text and the
    # locale's default encoding may be unable to read it.
    with open(infile, "r", encoding="utf-8") as source:
        for line in source:
            sentence = line.rstrip()
            if not sentence:
                # Blank lines would otherwise be stored as empty sentences.
                continue
            dbref.add_sentence(sentence, category)
def save_data(docs, outfile, keys):
    """Write *docs* to *outfile* as a tab-separated table.

    The first row is the header (*keys*), written even when *docs* is
    empty. Each following row holds one document: the ``id`` column is the
    document id, every other column is looked up in the document's fields
    and falls back to the string ``'default'`` when missing. A summary line
    is printed to stdout.

    Args:
        docs: Iterable of objects exposing ``id`` and ``to_dict()``.
        outfile: Path of the TSV file to create or overwrite.
        keys: Column names, in output order.
    """
    count = 0
    # newline="" lets the csv module control line endings; UTF-8 keeps
    # non-ASCII text writable regardless of the locale.
    with open(outfile, "w", encoding="utf-8", newline="") as handle:
        writer = csv.writer(handle, delimiter="\t")
        writer.writerow(keys)
        for doc in docs:
            fields = doc.to_dict() or {}
            writer.writerow(
                [
                    doc.id if key == "id" else fields.get(key, "default")
                    for key in keys
                ]
            )
            count += 1
    print('Wrote {} items to {}'.format(count, outfile))
def download_samples(speech_filename):
    """Download every storage object whose name starts with *speech_filename*.

    Each object is saved under its own object name, relative to the
    current working directory. The pass is best-effort: a failed download
    is reported on stderr and the remaining objects are still attempted.

    Args:
        speech_filename: Object-name prefix to look up in the bucket.
    """
    client = storage.bucket().client
    blobs = client.list_blobs(
        'malayalam-speech-corpora.appspot.com', prefix=speech_filename
    )
    for blob in blobs:
        target_dir = os.path.dirname(blob.name)
        try:
            if target_dir:
                # Object names may contain directories that do not exist yet.
                os.makedirs(target_dir, exist_ok=True)
            blob.download_to_filename(blob.name)
        except Exception as exc:
            # Keep going, but never hide the failure; a bare except would
            # also swallow KeyboardInterrupt.
            print("[Fail] %s: %s" % (blob.name, exc), file=sys.stderr)
def main(args=None):
    """Run the admin tool: import sentences, or export the database.

    With ``--infile`` the sentences of that file are added under the given
    category. Otherwise the ``sentences``, ``speech`` and ``users``
    collections are dumped to TSV files, the ``audio`` directory is
    recreated, and the sample named by each ``fileName`` entry of
    ``speech.tsv`` is downloaded.

    Args:
        args: Sequence of argument strings, or ``None`` for ``sys.argv``.

    Returns:
        ``None``.
    """
    options = parse_args(args)
    # Named `store` so the imported `firestore` module is not shadowed.
    store = MSCFirestore()
    if options.infile:
        process_input_file(options.infile, options.category, store)
    else:
        save_data(
            store.sentences.stream(),
            'sentences.tsv',
            keys=['id', 'sentence', 'category'],
        )
        save_data(
            store.speech.stream(),
            'speech.tsv',
            keys=['id', 'sentence', 'user', 'fileName', 'sample', 'time', 'vote'],
        )
        save_data(
            store.users.stream(),
            'users.tsv',
            keys=['id', 'name', 'gender', 'ageGroup'],
        )

        # Start from an empty audio directory on every export.
        shutil.rmtree('audio', ignore_errors=True)
        os.mkdir('audio')

        # errors="replace": only the fileName column is used here, so an
        # undecodable byte elsewhere should not abort the download pass.
        with open(
            'speech.tsv', encoding='utf-8', errors='replace', newline=''
        ) as tsv_in:
            # DictReader consumes the header row, so the literal column
            # title is never mistaken for a file name, and short rows yield
            # None instead of raising IndexError.
            for row in csv.DictReader(tsv_in, delimiter="\t"):
                speech_filename = row.get('fileName')
                if speech_filename:
                    download_samples(speech_filename)
if __name__ == "__main__":
    # Script entry point: run the CLI and pass main()'s result to the
    # interpreter as the process exit status.
    exit_status = main()
    sys.exit(exit_status)