-
Notifications
You must be signed in to change notification settings - Fork 5
/
c_corpus.py
46 lines (43 loc) · 1.42 KB
/
c_corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import os
import json
from tqdm import tqdm
def files(path):
    """Return the paths of every file found beneath ``path``.

    The tree rooted at ``path`` is traversed with ``os.walk``; each file
    name is joined to the directory that contains it. Directories
    themselves are not included. The order of the result is whatever
    order ``os.walk`` yields. A directory with no files produces an
    empty list.
    """
    return [
        os.path.join(root, name)
        for root, _subdirs, names in os.walk(path)
        for name in names
    ]
def _write_split(out_path, problem_ids, next_index):
    """Write one dataset split as JSON lines and return the next free index.

    For every problem id in ``problem_ids`` the directory
    ``ProgramData/<id>`` is scanned with ``files`` and each file found
    becomes one line of ``out_path`` with three keys:

    * ``label`` - the problem id, as a string
    * ``index`` - a running example number, as a string
    * ``code``  - the file contents, decoded as latin-1

    Args:
        out_path: Path of the ``.jsonl`` file to create (overwritten).
        problem_ids: Sized iterable of problem directory numbers.
        next_index: Index to assign to the first example written.

    Returns:
        The index following the last example written, so consecutive
        splits can continue the same numbering.
    """
    with open(out_path, 'w') as out:
        for problem_id in tqdm(problem_ids, total=len(problem_ids)):
            # The label is the problem directory name. Using the id directly
            # (instead of splitting the path on '/') keeps it correct on
            # platforms whose path separator is not '/'.
            label = str(problem_id)
            for item in files("ProgramData/{}".format(problem_id)):
                # Read inside a context manager so each source file handle
                # is closed promptly instead of being leaked.
                with open(item, encoding='latin-1') as src:
                    code = src.read()
                js = {'label': label, 'index': str(next_index), 'code': code}
                out.write(json.dumps(js) + '\n')
                next_index += 1
    return next_index


# Example indices run continuously across the three splits:
# problems 1-64 -> train, 65-80 -> valid, 81-104 -> test.
cont = 0
cont = _write_split("train.jsonl", range(1, 65), cont)
cont = _write_split("valid.jsonl", range(65, 81), cont)
cont = _write_split("test.jsonl", range(81, 105), cont)