forked from pedrobalage/Maltparser-Universal-Tree-Bank-PT-BR
-
Notifications
You must be signed in to change notification settings - Fork 0
/
convert_conll_ptb.py
49 lines (36 loc) · 1.48 KB
/
convert_conll_ptb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/python
# -*- coding: utf-8 -*-
#### Script to convert conll corpus to ptb format
####
# Author: Pedro Balage ([email protected])
# Date: 25/05/2015
# Version: 1.0
# Python 3 compatibility
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
from __future__ import with_statement
# imports
import os
import codecs
# Input corpus: one token per line with tab-separated fields, and a blank
# line between sentences.
CORPUS_FILE = 'pt-br-corrected/pt-br-universal-train-dev.conll'
# Output file: one sentence per line, tokens written as WORD_TAG.
TRAIN_FILE = 'pt-br-corrected/pt-br-universal-train-dev.ptb' # to be created


def _format_sentence(grid):
    """Return the output line for one sentence.

    Each row of *grid* is the list of tab-separated fields of one input
    line. Field 1 is emitted as the word and field 3 as the tag, joined by
    an underscore; tokens are separated by single spaces and the line is
    newline-terminated.
    """
    return ' '.join([row[1] + '_' + row[3] for row in grid]) + '\n'


with codecs.open(CORPUS_FILE, 'r', encoding='utf8') as input_file:
    with codecs.open(TRAIN_FILE, 'w', encoding='utf8') as output_file:
        # Rows (field lists) of the sentence currently being collected.
        grid = []
        for line in input_file:
            line = line.strip()

            # Blank line: sentence boundary. Flush the collected sentence
            # in WORD_TAG form and start a new one.
            if not line:
                output_file.write(_format_sentence(grid))
                grid = []
                continue

            line_items = line.split('\t')
            if len(line_items) != 10:
                # Only a warning: the row is still kept, so a line with
                # fewer than 4 fields will fail when the sentence is written.
                print ('Problem, line doesnt have 10 values: \n{0}\n\n'.format(line))
            grid.append(line_items)

        # At the end of the file, save the last sentence when the input does
        # not end with a blank line. It goes through the same formatter as
        # every other sentence so the WORD_TAG separator is always present.
        if grid:
            output_file.write(_format_sentence(grid))