-
Notifications
You must be signed in to change notification settings - Fork 1
/
parseXML.py
110 lines (96 loc) · 7.71 KB
/
parseXML.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import re
import unittest
class TestXMLParser(unittest.TestCase):
    """Regression tests for the page XML helpers, run against the NAF 0041 fixture."""

    # Page XML fixture shared by both tests (relative path, resolved against
    # the current working directory).
    NAF0041_XML = 'fixtures/naf/0041.xml'

    def test_NAF0041_size(self):
        """getPagesize must return the dimensions recorded for the fixture."""
        expected_size = (4190, 2480)
        self.assertEqual(getPagesize(self.NAF0041_XML), expected_size)

    def test_NAF0041(self):
        """getTextlines must return every textline polygon of the fixture, in order."""
        # One entry per textline; each entry is the list of [x, y] polygon points.
        expected_polygons = [
            [[2168, 210], [2257, 204], [2259, 239], [2170, 245]],
            [[300, 224], [1108, 241], [1337, 247], [1914, 255], [1914, 290], [1336, 282], [1107, 276], [299, 259]],
            [[309, 315], [579, 303], [1150, 301], [1680, 319], [1913, 328], [1912, 363], [1679, 354], [1150, 336], [580, 338], [310, 350]],
            [[303, 377], [489, 377], [987, 375], [1506, 388], [1876, 398], [1875, 433], [1505, 423], [987, 410], [489, 412], [303, 412]],
            [[304, 452], [605, 443], [1116, 454], [1543, 468], [1885, 470], [1885, 505], [1542, 503], [1115, 489], [605, 478], [305, 487]],
            [[276, 522], [580, 511], [984, 526], [1624, 544], [1926, 549], [1926, 584], [1623, 579], [983, 561], [580, 546], [277, 557]],
            [[279, 600], [696, 596], [1258, 604], [1746, 619], [1950, 623], [1949, 658], [1745, 654], [1257, 639], [696, 631], [279, 635]],
            [[262, 668], [1303, 681], [1303, 716], [262, 703]],
            [[1056, 774], [1329, 768], [1516, 768], [1786, 747], [1788, 782], [1517, 803], [1329, 803], [1057, 809]],
            [[307, 860], [1107, 871], [1392, 883], [1639, 889], [1851, 885], [1852, 920], [1639, 924], [1391, 918], [1106, 906], [307, 895]],
            [[297, 947], [1201, 962], [1201, 997], [297, 982]],
            [[1026, 1036], [1578, 1044], [1764, 1050], [1763, 1085], [1577, 1079], [1026, 1071]],
            [[1173, 1108], [1251, 1104], [1253, 1139], [1175, 1143]],
            [[1371, 1110], [1433, 1110], [1433, 1145], [1371, 1145]],
            [[770, 1174], [1061, 1180], [1060, 1215], [769, 1209]],
            [[1234, 1190], [1341, 1190], [1341, 1225], [1234, 1225]],
            [[596, 1228], [745, 1238], [863, 1250], [1242, 1246], [1618, 1250], [1618, 1285], [1242, 1281], [862, 1285], [742, 1273], [594, 1263]],
            [[313, 1347], [666, 1349], [1010, 1356], [1301, 1360], [1641, 1374], [1862, 1382], [1861, 1417], [1640, 1409], [1300, 1395], [1009, 1391], [666, 1384], [313, 1382]],
            [[310, 1439], [591, 1434], [649, 1432], [1000, 1437], [1274, 1439], [1515, 1449], [1759, 1453], [1892, 1456], [1891, 1491], [1758, 1488], [1514, 1484], [1273, 1474], [1000, 1472], [649, 1467], [592, 1469], [311, 1474]],
            [[295, 1531], [660, 1505], [1051, 1514], [1452, 1533], [1688, 1533], [1901, 1536], [1901, 1571], [1688, 1568], [1451, 1568], [1050, 1549], [661, 1540], [297, 1566]],
            [[296, 1604], [507, 1610], [765, 1596], [1164, 1596], [1482, 1604], [1911, 1611], [1911, 1646], [1481, 1639], [1164, 1631], [766, 1631], [507, 1645], [295, 1639]],
            [[292, 1685], [578, 1681], [932, 1673], [1278, 1679], [1645, 1678], [1897, 1683], [1896, 1718], [1645, 1713], [1278, 1714], [932, 1708], [579, 1716], [292, 1720]],
            [[306, 1768], [487, 1763], [831, 1756], [1227, 1760], [1616, 1758], [1914, 1767], [1913, 1802], [1616, 1793], [1227, 1795], [831, 1791], [488, 1798], [307, 1803]],
            [[313, 1840], [574, 1838], [1010, 1830], [1394, 1831], [1909, 1840], [1908, 1875], [1394, 1866], [1010, 1865], [574, 1873], [313, 1875]],
            [[296, 1917], [625, 1915], [1045, 1912], [1317, 1919], [1583, 1924], [1937, 1932], [1936, 1967], [1582, 1959], [1316, 1954], [1045, 1947], [625, 1950], [296, 1952]],
            [[294, 1994], [704, 1997], [1277, 1997], [1761, 1999], [1949, 2014], [1947, 2049], [1760, 2034], [1277, 2032], [704, 2032], [294, 2029]],
            [[294, 2064], [655, 2069], [965, 2081], [1391, 2071], [1903, 2079], [1903, 2114], [1391, 2106], [965, 2116], [654, 2104], [294, 2099]],
            [[296, 2146], [692, 2144], [941, 2147], [1244, 2149], [1599, 2151], [1903, 2156], [1903, 2191], [1599, 2186], [1244, 2184], [941, 2182], [692, 2179], [296, 2181]],
            [[291, 2230], [453, 2226], [724, 2216], [1007, 2216], [1652, 2228], [1923, 2228], [1923, 2263], [1652, 2263], [1007, 2251], [725, 2251], [454, 2261], [292, 2265]],
            [[297, 2301], [599, 2305], [1141, 2299], [1914, 2313], [1913, 2348], [1141, 2334], [599, 2340], [297, 2336]],
            [[304, 2388], [529, 2386], [529, 2421], [304, 2423]],
            [[918, 2460], [1409, 2475], [1408, 2510], [917, 2495]],
            [[1092, 2560], [1297, 2555], [1298, 2590], [1093, 2595]],
            [[729, 2569], [845, 2569], [845, 2604], [729, 2604]],
            [[287, 2623], [593, 2635], [929, 2632], [1275, 2640], [1746, 2640], [1958, 2659], [1955, 2694], [1745, 2675], [1275, 2675], [929, 2667], [593, 2670], [286, 2658]],
            [[243, 2704], [593, 2704], [974, 2716], [1441, 2722], [1896, 2722], [1896, 2757], [1441, 2757], [973, 2751], [593, 2739], [243, 2739]],
            [[259, 2791], [531, 2780], [915, 2791], [1248, 2791], [1580, 2789], [1942, 2804], [1941, 2839], [1579, 2824], [1248, 2826], [915, 2826], [531, 2815], [260, 2826]],
            [[244, 2871], [547, 2855], [888, 2862], [1359, 2865], [1722, 2874], [1958, 2877], [1958, 2912], [1721, 2909], [1359, 2900], [888, 2897], [547, 2890], [246, 2906]],
            [[268, 2964], [508, 2951], [775, 2944], [1130, 2946], [1536, 2944], [1869, 2946], [1869, 2981], [1536, 2979], [1130, 2981], [775, 2979], [509, 2986], [270, 2999]],
            [[274, 3038], [595, 3033], [1408, 3034], [1737, 3039], [1942, 3048], [1941, 3083], [1736, 3074], [1408, 3069], [595, 3068], [274, 3073]],
            [[247, 3118], [663, 3125], [1153, 3118], [1647, 3123], [1914, 3123], [1914, 3158], [1647, 3158], [1153, 3153], [663, 3160], [246, 3153]],
            [[268, 3214], [893, 3203], [1746, 3214], [1954, 3224], [1953, 3259], [1745, 3249], [893, 3238], [269, 3249]],
            [[237, 3294], [684, 3289], [974, 3283], [1387, 3286], [1978, 3296], [1977, 3331], [1387, 3321], [974, 3318], [684, 3324], [237, 3329]],
            [[247, 3371], [610, 3370], [610, 3405], [247, 3406]],
            [[880, 3475], [1387, 3477], [1387, 3512], [880, 3510]],
            [[860, 3574], [1391, 3583], [1390, 3618], [859, 3609]],
            [[1064, 3648], [1245, 3639], [1246, 3674], [1065, 3683]],
            [[297, 3725], [1942, 3742], [1942, 3777], [297, 3760]],
            [[253, 3805], [1927, 3827], [1927, 3862], [253, 3840]],
        ]
        self.assertEqual(getTextlines(self.NAF0041_XML), expected_polygons)
def getPagesize(input_xml):
    """
    getPagesize
    Returns image dimensions, as found in the XML file.

    The dimensions are taken from the ``imageWidth="..." imageHeight="..."``
    attribute pair (in that order, separated by a single space), which must
    occur exactly once in the file.

    :param input_xml: A page XML file (path).
    :return: Tuple ``(height, width)`` of ints -- note that height comes first.
    :raises AssertionError: If the attribute pair does not occur exactly once.
    """
    size_regex = r'imageWidth="(\d+)" imageHeight="(\d+)"'
    # The context manager closes the file even when reading raises.
    with open(input_xml, 'r') as textfile:
        size_matches = re.findall(size_regex, textfile.read())
    if len(size_matches) != 1:
        # Same exception type as before, now with a diagnostic message.
        raise AssertionError(
            'expected exactly one imageWidth/imageHeight pair in %r, found %d'
            % (input_xml, len(size_matches)))
    width, height = size_matches[0]
    return int(height), int(width)
def getTextlines(input_xml):
    """
    getTextlines
    Returns a list of all textlines found in the given page XML input.

    For every ``<TextLine id=...> ... </TextLine>`` element, the ``points``
    attribute of the first ``<Coords points="..."/>`` that follows the opening
    tag is parsed.

    :param input_xml: A page XML file (path).
    :return: List of textlines. Each textline is a list of ``[x, y]`` int
        pairs describing the surrounding polygon, in document order.
    :raises ValueError: If a coordinate pair is missing its x or y value.
    """
    def parseCoordinateList(points):
        """Turn a points string such as ``"1,2 3,4"`` into ``[[1, 2], [3, 4]]``."""
        # A single pattern with two groups yields exactly one x and one y per
        # pair, so no separate count check is needed.
        pair_regex = r'(\d*),(\d*)'
        res = []
        for x, y in re.findall(pair_regex, points):
            if not x or not y:
                # A lone comma or half a pair: fail with a clear message
                # instead of the opaque error int('') would produce.
                raise ValueError(
                    'malformed coordinate pair %r in points %r'
                    % (x + ',' + y, points))
            res.append([int(x), int(y)])
        return res

    # [.\s\S] matches any character including newlines, so a TextLine element
    # may span several lines; the lazy quantifiers stop at the nearest match.
    textline_regex = r'<TextLine id=[.\s\S]*?<Coords points="(?P<textline>[.\s\S]*?)"/>[.\s\S]*?</TextLine>'
    # The context manager closes the file even when reading raises.
    with open(input_xml, 'r') as textfile:
        matches = re.findall(textline_regex, textfile.read())
    return [parseCoordinateList(m) for m in matches]
# Run the unit tests defined in this module when it is executed as a script.
if __name__ == '__main__':
    unittest.main()