forked from modernmt/DataCollection
-
Notifications
You must be signed in to change notification settings - Fork 3
/
parseXML.py
112 lines (76 loc) · 3.18 KB
/
parseXML.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env python
#--------------------------------------------------------------------------------------
# Parse the XML files to extract the title and the URL of the web pages crawled by the ILSP Crawler.
#--------------------------------------------------------------------------------------
import xml.etree.ElementTree as ET
from os import listdir
from os.path import isfile, join
import re
import codecs
import sys
def getInfo(xmlFile):
    """Return the title and the URL stored in one XML file of a crawled page.

    Both values are located purely by child position inside the document;
    no tag names or namespaces are consulted, so the file must have the
    exact nesting this function expects.

    Args:
        xmlFile: Path of the XML file (anything accepted by
            ``xml.etree.ElementTree.parse``).

    Returns:
        A ``(title, url)`` tuple. An element that carries no text yields
        ``""`` instead of ``None``.

    Raises:
        IndexError: If the document lacks one of the expected child elements.
    """
    root = ET.parse(xmlFile).getroot()
    # Third child of the first grandchild of the root.
    sourceDesc = root[0][0][2]
    # The title and the address live under the same nested element.
    entry = sourceDesc[0][0]
    title = entry[0]
    eAddress = entry[2][3]
    # ``.text`` is None for an empty element; normalise that to "".
    return title.text or "", eAddress.text or ""
def isXMLParsedFile(fileCheck):
    """Tell whether *fileCheck* is named like a converted HTML page.

    Such files are called with a number followed by ``.xml`` (e.g. ``61.xml``).

    Returns:
        The ``re`` match object when the name fits, otherwise ``None``.
    """
    numberedXmlName = r'^\d+\.xml$'
    found = re.search(numberedXmlName, fileCheck)
    return found
def isXMLMappedFile(fileCheck):
    """Tell whether *fileCheck* is named like a mapping file (e.g. ``59_12_m.xml``).

    The name must end with an underscore, one or more lowercase letters and
    the literal extension ``.xml``. The dot is escaped so that it only matches
    a real ``.`` and not an arbitrary character.

    Returns:
        The ``re`` match object when the name fits, otherwise ``None``.
    """
    return re.search(r'_[a-z]+\.xml$', fileCheck)
def getXMLFiles (directory) :
    """Collect the names of the XML files found directly inside *directory*.

    Only entries for which ``isfile`` is true are considered; sub-directories
    are not descended into.

    Returns:
        A ``(xmlParsedList, xmlMappedList)`` tuple: the names accepted by
        ``isXMLParsedFile`` and the names accepted by ``isXMLMappedFile``,
        each in the order ``listdir`` reported them.
    """
    parsedNames = []
    mappedNames = []
    for entry in listdir(directory):
        if not isfile(join(directory, entry)):
            continue
        if isXMLParsedFile(entry):
            parsedNames.append(entry)
        if isXMLMappedFile(entry):
            mappedNames.append(entry)
    return parsedNames, mappedNames
def getMapping (fileMapped) :
    """Derive the two mapped file names from the name of a mapping file.

    The trailing ``_<lowercase letters>.xml`` is removed and the remainder is
    split on ``_``; each half gets ``.xml`` appended, so ``59_12_m.xml``
    yields ``("59.xml", "12.xml")``.

    Args:
        fileMapped: Name of the mapping file.

    Returns:
        A ``(fileS, fileD)`` tuple of file names.

    Raises:
        ValueError: If the remainder does not consist of exactly two
            ``_``-separated parts.
    """
    # Raw string: "\." in a plain string literal is an invalid escape sequence.
    fileRes = re.sub(r"_[a-z]+\.xml$", "", fileMapped)
    parts = fileRes.split("_")
    if len(parts) != 2:
        raise ValueError(
            "cannot derive a file pair from mapping file name %r" % (fileMapped,))
    fileS, fileD = parts
    return fileS + ".xml", fileD + ".xml"
def printMapping (xmlMappedList,fileOutput) :
    """Append the "Files Mapped" section to the output file.

    Writes a header followed by one tab-separated line per mapping file,
    holding the two file names returned by ``getMapping``.

    Args:
        xmlMappedList: Names of the mapping files.
        fileOutput: Path of the UTF-8 output file; it is opened in append mode.
    """
    # Resolve every pair before opening the file, so a malformed name raises
    # before anything has been written.
    xmlMappingList = [getMapping(f) for f in xmlMappedList]
    # The with-block closes the file even when a write fails.
    with codecs.open(fileOutput, "a", "utf-8") as fo:
        fo.write("\nFiles Mapped:\n")
        fo.write("-----------------------------------------------------\n")
        for fileS, fileD in xmlMappingList:
            fo.write(fileS + "\t" + fileD + "\n")
def printDownloaded (xmlParsedList,fileOutput,directory) :
    """Append the "Files Downloaded" section to the output file.

    Writes a header followed by one tab-separated line per XML file: the file
    name, then the title and the URL returned by ``getInfo``.

    Args:
        xmlParsedList: Names of the XML files to report on.
        fileOutput: Path of the UTF-8 output file; it is opened in append mode.
        directory: Directory that contains the files of *xmlParsedList*.
    """
    # The with-block closes the file even if getInfo raises part-way through.
    with codecs.open(fileOutput, "a", "utf-8") as fo:
        fo.write("Files Downloaded:\n")
        fo.write("-----------------------------------------------------\n")
        for xmlFile in xmlParsedList:
            title, url = getInfo(join(directory, xmlFile))
            fo.write(xmlFile + "\t" + title + "\t" + url + "\n")
def main():
    """Command-line entry point.

    Expects two arguments: the directory holding the XML files and the path
    of the output file. The output file is emptied first, then the
    "Files Downloaded" and "Files Mapped" sections are appended to it.

    Exits with a usage message when fewer than two arguments are given.
    """
    if len(sys.argv) < 3:
        sys.exit("Usage: %s <xml directory> <output file>" % sys.argv[0])
    # print(...) with a single string behaves the same as a Python 2 print
    # statement and as the Python 3 print function.
    print("Read the xml files")
    directory = sys.argv[1]
    fileOutput = sys.argv[2]
    xmlParsedList, xmlMappedList = getXMLFiles(directory)
    print("Get the Title and the web address of the crawled pages by ILFSP Crawler.")
    # Truncate the output file; the two helpers below reopen it in append mode.
    with codecs.open(fileOutput, "w", "utf-8"):
        pass
    printDownloaded(xmlParsedList, fileOutput, directory)
    printMapping(xmlMappedList, fileOutput)
    print("Done")


if __name__ == '__main__':
    main()