-
Notifications
You must be signed in to change notification settings - Fork 0
/
ParseMETS.hs
283 lines (239 loc) · 12.6 KB
/
ParseMETS.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
{-# LANGUAGE Arrows #-}
{-# LANGUAGE OverloadedStrings #-}
module ParseMETS
(getMETSDocument, documentTagSoupXML, documentTagSoupXML3, documentVertXML, documentVertXML_T, documentStanfordVert, documentPlaintext, documentMetadata, docTokens, stripPages, emptydoc, Document(Document), pages, docId, title, issueDate, dType, language, label) where
import Data.String.Utils
import Morphology
import ParseALTO
import Debug.Trace
import qualified Data.ByteString as B
import qualified Data.ByteString.UTF8 as BC
import qualified Data.ByteString.Lazy as BSL
import qualified Data.Text as T
import qualified Data.Text.IO as TIO
import System.IO
import System.FilePath
import Data.List
import Data.Char
import Data.Monoid
import Data.Time.Clock
import Text.XML.Expat.SAX as SAX
-- Sample Latvian sentence (old orthography, OCR-era spellings) used by the
-- tagger harness 'testgtag' below.
test1 = "Rīgas LattvMu Beedribas Derīgu Grahmatu Nodaļas isdelvums. cirvis"
-- Entry point: currently wired to the plain-text tokenizer harness.
main = tokenizeplaintext
-- | Ad-hoc check of the analyzer pipe: print the raw tokens for 'test1',
-- a separator line, and then the 'gTag'-ged tokens.
testgtag = do
    (hin, hout, _errH, procId) <- pipeInit
    let rawTokens = pipeAnalyze hin hout test1
    mapM_ (putStrLn . formatVertToken) rawTokens
    putStrLn "------------------------"
    mapM_ (putStrLn . formatVertToken) (gTag test1 rawTokens)
    pipeClose hin procId
    return ()
-- | Tokenize a plain-text file line by line through the analyzer pipe and
-- write the whitespace-joined token words back out to a second file.
tokenizeplaintext = do
    (hin, hout, _errH, procId) <- pipeInit
    raw <- readFile "/Users/pet/Documents/Paraugdokumenti/Pumpurs.txt"
    let tokenized =
            [ unwords (map word (pipeAnalyze hin hout (strip line)))
            | line <- lines raw ]
    writeFile "/Users/pet/Documents/Paraugdokumenti/Pumpurs tokeni.txt" (unlines tokenized)
    pipeClose hin procId
    return ()
-- Sample input used by the test harnesses in this module.
samplefolder = "/Users/pet/Documents/LNBall/01-01-01/LNB-Grammata 5/g_001_0300031823/"
metsFile = "0300031823_mets.xml"
altoFile = "0300031823_005_alto.xml"
-- Prefix stripped from absolute paths to build document ids relative to
-- the corpus root (see 'getMETSDocument').
folderPrefix = "/Users/pet/Documents/LNBall/"
-- | Parse one sample ALTO page and dump its tag-soup XML rendering.
testALTO = do
    altoPage <- getALTOPage (samplefolder ++ altoFile)
    putStr (pageTagSoupXML cheapAnalyze altoPage)
    return ()
-- | A parsed METS document: identification and metadata strings pulled from
-- the METS/MODS header plus the ALTO pages it references.
-- NOTE(review): several functions below destructure this record positionally
-- (e.g. 'documentMetadata'), so the field order must not change.
data Document = Document {docPath, docId, issueDate, title, label, dType, pagecount, language, textcode :: String, pages :: [Page], pagenames :: [FilePath]} --TODO - possible other metainformation could be extracted from METS
    deriving (Show, Eq)
-- | Neutral starting value that the SAX parser chain accumulates into.
emptydoc = Document {docPath="", docId = "", issueDate = "", title = "", label = "", dType = "", pagecount = "0", language = "", textcode = "", pages = [], pagenames = []}
firstelem s = if (s == []) then "" else (head s)
-- | End-to-end timing harness: parse the sample METS document, render it as
-- vert-XML through the analyzer pipe, and print the wall-clock duration.
-- The commented lines are alternative output formats kept for experiments.
testMETS = do
    start <- getCurrentTime
    (inp,out,err,pid) <- pipeInit
    d <- getMETSDocument samplefolder metsFile
    --B.putStr $ toByteString $ documentTagSoupXML2 d
    putStr $ documentVertXML (pipeAnalyze inp out) d
    --TIO.putStr $ documentTagSoupXML3 d
    stop <- getCurrentTime
    putStrLn $ show $ diffUTCTime stop start
    pipeClose inp pid
    return ()
-- | Decode an expat token (UTF-8 bytes) into a Haskell 'String'.
unpack :: XMLToken -> String
unpack = BC.toString
-- | One stage of the continuation-style SAX scan: consumes the remaining
-- event stream and yields the accumulated 'Document'.
type DocParser = [SAXEvent XMLToken XMLToken] -> Document
--type ValueParser = [SAXEvent XMLToken XMLToken] -> XMLToken
-- | Read and SAX-parse a METS file, load every referenced ALTO page, and
-- derive the document id from its path relative to 'folderPrefix'.
getMETSDocument :: String -> String -> IO Document
getMETSDocument folder filename = do
    raw <- B.readFile (folder ++ filename)
    let events = SAX.parse defaultParseOptions (BSL.fromChunks [raw])
    parsed <- processPages (metsStart folder events)
    -- Base name without extension(s): everything up to the first dot.
    let basePath = folder ++ takeWhile (/= '.') filename
    return (postProcess (setId parsed (replace folderPrefix "" basePath)))
-- | Load every ALTO page file named in the document's 'pagenames'.
processPages :: Document -> IO Document
processPages doc = do
    loaded <- mapM getALTOPage (pagenames doc)
    return doc{pages = loaded}
    --return doc{pages = []} -- for metadata generation only
-- | Final clean-up after parsing: pad a partial issue date (e.g. "1905")
-- with trailing zeros to exactly 8 characters (YYYYMMDD shape) and
-- XML-escape the free-text fields.
--
-- NOTE(review): the original code set @label = xmlEscape $ title doc@,
-- discarding the LABEL attribute captured by 'getHeading' — an apparent
-- copy-paste bug; the label field now escapes the document's own label.
postProcess :: Document -> Document
postProcess doc =
    -- take 8 of (date ++ zeros) both truncates over-long dates and pads
    -- short ones, matching the original two-step truncate-then-pad logic.
    let paddedDate = take 8 (issueDate doc ++ repeat '0')
    in doc { issueDate = paddedDate
           , title = xmlEscape (title doc)
           , label = xmlEscape (label doc)
           }
-- | Drop the (potentially large) page bodies, keeping only the metadata.
stripPages :: Document -> Document
stripPages d = d{pages = []}
-- | Scan for the root @\<mets:mets\>@ element; its attributes carry the
-- document LABEL and TYPE, after which metadata parsing begins.
metsStart :: String -> DocParser
metsStart folder events = case events of
    StartElement "mets:mets" attrs : rest -> getHeading attrs (metadataStart folder rest)
    _ : rest -> metsStart folder rest
    [] -> error "could not find <mets:mets>"
-- | Copy the LABEL and TYPE attributes of @\<mets:mets\>@ into the document.
getHeading :: [(XMLToken, XMLToken)] -> Document -> Document
getHeading attrs doc =
    doc { label = getAttribute "LABEL" attrs
        , dType = getAttribute "TYPE" attrs
        }
-- | Find the @\<mets:dmdSec\>@ that holds issue/book metadata (checked via
-- 'metadataValid'); any other dmdSec sections are skipped.
metadataStart :: String -> DocParser
metadataStart folder (StartElement "mets:dmdSec" attrs : rest)
    | metadataValid attrs = getMetadata folder rest
    | otherwise = metadataStart folder rest
metadataStart folder (_ : rest) = metadataStart folder rest
metadataStart _ [] = error "could not find <mets:dmdSec>"
--TODO - in principle, threading the document along like this is really an
-- Arrow; worth reading up on and understanding properly.
-- | Walk the MODS metadata section, dispatching each recognised element to
-- its dedicated reader; at @\</mets:dmdSec\>@ hand over to the page file
-- list parser.
getMetadata :: String -> DocParser
getMetadata folder events = case events of
    StartElement "mods:title" _ : rest -> getTitle continue rest
    StartElement "mods:dateIssued" _ : rest -> getDateIssued continue rest
    StartElement "mods:detail" attrs : rest
        | getAttribute "type" attrs == "pages" -> getPageCount continue rest
        | otherwise -> continue rest
    StartElement "mods:languageTerm" attrs : rest
        | getAttribute "type" attrs == "code" -> getLanguage continue rest
        | getAttribute "type" attrs == "text" -> getTextCode continue rest
        | otherwise -> continue rest
    EndElement "mets:dmdSec" : rest -> getPageFiles folder rest --FIXME - assumes the page description follows the metadata, which may not hold
    _ : rest -> continue rest
    [] -> error "could not find end of <mets:dmdSec>"
  where
    continue = getMetadata folder
--FIXME - lots of duplication among the reader functions below; could be
-- factored out somehow.
-- | Accumulate character data into 'title' until @\</mods:title\>@, then
-- return control to the continuation.
getTitle :: DocParser -> DocParser
getTitle cont events = case events of
    CharacterData txt : rest -> (getTitle cont rest){title = unpack txt}
    EndElement "mods:title" : rest -> cont rest
    _ : rest -> getTitle cont rest
    [] -> error "problem with title tag"
-- | Accumulate character data into 'issueDate' until
-- @\</mods:dateIssued\>@, then return control to the continuation.
getDateIssued :: DocParser -> DocParser
getDateIssued cont events = case events of
    CharacterData txt : rest -> (getDateIssued cont rest){issueDate = unpack txt}
    EndElement "mods:dateIssued" : rest -> cont rest
    _ : rest -> getDateIssued cont rest
    [] -> error "problem with dateIssued tag"
-- | Inside @\<mods:detail type="pages"\>@: read the page count out of the
-- nested @\<mods:partNumber\>@, finishing at @\</mods:detail\>@.
getPageCount :: DocParser -> DocParser
getPageCount cont events = case events of
    StartElement "mods:partNumber" _ : rest -> getPageText cont rest
    EndElement "mods:detail" : rest -> cont rest
    _ : rest -> getPageCount cont rest
    [] -> error "problem with detail/partnumber tag"
-- | Accumulate character data into 'pagecount' until
-- @\</mods:partNumber\>@, then return control to the continuation.
getPageText :: DocParser -> DocParser
getPageText cont events = case events of
    CharacterData txt : rest -> (getPageText cont rest){pagecount = unpack txt}
    EndElement "mods:partNumber" : rest -> cont rest
    _ : rest -> getPageText cont rest
    [] -> error "page number not found"
-- | Accumulate character data into 'language' (the code form) until
-- @\</mods:languageTerm\>@, then return control to the continuation.
getLanguage :: DocParser -> DocParser
getLanguage cont events = case events of
    CharacterData txt : rest -> (getLanguage cont rest){language = unpack txt}
    EndElement "mods:languageTerm" : rest -> cont rest
    _ : rest -> getLanguage cont rest
    [] -> error "problem with language tag"
-- | Accumulate character data into 'textcode' (the text form of the
-- language term) until @\</mods:languageTerm\>@, then return control to
-- the continuation.
getTextCode :: DocParser -> DocParser
getTextCode cont ((CharacterData txt):xs ) = (getTextCode cont xs){textcode = unpack txt}
getTextCode cont ((EndElement "mods:languageTerm"):xs ) = cont xs
getTextCode cont (_:xs) = getTextCode cont xs
-- Fixed diagnostic: the original said "problem with language tag", a
-- copy-paste from 'getLanguage' that made failures here indistinguishable.
getTextCode _ [] = error "problem with languageTerm text tag"
-- | A dmdSec is relevant when its ID marks it as issue or book metadata.
metadataValid :: [(XMLToken, XMLToken)] -> Bool
metadataValid attrs = getAttribute "ID" attrs `elem` ["modsissue", "modsbook"]
-- | Locate the @\<mets:fileGrp ID="ALTOGRP"\>@ group that lists the ALTO
-- page files; any other file groups are skipped.
getPageFiles :: String -> DocParser
getPageFiles folder events = case events of
    StartElement "mets:fileGrp" attrs : rest
        | getAttribute "ID" attrs == "ALTOGRP" -> getPage folder rest
        | otherwise -> getPageFiles folder rest
    _ : rest -> getPageFiles folder rest
    [] -> error "could not find <mets:fileGrp ID=\"ALTOGRP\">"
-- | Collect ALTO file paths from @\<mets:FLocat\>@ elements until the file
-- group closes; paths accumulate front-to-back through the recursion onto
-- the 'emptydoc' produced at the end of the group.
getPage :: String -> DocParser
getPage folder events = case events of
    StartElement "mets:FLocat" attrs : rest ->
        let href = getAttribute "xlink:href" attrs
            -- drop 7 strips the expected "file://" scheme prefix.
            path = folder ++ drop 7 href
            doc = getPage folder rest
        in doc{ pagenames = path : pagenames doc }
    EndElement "mets:fileGrp" : _ -> emptydoc -- TODO - assumes nothing important follows the page data
    _ : rest -> getPage folder rest
    [] -> error "could not find end of <mets:fileGrp ID=\"ALTOGRP\">"
-- | Record the document's relative path and derive its id from the name of
-- the containing folder.
setId :: Document -> String -> Document
setId doc relPath = doc { docPath = relPath, docId = getFolderName relPath }
-- | The second-to-last path component (the document's own folder), or ""
-- when the path has fewer than two components.
getFolderName :: String -> String
getFolderName docpath
    | length parts >= 2 = last (init parts)
    | otherwise = ""
  where
    parts = split [pathSeparator] docpath
-- | Print a line, flush stdout immediately (so it shows up even mid-pipe),
-- and pass the text through unchanged.
debugtext :: String -> IO String
debugtext text =
    putStrLn text >> hFlush stdout >> return text
-- | One tab-separated metadata record for the document (no page bodies).
documentMetadata :: Document -> String
documentMetadata (Document docPath docId issueDate title label dType pagecount language textcode _pages pagenames) =
    intercalate "\t"
        [ docId, docPath, issueDate, title, label, dType
        , pagecount, show (length pagenames), language, textcode ]
-- | Render the document as vert-XML: a @\<doc\>@ element whose attributes
-- carry the metadata, followed by one vert block per non-empty page.
-- Latvian ("lav") documents use the supplied analyzer; everything else
-- falls back to 'nonLVAnalyze'.
documentVertXML :: (String -> [Token]) -> Document -> String
documentVertXML analyze (Document docPath docId issueDate title label dType pagecount language textcode pages pagenames) =
    concat
        [ "<doc id=\"", docId, "\" "
        , "path=\"", docPath, "\" "
        , "date=\"", issueDate, "\" "
        , "title=\"", title, "\" "
        , "label_original=\"", label, "\" "
        , "label=\"", map toLower label, "\" "
        , "type=\"", dType, "\" "
        , "pages=\"", pagecount, "\" "
        , "realpages=\"", show (length pagenames), "\" "
        , "language=\"", language, "\" "
        , "textcode=\"", textcode, "\">\n"
        , unlines (filter (not . null) (map (pageVertXML pageAnalyze) pages))
        , "</doc>\n"
        ]
  where
    pageAnalyze = if language == "lav" then analyze else nonLVAnalyze
-- | 'Data.Text' variant of 'documentVertXML' using the Token_T pipeline.
documentVertXML_T :: (String -> [Token_T]) -> Document -> T.Text
documentVertXML_T analyze (Document docPath docId issueDate title label dType pagecount language textcode pages pagenames) =
    T.concat [T.pack header, body, T.pack "</doc>\n"]
  where
    header = concat
        [ "<doc id=\"", docId, "\" "
        , "path=\"", docPath, "\" "
        , "date=\"", issueDate, "\" "
        , "title=\"", title, "\" "
        , "label_original=\"", label, "\" "
        , "label=\"", map toLower label, "\" "
        , "type=\"", dType, "\" "
        , "pages=\"", pagecount, "\" "
        , "realpages=\"", show (length pagenames), "\" "
        , "language=\"", language, "\" "
        , "textcode=\"", textcode, "\">\n"
        ]
    body = T.unlines (filter (not . T.null) (map (pageVertXML_T pageAnalyze) pages))
    pageAnalyze = if language == "lav" then analyze else nonLVAnalyze_T
-- | Stanford-style vert output: just the non-empty page bodies, with no
-- surrounding @\<doc\>@ wrapper.
documentStanfordVert :: (String -> [Token_T]) -> Document -> T.Text
documentStanfordVert analyze (Document _ _ _ _ _ _ _ language _ pages _) =
    T.unlines (filter (not . T.null) (map (pageStanfordVert pageAnalyze) pages))
  where
    pageAnalyze = if language == "lav" then analyze else nonLVAnalyze_T
-- | Tag-soup XML rendering: metadata attributes on @\<doc\>@ followed by
-- every page (no empty-page filtering, no language-based analyzer switch).
documentTagSoupXML :: (String -> [Token]) -> Document -> String
documentTagSoupXML analyze (Document docPath docId issueDate title label dType pagecount language textcode pages pagenames) =
    concat
        [ "<doc id=\"", docId, "\" "
        , "path=\"", docPath, "\" "
        , "date=\"", issueDate, "\" "
        , "title=\"", title, "\" "
        , "label=\"", label, "\" "
        , "type=\"", dType, "\" "
        , "pages=\"", pagecount, "\" "
        , "realpages=\"", show (length pagenames), "\" "
        , "language=\"", language, "\" "
        , "textcode=\"", textcode, "\">\n"
        , unlines (map (pageTagSoupXML analyze) pages)
        , "</doc>\n"
        ]
-- | 'Data.Text' variant of 'documentTagSoupXML'. Note: realpages here
-- counts loaded pages rather than page file names.
documentTagSoupXML3 :: (String -> [Token]) -> Document -> T.Text
documentTagSoupXML3 analyze (Document docPath docId issueDate title label dType pagecount language textcode pages _) =
    T.concat
        [ T.pack (concat
            [ "<doc id=\"", docId, "\" "
            , "path=\"", docPath, "\" "
            , "date=\"", issueDate, "\" "
            , "title=\"", title, "\" "
            , "label=\"", label, "\" "
            , "type=\"", dType, "\" "
            , "pages=\"", pagecount, "\" "
            , "realpages=\"", show (length pages), "\" "
            , "language=\"", language, "\" "
            , "textcode=\"", textcode, "\">\n"
            ])
        , T.unlines (map (T.pack . pageTagSoupXML analyze) pages)
        , T.pack "</doc>\n"
        ]
-- | Plain-text rendering: one line per page, no markup at all.
documentPlaintext :: (String -> [Token]) -> Document -> String
documentPlaintext analyze doc = unlines (map (pagePlaintext analyze) (pages doc))
-- | Total whitespace-separated word count of the document.
docTokens :: Document -> Int
docTokens = sum . map pageTokens . pages
-- | Word count of a single page.
pageTokens :: Page -> Int
pageTokens (Page _ blocks) = sum (map blockTokens blocks)
-- | Word count of a single text block.
blockTokens :: TextBlock -> Int
blockTokens (TextBlock _ txt) = length (words txt)