-
Notifications
You must be signed in to change notification settings - Fork 47
/
parse.go
316 lines (271 loc) · 8.85 KB
/
parse.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
package docx
import (
"container/list"
"encoding/xml"
"errors"
"fmt"
"io"
"log"
"regexp"
)
// Local element names the parser looks for. They are compared against xml.Name.Local
// while tokenizing, which is why they do not carry the "w:" prefix seen in the raw XML.
const (
// RunElementName is the local name of the XML tag for runs (<w:r>, </w:r> and <w:r/>).
RunElementName = "r"
// TextElementName is the local name of the XML tag for text-runs (<w:t> and </w:t>).
TextElementName = "t"
)
// Regular expressions used by ValidatePositions to check that the byte offsets recorded
// by the parser really delimit the expected tags, plus the error reported when they do not.
var (
// RunOpenTagRegex matches all open tags for runs, including any attributes that may be set.
// NOTE(review): the pattern only requires "<w:r" followed somewhere by '>', so it also
// accepts tags that merely start with "<w:r" (e.g. <w:rPr>) - confirm this is acceptable.
RunOpenTagRegex = regexp.MustCompile(`(<w:r).*>`)
// RunCloseTagRegex matches the close tag of runs.
RunCloseTagRegex = regexp.MustCompile(`(</w:r>)`)
// RunSingletonTagRegex matches a singleton run tag without attributes (<w:r/>).
RunSingletonTagRegex = regexp.MustCompile(`(<w:r/>)`)
// TextOpenTagRegex matches all open tags for text-runs, including any attributes that may be set.
// NOTE(review): like RunOpenTagRegex, this accepts any tag that starts with "<w:t".
TextOpenTagRegex = regexp.MustCompile(`(<w:t).*>`)
// TextCloseTagRegex matches the close tag of text-runs.
TextCloseTagRegex = regexp.MustCompile(`(</w:t>)`)
// ErrTagsInvalid is returned if the parsing failed and the result cannot be used.
// Typically this means that one or more tag-offsets were not parsed correctly which
// would cause the document to become corrupted as soon as replacing starts.
ErrTagsInvalid = errors.New("one or more tags are invalid and will cause the XML to be corrupt")
)
// RunParser can parse a list of Runs from a given byte slice.
// Create it with NewRunParser, call Execute and then read the result via Runs.
type RunParser struct {
// doc holds the raw bytes of the document being parsed; all recorded positions are offsets into it.
doc []byte
// runs collects every run whose open and close tag have been located.
runs DocumentRuns
// runStack keeps the enclosing, not yet closed runs while a nested run is being processed.
runStack list.List
}
// NewRunParser creates a RunParser for the given document bytes.
// The parser starts with an empty run list; nothing is parsed until Execute is called.
func NewRunParser(doc []byte) *RunParser {
	parser := new(RunParser)
	parser.doc = doc
	parser.runs = DocumentRuns{}
	return parser
}
// Execute runs the parser over the document in two passes.
// The first pass locates and records all <w:r> tags, the second pass locates the
// <w:t> tags inside those runs. The first failing pass aborts the execution and its
// error is returned as is. Finally the recorded positions are checked with ValidatePositions.
func (parser *RunParser) Execute() error {
	// the order matters: text tags can only be assigned once the runs are known
	passes := []func() error{
		parser.findRuns,
		parser.findTextRuns,
	}
	for _, pass := range passes {
		if err := pass(); err != nil {
			return err
		}
	}
	return ValidatePositions(parser.doc, parser.runs)
}
// Runs returns all runs found by the parser.
// The result is only populated after Execute has been called.
func (parser *RunParser) Runs() DocumentRuns {
return parser.runs
}
// findRuns searches through the document and records the tag positions of all runs found.
// The text tags are not analyzed at this point, that's the next step (findTextRuns).
//
// Every run is appended to parser.runs as soon as its end element is seen, so a nested
// run is stored before the run enclosing it.
//
// Any decoder error other than a regular io.EOF is returned, wrapped so that callers can
// inspect it with errors.Is / errors.As. ErrTagsInvalid is returned if the nesting counter
// is not back at zero after all tokens have been consumed.
func (parser *RunParser) findRuns() error {
	// use a custom reader which saves the current byte position
	docReader := NewReader(string(parser.doc))
	decoder := xml.NewDecoder(docReader)

	tmpRun := NewEmptyRun()
	singleton := false

	// nestCount holds the nesting-level. It is incremented on every run OpenTag and
	// decremented on every run CloseTag.
	nestCount := 0

	// popRun removes the most recently pushed Run from the runStack and returns it.
	// It must only be called while the stack is not empty.
	popRun := func() *Run {
		last := parser.runStack.Back()
		parser.runStack.Remove(last)
		return last.Value.(*Run)
	}

	// nextIteration resets the temporary values once a run has been fully analyzed
	// (OpenTag and CloseTag were found). If the finished run was nested, the enclosing
	// run is restored from the runStack, otherwise a new empty Run is started.
	nextIteration := func() {
		nestCount--
		if nestCount > 0 {
			tmpRun = popRun()
		} else {
			tmpRun = NewEmptyRun()
		}
		singleton = false
	}

	for {
		tok, err := decoder.Token()
		// The error has to be inspected before the token: on failure the decoder returns
		// a nil token, so testing the token first would silently drop syntax errors.
		if err == io.EOF {
			break
		}
		if err != nil {
			return fmt.Errorf("error getting token: %w", err)
		}
		if tok == nil {
			break
		}

		switch elem := tok.(type) {
		case xml.StartElement:
			if elem.Name.Local != RunElementName {
				continue
			}

			nestCount++
			if nestCount > 1 {
				// a run inside a run: park the outer one until the inner one is closed
				parser.runStack.PushBack(tmpRun)
				tmpRun = NewEmptyRun()
			}

			// tagEndPos is the reader position after the tag was consumed; it is used as the
			// exclusive end offset of the tag. tagStartPos points to '<' of the tag.
			tagEndPos := docReader.Pos()
			tagStartPos := parser.findOpenBracketPos(tagEndPos - 1)
			tmpRun.OpenTag = Position{
				Start: tagStartPos,
				End:   tagEndPos,
			}

			// special case, a singleton tag: <w:r/> is also reported as a start element.
			// Since there is no real end tag, the run is marked so that the EndElement
			// case can handle it appropriately.
			if RunSingletonTagRegex.MatchString(string(parser.doc[tagStartPos:tagEndPos])) {
				singleton = true
			}

		case xml.EndElement:
			if elem.Name.Local != RunElementName {
				continue
			}

			if singleton {
				// the singleton was already located by the StartElement case,
				// its CloseTag is the very same tag as its OpenTag
				tmpRun.CloseTag = tmpRun.OpenTag
			} else {
				// same offset handling as for the open tag
				tagEndPos := docReader.Pos()
				tagStartPos := parser.findOpenBracketPos(tagEndPos - 1)
				tmpRun.CloseTag = Position{
					Start: tagStartPos,
					End:   tagEndPos,
				}
			}

			// the run is finished
			parser.runs = append(parser.runs, tmpRun)
			nextIteration()
		}
	}

	if nestCount != 0 {
		log.Printf("invalid nestCount, should be 0 but is %d\n", nestCount)
		return ErrTagsInvalid
	}

	return nil
}
// findTextRuns scans the document a second time and records the positions of the text
// tags (<w:t> and </w:t>) on the run they are located in.
//
// It relies on parser.runs being populated by findRuns beforehand. A run which contains
// a text open tag gets HasText set.
//
// Any decoder error other than a regular io.EOF is returned, wrapped so that callers can
// inspect it with errors.Is / errors.As. An error is also returned if a text tag is found
// which is not located inside any known run.
func (parser *RunParser) findTextRuns() error {
	// use a custom reader which saves the current byte position
	docReader := NewReader(string(parser.doc))
	decoder := xml.NewDecoder(docReader)

	// inRun returns the first run in parser.runs whose span (start of the open tag to end of
	// the close tag) strictly contains pos, or nil if there is none.
	inRun := func(pos int64) *Run {
		for _, run := range parser.runs {
			if run.OpenTag.Start < pos && pos < run.CloseTag.End {
				return run
			}
		}
		return nil
	}

	for {
		tok, err := decoder.Token()
		// The error has to be inspected before the token: on failure the decoder returns
		// a nil token, so testing the token first would silently drop syntax errors.
		if err == io.EOF {
			break
		}
		if err != nil {
			return fmt.Errorf("error getting token: %w", err)
		}
		if tok == nil {
			break
		}

		switch elem := tok.(type) {
		case xml.StartElement:
			if elem.Name.Local != TextElementName {
				continue
			}

			// tagEndPos is the reader position after the tag was consumed; it is used as the
			// exclusive end offset of the tag. tagStartPos points to '<' of the tag.
			tagEndPos := docReader.Pos()
			tagStartPos := parser.findOpenBracketPos(tagEndPos - 1)

			currentRun := inRun(tagEndPos)
			if currentRun == nil {
				return fmt.Errorf("unable to find currentRun for text start-element")
			}
			currentRun.HasText = true
			currentRun.Text.OpenTag = Position{
				Start: tagStartPos,
				End:   tagEndPos,
			}

		case xml.EndElement:
			if elem.Name.Local != TextElementName {
				continue
			}

			// same offset handling as for the open tag
			tagEndPos := docReader.Pos()
			tagStartPos := parser.findOpenBracketPos(tagEndPos - 1)

			currentRun := inRun(tagEndPos)
			if currentRun == nil {
				return fmt.Errorf("unable to find currentRun for text end-element")
			}
			currentRun.Text.CloseTag = Position{
				Start: tagStartPos,
				End:   tagEndPos,
			}
		}
	}

	return nil
}
// findOpenBracketPos searches backwards for the '<' matching a close bracket ('>'), given its position.
//
// The search starts at endBracketPos (inclusive) and walks towards the beginning of the
// document. A start position beyond the last byte is clamped to the last byte.
// It returns the offset of the nearest '<', or 0 if there is none (including an empty document).
func (parser *RunParser) findOpenBracketPos(endBracketPos int64) int64 {
	// clamp so that a position at or behind the end of the document cannot index out of range
	if last := int64(len(parser.doc)) - 1; endBracketPos > last {
		endBracketPos = last
	}

	// the loop is bounded by the start of the document, a missing '<' must not run past index 0
	for i := endBracketPos; i >= 0; i-- {
		if parser.doc[i] == '<' {
			return i
		}
	}
	return 0
}
// ValidatePositions checks, for every run, that the recorded tag positions really point at
// the expected tags inside document by matching them against the respective regex.
// For runs with HasText set, the positions of the text tags are checked as well.
//
// Every mismatch is logged together with the affected run. If at least one position did
// not match, ErrTagsInvalid is returned since replacing would work on wrong offsets.
func ValidatePositions(document []byte, runs []*Run) error {
	allValid := true

	// expect logs the failure message along with the run and remembers that the
	// validation failed, unless matched is true.
	expect := func(matched bool, failure string, run *Run) {
		if matched {
			return
		}
		log.Println(failure, run.String(document))
		allValid = false
	}

	for _, run := range runs {
		// a singleton run (<w:r/>) has neither a separate close tag nor text, skip it
		if run.OpenTag.Match(RunSingletonTagRegex, document) {
			continue
		}

		expect(run.OpenTag.Match(RunOpenTagRegex, document), "RunOpenTagRegex failed to match", run)
		expect(run.CloseTag.Match(RunCloseTagRegex, document), "RunCloseTagRegex failed to match", run)

		if !run.HasText {
			continue
		}
		expect(run.Text.OpenTag.Match(TextOpenTagRegex, document), "TextOpenTagRegex failed to match", run)
		expect(run.Text.CloseTag.Match(TextCloseTagRegex, document), "TextCloseTagRegex failed to match", run)
	}

	if !allValid {
		return ErrTagsInvalid
	}
	return nil
}
// Position is a generic position of a tag, represented by byte offsets.
// Start is the offset of the first byte of the tag, End is the offset one past its
// last byte, so that data[Start:End] yields the whole tag.
type Position struct {
	Start int64
	End   int64
}

// Match reports whether the bytes of data delimited by the position match the given regex.
//
// A position which cannot be applied to data (negative Start, Start > End or End beyond
// len(data)) never matches; false is returned instead of panicking, so that validation can
// report broken offsets as a regular failure.
//
// Note that the parameter name shadows the regexp package inside this method.
func (p Position) Match(regexp *regexp.Regexp, data []byte) bool {
	if p.Start < 0 || !p.Valid() || p.End > int64(len(data)) {
		return false
	}
	// match on the byte slice directly, no string copy is needed
	return regexp.Match(data[p.Start:p.End])
}

// Valid returns true if Start <= End.
// Only then the position can be used, otherwise there will be a 'slice out of bounds' along the way.
func (p Position) Valid() bool {
	return p.Start <= p.End
}