-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathTextExtractor.cpp
700 lines (603 loc) · 24.4 KB
/
TextExtractor.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
// TextExtractor.cpp : Implementation of CTextExtractor
#define STRICT
#ifndef _WIN32_WINNT
#define _WIN32_WINNT 0x0400
#endif
#define _ATL_APARTMENT_THREADED
#include <atlbase.h>
//You may derive a class from CComModule and use it if you want to override
//something, but do not change the name of _Module
extern CComModule _Module;
#include <atlcom.h>
#include <string>
#include <sstream>
#include "dispimpl2.h"
#include "ExtractText.h"
#include "TextExtractor.h"
#include "Filter.h"
#include "FiltErr.h"
#include "NTQuery.h"
/////////////////////////////////////////////////////////////////////////////
// CTextExtractor
// ISupportErrorInfo implementation.
//
// riid: interface identifier the caller is asking about.
//
// Returns S_OK when riid matches one of the interfaces listed in the local
// table (currently only IID_ITextExtractor), S_FALSE otherwise.
STDMETHODIMP CTextExtractor::InterfaceSupportsErrorInfo(REFIID riid)
{
    static const IID* supported[] =
    {
        &IID_ITextExtractor
    };
    const IID* const* const end = supported + sizeof(supported) / sizeof(supported[0]);

    // Walk the table and stop at the first interface that matches.
    for (const IID* const* entry = supported; entry != end; ++entry)
    {
        if (InlineIsEqualGUID(**entry, riid))
            return S_OK;
    }
    return S_FALSE;
}
// Replaces a character that is not acceptable in the extracted text with a
// blank.
//
// Per W3C spec http://www.w3.org/TR/REC-xml#charsets the valid characters are
// #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF].
// This helper is deliberately stricter than the spec: surrogates, the private
// use area (U+E000-U+F8FF) and everything above U+FFFD are blanked as well.
//
// ch: character to validate; overwritten with L' ' when it is rejected,
//     left untouched otherwise.
inline static void ValidUnicode(wchar_t & ch)
{
    if (ch < 0x0020) // below ASCII space
    {
        // TAB, LF and CR are the only control characters that survive.
        if (ch != 0x0009 && ch != 0x000A && ch != 0x000D)
            ch = L' '; // morph to blank
    }
    else if (ch > 0xD7FF) // everything in 0x0020-0xD7FF is valid
    {
        // The lower bound used to be ">= 0xF8FF", which let the last
        // private-use code point through even though private use is not
        // supported; the first accepted character is now U+F900.
        if (ch < 0xF900 || ch > 0xFFFD)
            ch = L' '; // morph to blank
    }
}

// Folds "cute" typographic variants of characters to their simplified ASCII
// form (to make later parsing easier) and blanks characters rejected by
// ValidUnicode(). The buffer is modified in place.
//
// chBuf: number of characters in buf to process.
// buf:   character buffer; must have room for chBuf + 1 elements because a
//        terminating null is written at buf[chBuf].
static void CleanUpCharacters(size_t chBuf, wchar_t *buf)
{
    buf[chBuf] = 0; // must be null terminated..
    for (size_t i = 0; i < chBuf; ++i)
    {
        wchar_t & ch = buf[i];
        switch (ch)
        {
        case 0:      // embedded null
        case 0x2000: // en quad
        case 0x2001: // em quad
        case 0x2002: // en space
        case 0x2003: // em space
        case 0x2004: // three-per-em space
        case 0x2005: // four-per-em space
        case 0x2006: // six-per-em space
        case 0x2007: // figure space
        case 0x2008: // punctuation space
        case 0x2009: // thin space
        case 0x200A: // hair space
        case 0x200B: // zero-width space
        case 0x200C: // zero-width non-joiner
        case 0x200D: // zero-width joiner
        case 0x202f: // no-break space
        case 0x3000: // ideographic space
            ch = L' ';
            break;
        case 0x00B6: // pilcrow
        case 0x2028: // line separator
        case 0x2029: // paragraph separator
            ch = L'\n';
            break;
        case 0x00AD: // soft-hyphen
        case 0x00B7: // middle dot
        case 0x2010: // hyphen
        case 0x2011: // non-breaking hyphen
        case 0x2012: // figure dash
        case 0x2013: // en dash
        case 0x2014: // em dash
        case 0x2015: // quote dash
        case 0x2027: // hyphenation point
        case 0x2043: // hyphen bullet
        case 0x208B: // subscript minus
        case 0xFE31: // vertical em dash
        case 0xFE32: // vertical en dash
        case 0xFE58: // small em dash
        case 0xFE63: // small hyphen minus
            ch = L'-';
            break;
        case 0x00B0: // degree
        case 0x2018: // left single quote
        case 0x2019: // right single quote
        case 0x201A: // low right single quote
        case 0x201B: // high left single quote
        case 0x2032: // prime
        case 0x2035: // reversed prime
        case 0x2039: // left-pointing angle quotation mark
        case 0x203A: // right-pointing angle quotation mark
            ch = L'\'';
            break;
        case 0x201C: // left double quote
        case 0x201D: // right double quote
        case 0x201E: // low right double quote
        case 0x201F: // high left double quote
        case 0x2033: // double prime
        case 0x2034: // triple prime
        case 0x2036: // reversed double prime
        case 0x2037: // reversed triple prime
        case 0x00AB: // left-pointing double angle quotation mark
        case 0x00BB: // right-pointing double angle quotation mark
        case 0x3003: // ditto mark
        case 0x301D: // reversed double prime quotation mark
        case 0x301E: // double prime quotation mark
        case 0x301F: // low double prime quotation mark
            ch = L'\"';
            break;
        case 0x00A7: // section-sign
        case 0x2020: // dagger
        case 0x2021: // double-dagger
        case 0x2022: // bullet
        case 0x2023: // triangle bullet
        case 0x203B: // reference mark
        case 0xFE55: // small colon
            ch = L':';
            break;
        case 0x2024: // one dot leader
        case 0x2025: // two dot leader
        case 0x2026: // ellipsis
        case 0x3002: // ideographic full stop
        case 0xFE30: // two dot vertical leader
        case 0xFE52: // small full stop
            ch = L'.';
            break;
        case 0x3001: // ideographic comma
        case 0xFE50: // small comma
        case 0xFE51: // small ideographic comma
            ch = L',';
            break;
        case 0xFE54: // small semicolon
            ch = L';';
            break;
        case 0x00A6: // broken-bar
        case 0x2016: // double vertical line
            ch = L'|';
            break;
        case 0x2017: // double low line
        case 0x203E: // overline
        case 0x203F: // undertie
        case 0x2040: // character tie
        case 0xFE33: // vertical low line
        case 0xFE49: // dashed overline
        case 0xFE4A: // centerline overline
        case 0xFE4D: // dashed low line
        case 0xFE4E: // centerline low line
            ch = L'_';
            break;
        case 0x301C: // wave dash
        case 0x3030: // wavy dash
        case 0xFE34: // vertical wavy low line
        case 0xFE4B: // wavy overline
        case 0xFE4C: // double wavy overline
        case 0xFE4F: // wavy low line
            ch = L'~';
            break;
        case 0x2038: // caret
        case 0x2041: // caret insertion point
            ch = L'^';
            break;
        case 0x2030: // per-mille
        case 0x2031: // per-ten thousand
        case 0xFE6A: // small per-cent
            ch = L'%';
            break;
        case 0xFE6B: // small commercial at
            ch = L'@';
            break;
        case 0x00A9: // copyright
            ch = L'c';
            break;
        case 0x00B5: // micro
            ch = L'u';
            break;
        case 0x00AE: // registered
            ch = L'r';
            break;
        case 0x207A: // superscript plus
        case 0x208A: // subscript plus
        case 0xFE62: // small plus
            ch = L'+';
            break;
        case 0x2044: // fraction slash
            ch = L'/';
            break;
        case 0x2042: // asterism
        case 0xFE61: // small asterisk
            ch = L'*';
            break;
        case 0x208C: // subscript equal
        case 0xFE66: // small equal
            ch = L'=';
            break;
        case 0xFE68: // small reverse solidus
            ch = L'\\';
            break;
        case 0xFE5F: // small number sign
            ch = L'#';
            break;
        case 0xFE60: // small ampersand
            ch = L'&';
            break;
        case 0xFE69: // small dollar sign
            ch = L'$';
            break;
        // The vertical corner brackets live at 0xFE41-0xFE44. They used to be
        // spelled 0xFF41-0xFF44, which are the fullwidth letters a-d.
        case 0x2045: // left square bracket with quill
        case 0x3010: // left black lenticular bracket
        case 0x3016: // left white lenticular bracket
        case 0x301A: // left white square bracket
        case 0xFE3B: // vertical left lenticular bracket
        case 0xFE41: // vertical left corner bracket
        case 0xFE43: // vertical white left corner bracket
            ch = L'[';
            break;
        case 0x2046: // right square bracket with quill
        case 0x3011: // right black lenticular bracket
        case 0x3017: // right white lenticular bracket
        case 0x301B: // right white square bracket
        case 0xFE3C: // vertical right lenticular bracket
        case 0xFE42: // vertical right corner bracket
        case 0xFE44: // vertical white right corner bracket
            ch = L']';
            break;
        case 0x208D: // subscript left parenthesis
        case 0x3014: // left tortoise-shell bracket
        case 0x3018: // left white tortoise-shell bracket
        case 0xFE35: // vertical left parenthesis
        case 0xFE39: // vertical left tortoise-shell bracket
        case 0xFE59: // small left parenthesis
        case 0xFE5D: // small left tortoise-shell bracket
            ch = L'(';
            break;
        case 0x208E: // subscript right parenthesis
        case 0x3015: // right tortoise-shell bracket
        case 0x3019: // right white tortoise-shell bracket
        case 0xFE36: // vertical right parenthesis
        case 0xFE3A: // vertical right tortoise-shell bracket
        case 0xFE5A: // small right parenthesis
        case 0xFE5E: // small right tortoise-shell bracket
            ch = L')';
            break;
        // The vertical angle brackets live at 0xFE3D-0xFE40 and the small
        // less-than/greater-than signs at 0xFE64/0xFE65. They used to be
        // spelled with a 0xFF prefix, which hit fullwidth ASCII punctuation
        // and halfwidth CJK punctuation instead.
        case 0x3008: // left angle bracket
        case 0x300A: // left double angle bracket
        case 0xFE3D: // vertical left double angle bracket
        case 0xFE3F: // vertical left angle bracket
        case 0xFE64: // small less-than
            ch = L'<';
            break;
        case 0x3009: // right angle bracket
        case 0x300B: // right double angle bracket
        case 0xFE3E: // vertical right double angle bracket
        case 0xFE40: // vertical right angle bracket
        case 0xFE65: // small greater-than
            ch = L'>';
            break;
        case 0xFE37: // vertical left curly bracket
        case 0xFE5B: // small left curly bracket
            ch = L'{';
            break;
        case 0xFE38: // vertical right curly bracket
        case 0xFE5C: // small right curly bracket
            ch = L'}';
            break;
        case 0x00A1: // inverted exclamation mark
        case 0x00AC: // not
        case 0x203C: // double exclamation mark
        case 0x203D: // interrobang
        case 0xFE57: // small exclamation mark
            ch = L'!';
            break;
        case 0x00BF: // inverted question mark
        case 0xFE56: // small question mark
            ch = L'?';
            break;
        case 0x00B9: // superscript one
            ch = L'1';
            break;
        case 0x00B2: // superscript two
            ch = L'2';
            break;
        case 0x00B3: // superscript three
            ch = L'3';
            break;
        // For all of these the low nibble of the code point is the digit.
        case 0x2070: // superscript zero
        case 0x2074: // superscript four
        case 0x2075: // superscript five
        case 0x2076: // superscript six
        case 0x2077: // superscript seven
        case 0x2078: // superscript eight
        case 0x2079: // superscript nine
        case 0x2080: // subscript zero
        case 0x2081: // subscript one
        case 0x2082: // subscript two
        case 0x2083: // subscript three
        case 0x2084: // subscript four
        case 0x2085: // subscript five
        case 0x2086: // subscript six
        case 0x2087: // subscript seven
        case 0x2088: // subscript eight
        case 0x2089: // subscript nine
        case 0x3021: // Hangzhou numeral one
        case 0x3022: // Hangzhou numeral two
        case 0x3023: // Hangzhou numeral three
        case 0x3024: // Hangzhou numeral four
        case 0x3025: // Hangzhou numeral five
        case 0x3026: // Hangzhou numeral six
        case 0x3027: // Hangzhou numeral seven
        case 0x3028: // Hangzhou numeral eight
        case 0x3029: // Hangzhou numeral nine
            ch = (ch & 0x000F) + L'0';
            break;
        // ONE is at ZERO location... careful
        case 0x3220: // parenthesized ideograph one
        case 0x3221: // parenthesized ideograph two
        case 0x3222: // parenthesized ideograph three
        case 0x3223: // parenthesized ideograph four
        case 0x3224: // parenthesized ideograph five
        case 0x3225: // parenthesized ideograph six
        case 0x3226: // parenthesized ideograph seven
        case 0x3227: // parenthesized ideograph eight
        case 0x3228: // parenthesized ideograph nine
        case 0x3280: // circled ideograph one
        case 0x3281: // circled ideograph two
        case 0x3282: // circled ideograph three
        case 0x3283: // circled ideograph four
        case 0x3284: // circled ideograph five
        case 0x3285: // circled ideograph six
        case 0x3286: // circled ideograph seven
        case 0x3287: // circled ideograph eight
        case 0x3288: // circled ideograph nine
            ch = (ch & 0x000F) + L'1';
            break;
        case 0x3007: // ideographic number zero
        case 0x24EA: // circled number zero
            ch = L'0';
            break;
        default:
            if (0xFF01 <= ch    // fullwidth exclamation mark
                && ch <= 0xFF5E) // fullwidth tilde
            {
                // The fullwidths line up with the ASCII range '!'..'~', so a
                // constant offset maps one onto the other. The previous
                // expression "ch & 0xFF00 + L'!' - 1" parsed as
                // "ch & 0xFF20" because '+' binds tighter than '&'.
                ch = ch - 0xFF01 + L'!';
            }
            else if (0x2460 <= ch // circled one
                && ch <= 0x2468)  // circled nine
            {
                ch = ch - 0x2460 + L'1';
            }
            else if (0x2474 <= ch // parenthesized one
                && ch <= 0x247C)  // parenthesized nine
            {
                ch = ch - 0x2474 + L'1';
            }
            else if (0x2488 <= ch // one full stop
                && ch <= 0x2490)  // nine full stop
            {
                ch = ch - 0x2488 + L'1';
            }
            else if (0x249C <= ch // parenthesized small a
                && ch <= 0x24B5)  // parenthesized small z
            {
                ch = ch - 0x249C + L'a';
            }
            else if (0x24B6 <= ch // circled capital A
                && ch <= 0x24CF)  // circled capital Z
            {
                ch = ch - 0x24B6 + L'A';
            }
            else if (0x24D0 <= ch // circled small a
                && ch <= 0x24E9)  // circled small z
            {
                ch = ch - 0x24D0 + L'a';
            }
            else if (0x2500 <= ch // box drawing (begin)
                && ch <= 0x257F)  // box drawing (end)
            {
                ch = L'|';
            }
            else if (0x2580 <= ch // block elements (begin)
                && ch <= 0x259F)  // block elements (end)
            {
                ch = L'#';
            }
            else if (0x25A0 <= ch // geometric shapes (begin)
                && ch <= 0x25FF)  // geometric shapes (end)
            {
                ch = L'*';
            }
            else if (0x2600 <= ch // dingbats (begin)
                && ch <= 0x267F)  // dingbats (end)
            {
                ch = L'.';
            }
            else
                ValidUnicode(ch); // validate that it's legit Unicode
            break;
        }
    }
}
// Extracts the text of a file through the IFilter registered for it.
//
// fileName:  path of the file to read; must be a non-NULL, non-empty BSTR.
// maxLength: when non-zero, extraction stops as soon as the accumulated text
//            is longer than this many characters. The check happens after
//            text has been appended, so the result is not trimmed and may
//            exceed maxLength.
// fileText:  receives a newly allocated BSTR with the extracted text. It is
//            set to NULL on entry and stays NULL on failure.
//
// Returns S_OK when all chunks were read, S_FALSE when extraction stopped
// because of maxLength (the text gathered so far is still returned),
// E_POINTER / E_INVALIDARG for bad arguments, E_OUTOFMEMORY when the result
// string cannot be allocated, and otherwise the failing HRESULT reported via
// Error() with a message naming the call that failed.
STDMETHODIMP CTextExtractor::ExtractText(BSTR fileName, long maxLength, BSTR * fileText)
{
    if (NULL == fileName)
        return E_POINTER;
    if (0 == ::SysStringLen(fileName))
        return E_INVALIDARG;
    if (NULL == fileText)
        return E_POINTER;
    *fileText = NULL;
    HRESULT hr = E_UNEXPECTED;
    try
    {
        CComPtr<IUnknown> spIUnk;
        hr = LoadIFilter(fileName, NULL, reinterpret_cast<void**>(&spIUnk));
        if (SUCCEEDED(hr))
        {
            CComQIPtr<IFilter> spIFilter = spIUnk;
            if (spIFilter)
            {
                DWORD dwFlags = 0;
                hr = spIFilter->Init(IFILTER_INIT_CANON_PARAGRAPHS |
                    IFILTER_INIT_CANON_HYPHENS |
                    IFILTER_INIT_CANON_SPACES |
                    IFILTER_INIT_APPLY_INDEX_ATTRIBUTES |
                    IFILTER_INIT_INDEXING_ONLY,
                    0, NULL, &dwFlags);
                AtlTrace(_T("Init() hr=%x, dwFlags=%x\n"), hr, dwFlags);
                if (SUCCEEDED(hr))
                {
                    std::wstringstream wout;
                    STAT_CHUNK statChunk;
                    memset(&statChunk, 0, sizeof(statChunk));
                    bool moreChunks = true;
                    while (moreChunks)
                    {
                        // Stop before fetching another chunk once the limit
                        // has been passed; S_FALSE tells the caller that the
                        // text is incomplete.
                        if (maxLength && wout.tellp() > maxLength)
                        {
                            hr = S_FALSE;
                            break;
                        }
                        hr = spIFilter->GetChunk(&statChunk);
                        AtlTrace(_T("GetChunk() hr=%x, breakType=%d, flags=%x\n"), hr, statChunk.breakType, statChunk.flags);
                        if (SUCCEEDED(hr))
                        {
                            // ignore non-text chunks...
                            if (CHUNK_TEXT != (CHUNK_TEXT & statChunk.flags))
                                continue;
                            // Emit the separator that the chunk's break type
                            // asks for before appending its text.
                            switch (statChunk.breakType)
                            {
                            case CHUNK_NO_BREAK:
                                break;
                            case CHUNK_EOW:
                                wout << L' ';
                                break;
                            case CHUNK_EOS:
                            case CHUNK_EOC:
                            case CHUNK_EOP:
                                wout << L"\r\n";
                                break;
                            }
                            bool moreText = true;
                            while (moreText)
                            {
                                static const int cChunkSize = 4096;
                                // One extra element: CleanUpCharacters writes
                                // a terminator at buf[chBuf].
                                wchar_t buf[cChunkSize + 1];
                                unsigned long chBuf = cChunkSize;
                                memset(buf, 0, sizeof(buf));
                                hr = spIFilter->GetText(&chBuf, buf);
                                AtlTrace(_T("GetText() hr=%x, chBuf=%d\n"), hr, chBuf);
                                if (SUCCEEDED(hr))
                                {
                                    CleanUpCharacters(chBuf, buf);
                                    wout << buf;
                                    if (maxLength && wout.tellp() > maxLength)
                                    {
                                        hr = S_FALSE;
                                        moreText = false;
                                    }
                                    if (FILTER_S_LAST_TEXT == hr)
                                    {
                                        hr = S_OK;
                                        moreText = false;
                                    }
                                }
                                else
                                {
                                    switch (hr)
                                    {
                                    case FILTER_E_NO_MORE_TEXT:
                                        // Normal end of this chunk's text.
                                        hr = S_OK;
                                        moreText = false;
                                        break;
                                    case FILTER_E_NO_TEXT:
                                        return Error("GetText: The current chunk does not contain text.", __uuidof(TextExtractor), hr);
                                    default:
                                        return Error("GetText: Unexpected error.", __uuidof(TextExtractor), hr);
                                    }
                                }
                            }
                        }
                        else
                        {
                            switch (hr)
                            {
                            case FILTER_E_EMBEDDING_UNAVAILABLE:
                            case FILTER_E_LINK_UNAVAILABLE:
                                continue; // next chunk...
                            case FILTER_E_END_OF_CHUNKS:
                                // Normal end of the document.
                                hr = S_OK;
                                moreChunks = false;
                                continue;
                            case FILTER_E_PASSWORD:
                                return Error("GetChunk: Password or other security-related access failure.", __uuidof(TextExtractor), hr);
                            case FILTER_E_ACCESS:
                                return Error("GetChunk: Access failure.", __uuidof(TextExtractor), hr);
                            default:
                                return Error("GetChunk: Unexpected error.", __uuidof(TextExtractor), hr);
                            }
                        }
                    }
                    *fileText = SysAllocString(wout.str().c_str());
                    // The source pointer is never NULL here, so a NULL result
                    // means the allocation failed. Do not report success with
                    // an empty out-parameter.
                    if (NULL == *fileText)
                        return Error("ExtractText: Insufficient memory to allocate the extracted text.", __uuidof(TextExtractor), E_OUTOFMEMORY);
                }
                else
                {
                    switch (hr)
                    {
                    case E_FAIL:
                        return Error("Init: File filter for this file type not found.", __uuidof(TextExtractor), hr);
                    case E_INVALIDARG:
                        return Error("Init: Count and contents of attributes do not agree.", __uuidof(TextExtractor), hr);
                    case FILTER_E_PASSWORD:
                        return Error("Init: Access has been denied because of password protection or similar security measures.", __uuidof(TextExtractor), hr);
                    case FILTER_E_ACCESS:
                        return Error("Init: Unable to access file.", __uuidof(TextExtractor), hr);
                    default:
                        return Error("Init: Unexpected error.", __uuidof(TextExtractor), hr);
                    }
                }
            }
            else
            {
                return Error("Unable to query for IFilter interface.", __uuidof(TextExtractor), E_FAIL);
            }
        }
        else
        {
            switch (hr)
            {
            case E_ACCESSDENIED:
                return Error("LoadIFilter: Access denied to the filter file.", __uuidof(TextExtractor), hr);
            case E_HANDLE:
                return Error("LoadIFilter: Invalid handle, probably due to a low-memory situation.", __uuidof(TextExtractor), hr);
            case E_INVALIDARG:
                return Error("LoadIFilter: Invalid parameter.", __uuidof(TextExtractor), hr);
            case E_OUTOFMEMORY:
                return Error("LoadIFilter: Insufficient memory or other resources to complete the operation.", __uuidof(TextExtractor), hr);
            case E_FAIL:
                return Error("LoadIFilter: Unknown error.", __uuidof(TextExtractor), hr);
            case FILTER_E_PASSWORD:
                return Error("LoadIFilter: Access has been denied because of password protection or similar security measures.", __uuidof(TextExtractor), hr);
            case FILTER_E_ACCESS:
                return Error("LoadIFilter: Unable to access file.", __uuidof(TextExtractor), hr);
            default:
                return Error("LoadIFilter: Unexpected error.", __uuidof(TextExtractor), hr);
            }
        }
    }
    catch (...)
    {
        return Error("Unexpected exception", __uuidof(TextExtractor), E_FAIL);
    }
    return hr;
}