forked from internetarchive/dweb-mirror
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ArchiveItemPatched.js
1201 lines (1148 loc) · 55.2 KB
/
ArchiveItemPatched.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* eslint-disable func-names, no-use-before-define, no-inner-declarations, camelcase, consistent-return, no-unused-vars */
/* eslint-disable indent, object-property-newline, implicit-arrow-linebreak, object-curly-newline, block-spacing, no-lonely-if */
/*
// Monkey patches dweb-archivecontroller.
// Note: this can't be merged into dweb-archivecontroller as it won't work in the browser; and it can't be done as a subclass because the patched behaviour is wanted everywhere, e.g. archivefile.fetch_metadata should use the cache.
*/
// NPM repos
const path = require('path');
const debug = require('debug')('dweb-mirror:ArchiveItem');
const canonicaljson = require('@stratumn/canonicaljson');
const waterfall = require('async/waterfall');
const each = require('async/each'); // https://caolan.github.io/async/docs.html#each
const parallel = require('async/parallel'); // https://caolan.github.io/async/docs.html#parallel
const map = require('async/map'); // https://caolan.github.io/async/docs.html#map
// Other IA repos
const { ArchiveItem, ArchiveMember, dwebMagnetLinkFrom, RawBookReaderResponse, parmsFrom, ObjectFromEntries, specialidentifiers } = require('@internetarchive/dweb-archivecontroller');
// Other files from this repo
const MirrorFS = require('./MirrorFS');
/**
* Common arguments across all API functions
*
* copyDirectory points at top level of a cache where want a copy
* relFilePath path to file or item inside a cache IDENTIFIER/FILENAME
* noCache ignore anything in the cache - forces re-fetching and may cause upstream server to cache it TODO-API check this is not obsoleted by separate read and write skipping
* noStore do not store results in cache
* skipFetchFile as an argument causes file fetching to be suppressed (used for testing only)
* skipNet do not try and use the net for anything
* wantStream Return results as a stream, just like received from the upstream.
* wantSize Return the size as a byte-count.
* copyDirectory Specify alternate directory to store results in rather than config.directories[0]
* darkOk True if a dark item is a valid response (if false, and item is dark will throw an error)
* cb(err, res) Unless otherwise documented callbacks return an error, (subclass of Error) or null, and optional return data.
* Some functions also support an absent cb as returning a Promise, otherwise cb is required
* feel free to add Promise support to any function lacking it, search for "Promise pattern v2" for examples of how to do this consistently.
*/
/**
 * Attach a one-shot 'error' listener to a stream that logs the error through `debug`.
 *
 * @param s     Stream (anything with `.once`) to trace; a falsy value is accepted and ignored, which simplifies calling
 * @param name  Label included in the log line
 * @param func  Second label included in the log line (e.g. the calling function)
 */
function traceStream(s, { name = '', func = '' } = {}) {
  if (!s) {
    return; // Nothing to trace
  }
  // Note there is a side-effect of this, to cancel the unhandled error exception in core-modules/events.js/EventEmitter.prototype.emit
  const logStreamError = (err) => {
    debug('Tracing error on stream %s %s %o', name, func, err); // TODO change to %s err.message when solid
  };
  s.once('error', logStreamError);
}
// SEE ALMOST-SAME-CODE-NAMEPART in ArchiveMember._namepart and ArchiveItem._namepart
// noinspection JSUnresolvedVariable
/**
 * The name used for the directory and file prefixes of this item in the cache.
 *
 * - Item with an identifier: the identifier itself.
 * - Search (no identifier, but a query): a filesystem-safe string built from the query and the sort.
 * - Neither: undefined, which should be caught at a higher level to decide not to use the cache.
 *
 * `this.sort` is accepted as an array, a plain string or missing, because fetch_query in this file
 * also handles a non-array sort; previously a non-array sort threw on `.join`.
 *
 * @returns {string|undefined}
 */
ArchiveItem.prototype._namepart = function () {
  if (this.identifier) {
    return this.identifier;
  }
  if (this.query) {
    // Goal here is a string that: gives an indication of what it is; is filesystem safe; doesnt map similar but different queries to same string
    // Npm's sanitize-filename does a reasonable job BUT it maps all unsafe chars to same result,
    // encodeURIComponent does a reasonable job, except that it leaves * unescaped, so that is handled explicitly
    const sortPart = Array.isArray(this.sort) ? this.sort.join('_') : (this.sort || '');
    return encodeURIComponent(`_SEARCH_${this.query}_${sortPart}`).replace(/\*/g, '%2A');
  }
  return undefined; // Should be caught at higher level to decide not to use cache
};
/**
 * Write one part of an item to `<namepart>/<namepart>_<key>.json` as canonical JSON.
 *
 * @param key            Part name, used as the filename suffix (e.g. 'meta', 'files')
 * @param obj            Value to serialise; if undefined nothing is written and cb(null) is called
 * @param namepart       Directory name and filename prefix
 * @param copyDirectory  Passed through to MirrorFS.writeFile
 * @param cb(err)        Called with null on success (or when skipped), or with the write error
 */
function _save1file(key, obj, namepart, { copyDirectory = undefined }, cb) {
  const relFilePath = path.join(namepart, `${namepart}_${key}.json`);
  if (typeof obj === 'undefined') {
    cb(null); // Nothing to save for this part
    return;
  }
  MirrorFS.writeFile({ relFilePath, copyDirectory }, canonicaljson.stringify(obj), (writeErr) => {
    if (!writeErr) {
      cb(null);
      return;
    }
    debug('ERROR Unable to write %s to %s', key, relFilePath);
    cb(writeErr);
  });
}
/**
* Save metadata for this file as JSON in multiple files (see File Outline)
*
* Saves only what is already loaded on the item: this function does not itself call `fetch_metadata` or run a query, so fetch first if needed.
*
* @param copyDirectory
* @param cb(err, this) Errors if cant fetch metadata, or save failed
*/
ArchiveItem.prototype.save = function ({ copyDirectory = undefined } = {}, cb) {
  /* SEE-OTHER-ADD-METADATA-API-TOP-LEVEL in dweb-mirror and dweb-archivecontroller
  Writes whatever is currently loaded on this item as JSON, one file per part:
    .metadata             -> <IDENTIFIER>_meta.json
    .membersFav           -> <IDENTIFIER>_members.json
    .exportFiles()        -> <IDENTIFIER>_files.json
    ArchiveItem.extraFields (truthy values only) -> <IDENTIFIER>_extra.json
    .reviews              -> <IDENTIFIER>_reviews.json
    .speech_vs_music_asr  -> <IDENTIFIER>_speech_vs_music_asr.json
    .playlist             -> <IDENTIFIER>_playlist.json
  Parts that are undefined are skipped by _save1file.
  Nothing is fetched here; an item without an identifier (a search) writes nothing.
  cb(err, this)
  */
  if (!this.identifier) {
    // Must be a Search so do not try and save files - might save members
    debug('Search so not saving');
    cb(null, this);
    return;
  }
  const namepart = this._namepart();
  // Note all these files should be in MirrorFS.isSpecialFile
  // SEE-OTHER-ADD-METADATA-API-TOP-LEVEL in dweb-mirror and dweb-archivecontroller
  const parts = [
    ['meta', this.metadata], // Maybe empty if is_dark
    ['members', this.membersFav], // Only save Favorited members
    ['files', this.exportFiles()],
    ['extra', ObjectFromEntries(
      ArchiveItem.extraFields
        .map((field) => [field, this[field]])
        .filter(([, value]) => Boolean(value))
    )], // NOTE DUPLICATE OF LINE IN fetch_query and save
    ['reviews', this.reviews],
    ['speech_vs_music_asr', this.speech_vs_music_asr],
    ['playlist', this.playlist], // Note this is a cooked playlist, but all cooking is additive
  ];
  each(
    parts,
    ([part, obj], partDone) => {
      _save1file(part, obj, namepart, { copyDirectory }, partDone);
    },
    (err) => {
      if (err) {
        cb(err);
        return;
      }
      cb(null, this);
    }
  );
};
/**
* Save `.bookreader` to `IDENTIFIER.bookreader.json`.
*
* If `.bookreader` is undefined it will attempt to retrieve first.
*/
// noinspection JSUnresolvedVariable
ArchiveItem.prototype.saveBookReader = function ({ copyDirectory = undefined } = {}, cb) {
  /*
  Write this item's .bookreader as JSON via _save1file (part name 'bookreader').
  If .bookreader is not loaded (and the item is not dark) it is fetched first with fetch_bookreader.
  A search (no identifier) saves nothing.
  cb(err, this)
  */
  if (!this.identifier) {
    // Must be a Search so do not try and save files or bookreader - might save members
    debug('Search so not saving bookReader');
    cb(null, this);
    return;
  }
  const namepart = this._namepart(); // Its also in this.item.metadata.identifier but only if done a fetch_metadata
  // Arrow function, so `this` stays the item wherever it is invoked from
  const writeBookreader = () => {
    // Note all these files should be in MirrorFS.isSpecialFile
    _save1file('bookreader', this.bookreader, namepart, { copyDirectory }, (err) => {
      if (err) {
        cb(err);
        return;
      }
      cb(null, this);
    });
  };
  if (this.bookreader || this.is_dark) {
    writeBookreader();
    return;
  }
  // noinspection JSUnusedLocalSymbols
  this.fetch_bookreader({ copyDirectory }, (err, unusedAi) => {
    if (err) {
      debug('ERROR: Cant save because could not fetch bookreader for %s: %s', this.identifier, err.message);
      cb(err);
    } else {
      writeBookreader();
    }
  });
};
/**
 * Read and parse one JSON part file, `<namepart>/<namepart>_<part>.json`.
 *
 * @param namepart       Directory name and filename prefix
 * @param part           Part name, used as the filename suffix (e.g. 'meta', 'files', 'extra')
 * @param copyDirectory  Passed through to MirrorFS.readFile
 * @param cb(err, obj)   err is the read error, an Error naming the file if it is empty, or the JSON parse error;
 *                       otherwise obj is the parsed content
 */
function _parse_common(namepart, part, { copyDirectory = undefined }, cb) {
  const relFilePath = path.join(namepart, `${namepart}_${part}.json`);
  MirrorFS.readFile(relFilePath, { copyDirectory }, (err, jsonstring) => {
    if (err) {
      cb(err); // Not logging as not really an err for there to be no file, as will read
      return;
    }
    if (jsonstring.length === 0) { // Zero length files shouldnt occur, but seem to especially if crawler exits prematurely. ignore them.
      // Template literal so the message really names the file (it previously contained the literal text "%{relFilePath}")
      const emptyErr = new Error(`File ${relFilePath} is empty so ignoring it`);
      debug('ERROR in parsing %s %s', namepart, emptyErr.message);
      cb(emptyErr);
      return;
    }
    let parsed;
    try {
      parsed = canonicaljson.parse(jsonstring); // No reviver function, which would allow postprocessing
    } catch (parseErr) {
      // It is on the other hand an error for the JSON to be unreadable
      debug('Failed to parse json at %s: part %s %s', namepart, part, parseErr.message);
      cb(parseErr);
      return;
    }
    // Called outside the try so an exception thrown by cb is not mistaken for a parse failure
    cb(null, parsed);
  });
}
/**
* Read metadata, playlist, reviews, files and extra from corresponding files - see `Files on disk`
* cb(err, {files, files_count, metadata, reviews, collection_titles, dir, speech_vs_music_asr, is_dark, server}) data structure fields of ArchiveItem
**/
ArchiveItem.prototype.read = function ({ copyDirectory = undefined }, cb) {
  /*
  Read the item's part files (named from this.identifier) in parallel and collect them into one object.
    Required (failure fails the whole read): meta, files, extra, and playlist when the mediatype is audio, etree or movies
    Optional (left undefined if unreadable): reviews, speech_vs_music_asr, members (stored as membersFav)
  cb(err, res) - res is always passed, even when err is set
  */
  const namepart = this.identifier;
  const res = {};
  const mediatypesWithPlaylist = ['audio', 'etree', 'movies'];
  const parsePart = (part, done) => { _parse_common(namepart, part, { copyDirectory }, done); };

  // Build a reader for a part whose absence is not an error: the field is simply left undefined
  const optionalPart = (part, field) => (done) => {
    parsePart(part, (unusedErr, parsed) => {
      res[field] = parsed;
      done(null);
    });
  };

  // Metadata first, then the playlist for media types that have one
  const readMetaAndPlaylist = (done) => {
    parsePart('meta', (err, meta) => {
      res.metadata = meta;
      if (err) {
        done(err);
        return;
      }
      if (!mediatypesWithPlaylist.includes(res.metadata.mediatype)) {
        done(null);
        return;
      }
      parsePart('playlist', (playlistErr, playlist) => {
        res.playlist = playlist; // maybe undefined
        done(playlistErr); // Should fail if no playlist, so re-reads from server and gets playlist
      });
    });
  };

  const readFiles = (done) => {
    parsePart('files', (err, files) => {
      // Note that downloaded is stored here in o.x.downloaded but pushed up by AF.constructor to AF.downloaded instead of AF.metadata.downloaded
      if (!err) {
        res.files = files;
        res.files_count = res.files.length;
      }
      done(err);
    });
  };

  const readExtra = (done) => {
    parsePart('extra', (err, extra) => {
      // Unavailable on archive.org but there on dweb.archive.org: collection_titles
      // Not relevant on dweb.archive.org, d1, d2, item_size, uniq, workable_servers
      // Its absence should be considered an error as "servers" etc are required for bookreader.
      Object.assign(res, extra); // Note this could have the bad download=null, but will be filtered through loadFromMetadataAPI
      done(err);
    });
  };

  // SEE-OTHER-ADD-METADATA-API-TOP-LEVEL in dweb-mirror and dweb-archivecontroller
  parallel([
    readMetaAndPlaylist,
    readFiles,
    optionalPart('reviews', 'reviews'),
    optionalPart('speech_vs_music_asr', 'speech_vs_music_asr'),
    optionalPart('members', 'membersFav'),
    readExtra,
  ], (err, unusedResults) =>
    cb(err, res));
};
/**
* Read bookreader data from file and place in bookreader field on item
* File has: `{ data, brOptions, lendingInfo, possibly metadata }`
* item has bookreader: { data, brOptions, lendingInfo }
* API returns { data: { data, brOptions, lendingInfo, possibly metadata } }
* cb(err, { data: { data, brOptions, lendingInfo, metadata } }) - a RawBookReaderResponse, the format returned from the BookReader API
*/
ArchiveItem.prototype.read_bookreader = function ({ copyDirectory = undefined }, cb) {
  /*
  Read the stored 'bookreader' part for this item, attach this.metadata to it,
  and hand it back wrapped as a RawBookReaderResponse ({ data: ... }).
  cb(err, rawBookReaderResponse)
  */
  const namepart = this.identifier; // Possibly undefined
  _parse_common(namepart, 'bookreader', { copyDirectory }, (err, stored) => { // stored = { data, brOptions, lendingInfo }
    if (err) {
      cb(err);
      return;
    }
    stored.metadata = this.metadata;
    cb(null, new RawBookReaderResponse({ data: stored }));
  });
};
/**
* Fetch the bookreader data for this item if it hasn't already been.
* More flexible version than dweb-archive.ArchiveItem
* Monkey patched into dweb-archive.ArchiveItem so that it runs anywhere that dweb-archive attempts to fetch_bookreader
* opts = {
* noCache Do not check cache, refetch from server, and store locally
* noStore Do not store result
* copyDirectory Where to store result if not default
* }
* Alternatives/Strategy:
* cached: return from cache
* !cached: Load from net, save to cache
*
* cb(err, this) or if undefined, returns a promise resolving to 'this'
* Errors TransportError (404)
* Result is ai.bookreader = { brOptions, data, lendingInfo}
**/
ArchiveItem.prototype.fetch_bookreader = function (opts = {}, cb) { // TODO-API
  /*
  Load this.bookreader from the cache and/or the net, then optionally store it.
    noCache set:   net first, falling back to the cache if the net fails
    otherwise:     cache first, falling back to the net if the cache read fails
  The result is saved (unless noStore) when it came from the net, or when a copyDirectory was given.
  cb(err, this), or returns a Promise resolving to this if cb is absent.
  */
  if (typeof opts === 'function') { cb = opts; opts = {}; } // Allow opts parameter to be skipped
  const { noCache, noStore, copyDirectory = undefined } = opts;

  // All helpers are arrow functions defined before use, so `this` is always the item.

  // Read the cached bookreader and load it; done(err, doSave) where doSave is true only if a copyDirectory was given
  const loadFromCache = (done) => {
    this.read_bookreader({ copyDirectory }, (err, bookReaderApi) => { // RawBookReaderResponse = { data: { data, brOptions, lendingInfo }}
      if (bookReaderApi) {
        this.loadFromBookreaderAPI(bookReaderApi);
      }
      done(err, !!copyDirectory);
    });
  };

  const cacheThenNet = (done) => {
    loadFromCache((err) => {
      if (!err) {
        done(null, !!copyDirectory); // If copyDirectory explicitly specified then save to it, otherwise its from file so no need to save.
        return;
      }
      // Will process and add to this.bookreader, but want to save as came from net
      this._fetch_bookreader(opts, (netErr, unusedRes) => done(netErr, true));
    });
  };

  const netThenCache = (done) => {
    this._fetch_bookreader(opts, (err, unusedRes) => {
      if (err) {
        loadFromCache(done);
      } else {
        done(err, true); // Will process and add to this.bookreader, but want to save as came from net
      }
    });
  };

  // Try both files and net, done(err, doSave)
  const loadEitherWay = (done) => {
    if (noCache) {
      netThenCache(done);
    } else {
      cacheThenNet(done);
    }
  };

  const storeIfWanted = (doSave, done) => {
    if (doSave && !noStore) {
      this.saveBookReader({ copyDirectory }, done);
    } else {
      done(null);
    }
  };

  const run = (done) => {
    if (this.is_dark && !opts.darkOk) {
      done(new Error(`item ${this.identifier} is dark`));
    } else if (this.identifier && !this.bookreader) { // Check haven't already loaded or fetched metadata
      waterfall([
        loadEitherWay,
        storeIfWanted
      ], (err) => done(err, this));
    } else {
      done(null, this);
    }
  };

  // Promisify pattern v2
  // noinspection JSUnresolvedVariable
  if (cb) {
    try {
      run(cb);
    } catch (err) {
      cb(err);
    }
    return undefined;
  }
  return new Promise((resolve, reject) => {
    try {
      run((err, res) => {
        if (err) {
          reject(err);
        } else {
          resolve(res);
        }
      });
    } catch (err) {
      reject(err);
    }
  });
};
/**
* Fetch a page from the item, caching it
*
* @param zip Name of file holding the image
* @param file file within zip
* @param scale factor to shrink raw image by (2 is about right for a full screen) floats will be quantized
* @param rotate 0 for normal, unsure what other values are
* @param page usually "cover_t.jpg" to get the page
* or leaf1_w2000 - from mediawiki
* or leaf1 - from BookreaderPreview.php/... when book is lendable, not free
* other parameters - see common Arguments above
* @param cb(err, data || stream || size) returns either data, or if wantStream then a stream
*/
// noinspection JSUnresolvedVariable
ArchiveItem.prototype.fetch_page = function ({
wantStream = false, wantSize = false, noCache = false,
zip = undefined, file = undefined, scale = undefined, rotate = undefined,
page = undefined, skipNet = false, skipFetchFile = undefined,
itemPath = undefined, subPrefix = undefined, id = undefined,
copyDirectory = undefined
} = {}, cb) { // TODO-API noCache
// Overview: ensure metadata and bookreader data are loaded, work out zip/file/scale for the requested page,
// build the upstream image URL, then hand over to MirrorFS.cacheAndOrStream to serve from cache and/or net.
// The final waterfall result (or error) is passed straight to cb.
// Note: `itemPath` is accepted but never read in this function; the preview URL uses this.dir instead.
// Filled in from `zip` inside the third waterfall step; stays undefined when no zip is known
let zipfile;
debug('fetch_page:%s%s subPrefix=%s zip=%s file=%s page=%s scale=%s rotate=%s id=%s', skipNet ? ' (skipNet)' : '', skipFetchFile ? ' (skipFetchFile)' : '', subPrefix, zip, file, page, scale, rotate, id);
// page = cover_t.jpg - bookreader cover page
// page=leaf1_w2000 meaning page 1, with ideal width 2000 pixels, comes from Palmleaf wiki
// page=leaf1 scale=10.1234 subPrefix=IDENTIFIER zipfile=undefined from BookReaderPreview call made when book unavailable
waterfall([
// Step 1: make sure the item's metadata is loaded
(cbw) =>
this.fetch_metadata({ copyDirectory }, cbw),
// Step 2: make sure the bookreader data is loaded (the previous result `ai` is unused)
(ai, cbw) =>
this.fetch_bookreader({ copyDirectory }, cbw),
// Step 3: resolve the page to zip/file/scale, build the URL and fetch
(ai, cbw) => {
if (page) {
if (page.startsWith('leaf')) {
// 'leafN' or 'leafN_wWIDTH': l is the 'leafN' part, w the width (undefined when there is no '_w' suffix, making idealWidth NaN)
const [l, w] = page.split('_w');
const pageManifest = this.pageManifestFrom({ leafNum: parseInt(l.slice(4), 10) });
const pageParms = this.pageParms(pageManifest, { idealWidth: parseInt(w, 10), scale });
// Overwrite the caller-supplied zip/file/scale with the values derived for this leaf
zip = pageParms.zip;
file = pageParms.file;
scale = pageParms.scale; // quantized to 2^n by pageQuantizedScale()
}
} else { // There is no scale if page & !"leaf..."
// No page was given: quantize the caller-supplied scale
scale = this.pageQuantizedScale(scale); // // quantized to 2^n by pageQuantizedScale()
}
// Fifth '/'-separated component of zip - presumably the zip's own name within the item path; TODO confirm against the zip path format
if (zip) zipfile = zip.split('/')[4];
// Reconstruct url as we will quantize the scale
// zip+file -> BookReaderImages.php; otherwise a page name -> BookReaderPreview.php; neither -> undefined
const urls = (zip && file)
? `https://www-dweb-cors.dev.archive.org/BookReader/BookReaderImages.php?${parmsFrom({ zip, file, scale, rotate, id })}`
: page
? `https://www-dweb-cors.dev.archive.org/BookReader/BookReaderPreview.php?${parmsFrom({ subPrefix, page, scale, rotate, id: this.identifier, itemPath: this.dir, server: 'www-dweb-cors.dev.archive.org' })}`
: undefined; // This would be an error
if (!urls) {
debug('Failure to build URLs for bookreader %o', { identifier: this.identifier, zip, file, page, scale, rotate });
cbw(new Error('insufficient info to build URL'));
} else {
const debugname = `${this.identifier}_${file}`;
// Cache location: the page name when given, otherwise a path keyed by zipfile, integer scale, rotation and file
const relFilePath = `${this.identifier}/_pages/` + (page || `${zipfile}/scale${Math.floor(scale)}/rotate${rotate}/${file}`);
if (!(scale && file)) { // This is the cover or a leaf of a preview, its not scaled or rotated
// Note: skipFetchFile is not passed on this path, unlike the scaled path below
MirrorFS.cacheAndOrStream({
urls, wantStream, wantSize, debugname, noCache, relFilePath, skipNet, copyDirectory
}, cbw);
} else { // Looking for page by file name with scale and rotation
// Strategy is complex:
// First check for a file of the scale or larger -> reFilePath2
// Try Streaming - either from relFilePath2 or urls
// If that fails see if we have a file again but this time with 'bestEffort' which will accept smaller files -> relFilePath3
// If we find it succeeds stream it from relFilePath3
// Else we don't have any versions of this page, and failed to stream, so its an error
MirrorFS.checkWhereValidFileRotatedScaled({ file, scale, rotate, noCache, copyDirectory, // Find which valid scale/rotate we have,
relFileDir: `${this.identifier}/_pages/${zipfile}` },
// `err` from this lookup is not inspected; a failed lookup just falls back to relFilePath
(err, relFilePath2) => { // undefined if not found
// Use this filepath if find an appropriately scaled one, otherwise use the one we really want from above
// TODO there is an edge case where find wrongly scaled file, but if copydir is set we'll copy that to relFilePath
MirrorFS.cacheAndOrStream({ urls, wantStream, wantSize, debugname, noCache, skipNet, skipFetchFile, copyDirectory, relFilePath: relFilePath2 || relFilePath },
(err1, res) => {
if (err1) {
// Could not stream at the wanted scale: look again, this time accepting any cached version of the page
MirrorFS.checkWhereValidFileRotatedScaled({
file, scale, rotate, noCache, copyDirectory, // Find which valid scale/rotate we have,
relFileDir: `${this.identifier}/_pages/${zipfile}`,
bestEffort: true
},
(err2, relFilePath3) => { // undefined if cant find any versions of this page (including smaller)
if (err2 || !relFilePath3) {
cbw(err1); // Return error from cacheAndOrStream
} else {
// Serve the best-effort file; skipNet is forced to true for this final attempt
MirrorFS.cacheAndOrStream({
urls, wantStream, wantSize, debugname, noCache, skipNet: true, skipFetchFile, copyDirectory, relFilePath: relFilePath3
}, cbw);
}
});
} else { // Found it
cbw(null, res);
}
});
});
}
}
}
], cb);
};
ArchiveItem.prototype.fetch_metadata = function (opts = {}, cb) { // TODO-API opts:cacheControl
  /*
  Fetch the metadata for this item if it hasn't already been, cache Locally
  More flexible version than dweb-archive.ArchiveItem
  Monkey patched into dweb-archive.ArchiveItem so that it runs anywhere that dweb-archive attempts to fetch_metadata
  Note that it adds information about the crawl and downloaded status

  opts { noStore, noCache, darkOk, skipNet, copyDirectory } - see common args at top of this file
    noCache unset: read the cache, and if that fails load from the net
    noCache set:   load from the net, and if that fails read the cache
    Either way, if BOTH sources fail the callback gets an Error carrying both messages.
    (Previously the noCache path dropped the cache-read error and reported success with nothing loaded.)
  After a successful load the item is saved unless noStore is set, when it came from the net, or when it came
  from the cache and a copyDirectory was given and the identifier is not one of the specialidentifiers.
  Errors while saving are ignored.

  cb(err, this) or if undefined, returns a promise resolving to 'this'
  Errors: whatever tryNet/tryRead report (original notes mention TransportError (404)); `item X is dark` if the
  loaded item is dark and darkOk is not set
  TODO-CACHEAGING - check on age of cache
  */
  if (typeof opts === 'function') { cb = opts; opts = {}; } // Allow opts parameter to be skipped
  // noinspection JSUnresolvedVariable
  const { copyDirectory } = opts;

  // Promisify pattern v2. The helpers below are function declarations, so they are hoisted and callable from here.
  if (cb) {
    try {
      f.call(this, cb);
    } catch (err) {
      cb(err);
    }
  } else {
    return new Promise((resolve, reject) => {
      try {
        f.call(this, (err, res) => {
          if (err) {
            reject(err);
          } else {
            resolve(res);
          }
        });
      } catch (err) {
        reject(err);
      }
    });
  }

  function tryRead(cb1) { // Try and read from disk, obeying options
    this.read({ copyDirectory }, (err, metadata) => {
      if (err) {
        cb1(err);
      } else {
        this.loadFromMetadataAPI(metadata); // Saved Metadata will have processed Fjords and includes the reviews, files, and other fields of _fetch_metadata()
        cb1(null);
      }
    });
  }

  function tryNet(cb1) { // Try and read from net, obeying options
    if (opts.skipNet) {
      cb1(new Error('skipNet set'));
    } else {
      // Note _fetch_metadata will expand specialidentifiers
      this._fetch_metadata(Object.assign({}, opts, { darkOk: true }), (err, unusedAI) => { // Process Fjords and load .metadata and .files etc - allow is_dark just throw before caller
        cb1(err); // Maybe or maybe not err
      });
    }
  }

  // Data that came from the cache only needs saving when an explicit copy was requested, and never for special identifiers
  function shouldSaveCached() {
    return (!!copyDirectory) && (!Object.keys(specialidentifiers).includes(this.identifier));
  }

  // Try Read or Net - order depends on noCache, passes an error if could not read it, or get from net.
  // cb1(err, doSave) - doSave is true if should save the result locally
  function tryReadOrNet(cb1) {
    if (!this.identifier || this.metadata || this.is_dark) { // Check haven't already loaded or fetched metadata (is_dark wont have a .metadata)
      cb1(null, false); // Didnt fetch so nothing to save
    } else if (opts.noCache) { // Net first, cache only as a fallback
      tryNet.call(this, (err) => {
        if (!err) {
          cb1(null, true);
        } else {
          tryRead.call(this, (err1) => {
            if (err1) {
              // Neither source worked: report it rather than returning an item with nothing loaded
              cb1(new Error(`Unable to fetch metadata from net: ${err.message} or locally ${err1.message}`), false);
            } else {
              // cached but check for explicit requirement to copy
              cb1(null, shouldSaveCached.call(this));
            }
          });
        }
      });
    } else { // Try the cache, then the net
      tryRead.call(this, (err) => {
        if (err) { // not cached
          tryNet.call(this, (err1) => {
            if (err1) {
              cb1(new Error(`Unable to fetch metadata locally: ${err.message} or from net ${err1.message}`), false);
            } else {
              cb1(null, true);
            }
          });
        } else {
          // cached but check for explicit requirement to copy
          cb1(null, shouldSaveCached.call(this));
        }
      });
    }
  }

  function trySave(doSave, cb1) { // If requested, try and save, obeying options
    if (!doSave || opts.noStore) {
      cb1(null);
    } else {
      this.save({ copyDirectory }, cb1);
    }
  }

  function f(cb1) {
    if (this.identifier && !(this.metadata || this.is_dark)) { // If have not already fetched (is_dark means no .metadata field)
      tryReadOrNet.call(this, (err, doSave) => {
        if (err) {
          cb1(err, this);
        } else {
          trySave.call(this, doSave, (unusedErr) => {
            // ignore errors - on saving (for example if no disk), they will or should have been reported.
            cb1((this.is_dark && !opts.darkOk) ? new Error(`item ${this.identifier} is dark`) : null, this);
          });
        }
      });
    } else {
      cb1(null, this);
    }
  }
};
// noinspection JSUnresolvedVariable
/**
* Fetch the next page of the query for this item.
* A more flexible version than dweb-archive.ArchiveItem.fetch_query
* which is monkey patched into dweb-archive.ArchiveItem so that it runs anywhere that dweb-archive attempts to fetch_query.
* @param opts {
* skipNet, noCache, noStore, see common argument documentation at top of this file
* wantFullResp, copyDirectory see _fetch_query
* }
* @param cb
* @returns {Promise<unknown>}
* Strategy is:
* Read <IDENTIFIER>_members_cached.json if it exists into .members
* Expand each of `.members` from its `<IDENTIFIER>_member.json` if necessary and file exists.
* Run _fetch_query which will also handled fav-*'s `members.json` files, and `query` metadata field.
* Write the result back to `<IDENTIFIER>_members_cached.json`
* Write each member to its own `<IDENTIFIER>_member.json`
*/
ArchiveItem.prototype.fetch_query = function (opts = {}, cb) {
// Entry point: normalises the arguments, then runs f() either with the supplied callback or wrapped in a Promise.
if (typeof opts === 'function') { cb = opts; opts = {}; } // Allow opts parameter to be skipped
/* eslint-disable-next-line prefer-const */ /* as cant have let and const mixed in destructuring */
let { noCache, noStore, skipNet, copyDirectory } = opts;
// Both flags are forced on when there is no copyDirectory and MirrorFS.directories is empty
// (presumably because there is then no cache location to read from or write to)
noCache = noCache || !(copyDirectory || MirrorFS.directories.length);
noStore = noStore || !(copyDirectory || MirrorFS.directories.length);
if (cb) { try { f.call(this, cb); } catch (err) { cb(err); } } else { return new Promise((resolve, reject) => { try { f.call(this, (err, res) => { if (err) { reject(err); } else { resolve(res); } }); } catch (err) { reject(err); } }); } // Promisify pattern v2
// f does all the work; cb0(err, res) receives the final waterfall result.
// The nested helpers are function declarations (hoisted), and read namepart/part which are declared further down in f.
function f(cb0) {
// cb1(null, arr): arr has one entry per input member, in the same order - the member itself if already expanded
// or if nothing could be read for it, otherwise a new ArchiveMember built from what am.read returned
function expandLocally(arrAM, cb1) {
// Expand the members locally, errors are ignored
// unexpanded members typically come from:
// either a direct req from client to server for identifier:...
// or for identifier=fav-* when members loaded with unexpanded
// Note this does not obey noCache, consumer should.
// Doesnt return errors, its ok if cant expand
map(arrAM, // parallel async mapping
(am, cb2) => {
if (am.isExpanded()) { // Already expanded
cb2(null, am);
} else {
am.read({ copyDirectory }, (err, o) => cb2(null, o ? new ArchiveMember(o) : am));
}
},
cb1);
}
// Replaces this.membersSearch with the contents of the cached `part` file (expanded where possible),
// or with [] if that file cannot be read. Must be called with `this` set to the item.
function readAndExpandMembersSearch(cb1) {
// Expand the membersSearch if necessary and possible locally, errors are ignored
// unexpanded members typically come from either a direct req from client to server for identifier:...
// Note this does not obey noCache, consumer should.
// Do not pass errors back, its ok not to read, or not to expand
_parse_common(namepart, part, { copyDirectory }, (err, arrObj) => {
if (err) {
this.membersSearch = [];
cb1();
} else {
// A stored member without a publicdate is treated as unexpanded
expandLocally(arrObj.map(o => new ArchiveMember(o, { unexpanded: !o.publicdate })), (neverErr, arrAM) => {
this.membersSearch = arrAM;
cb1();
});
}
});
}
// Try and read extras file which for search will contain numFound (it wont have been read by fetch_metadata because no identifier)
// A read failure is silently ignored; on success the content is merged via this._mergeExtra
function readExtras(cb1) {
_parse_common(namepart, 'extra', { copyDirectory }, (err, o) => {
if (!err) {
this._mergeExtra(o);
}
cb1();
});
}
// Writes the 'extra' and `part` files for this query, and saves each expanded member; does nothing when noStore
function saveResults(cb1) {
// arr will be matching ArchiveMembers, possibly wrapped in Response (depending on opts) or undefined if not a collection or search
// fetch_query.members will have the full set to this point (note .files is the files for the item, not the ArchiveItems for the search)
if (noStore) {
cb1();
} else {
parallel([
// Part files are only written when there are search members and a namepart to write them under
cb2 => {
if (!(this.membersSearch && this.membersSearch.length && namepart)) {
cb2();
} else {
// Just store membersSearch, but pass on full set with possible response
each([
['extra', ObjectFromEntries(
ArchiveItem.extraFields.map(k => [k, this[k]])
.filter(kv => !!kv[1])
)], // NOTE DUPLICATE OF LINE IN fetch_query and save
[part, this.membersSearch] // part is e.g. members_cached or members_-titleSorter_cached
],
(i, cbInner) => { // [ part, obj ]
_save1file(i[0], i[1], namepart, { copyDirectory }, cbInner);
},
cb2);
}
},
// Save (expanded) membersSearch to their cache
// (covers membersFav as well as membersSearch; unexpanded members are skipped)
cb2 => {
each(this.membersFav.concat(this.membersSearch).filter(ams => ams.isExpanded()),
(ams, cb3) => ams.save({ copyDirectory }, (unusederr) => cb3(null)), // Ignore errors saving
cb2);
}],
cb1);
}
}
// TODO-CACHE-AGING
// noinspection JSUnresolvedVariable
const namepart = this._namepart(); // Can be undefined for example for list of members unconnected to an item
// TODO - this is wrong, this.sort can sometimes be an array
// Note .length is read before the Array.isArray check, so this handles a string or an array but throws if this.sort is undefined or null
const sortString = (this.sort.length === 0) ? undefined : !Array.isArray(this.sort) ? this.sort : this.sort.join('_');
const defaultSort = (!sortString // Unspecified
|| (sortString === this.defaultSortArr().join('_'))); // Check if its non-default sort
// Cache file part name: 'members_cached' for the default sort, otherwise 'members_<sortString>_cached'
const part = 'members_' + (defaultSort ? 'cached' : (sortString + '_cached'));
// Normalise both member lists to arrays so the code below can concat/iterate them safely
if (!Array.isArray(this.membersFav)) this.membersFav = [];
// TODO-SEARCHORDER check what happens when switch tabs, at this point membersSearch should be empty
if (!Array.isArray(this.membersSearch)) this.membersSearch = [];
waterfall([
// In parallel Read members (if !noCache); extras and expand membersFav
// None of the three tasks reports an error, and their results are discarded
(cb2) => parallel([
// Read from members_cached.json files and expand
(cb3) => { if (!namepart || noCache) { cb3(); } else { readAndExpandMembersSearch.call(this, cb3); } },
// Read extras unless its a search (!namepart)
(cb3) => { if (!namepart) { cb3(); } else { readExtras.call(this, cb3); } },
// expand membersFav
(cb3) => expandLocally.call(this, this.membersFav, (neverErr, arrAM) => {
this.membersFav = arrAM; cb3(null);
}),
], (neverErr, unusedRes) => cb2()),
// _fetch_query will optimize, it tries to expand any unexpanded members, and only does the query if needed (because too few pages retrieved)
// unexpanded members are a valid response - client should do what it can to display them.
(cb2) => {
if (skipNet) {
cb2(null, this.currentPageOfMembers(opts.wantFullResp)); // This page of members
} else {
this._fetch_query(opts, cb2); // arr of search result or slice of existing members
}
},
// If fetch from net probably failed and didn't use cache, fall back
// Only reached with noCache set (so the cache was skipped above) and currentPageOfMembersFail() true
(res, cb2) => {
if (!(noCache && this.currentPageOfMembersFail())) {
cb2(null, res);
} else {
readAndExpandMembersSearch.call(this, (neverErr) => {
cb2(null, this.currentPageOfMembers(opts.wantFullResp));
});
}
},
// Save locally
// Any error from saveResults is dropped here: the callback ignores its arguments and always passes res on
(res, cb2) => saveResults.call(this, () => { cb2(null, res); }),
// res will be a slice into (membersFav + membersSearch) possibly wrapped in Response (depending on opts) o
// fetch_query.membersSearch will have the full set to this point (note .files is the files for the item, not the ArchiveItems for the search)
],
(err, res) => cb0(err, res));
}
};
// noinspection JSUnresolvedVariable
ArchiveItem.prototype.saveThumbnail = function ({
skipFetchFile = false, noCache = false, wantStream = false, copyDirectory = undefined
} = {}, cb) {
/*
Save a thumbnail to the cache, note must be called after fetch_metadata
Two paths:
- if this.files lists any thumbnail files (__ia_thumb.jpg or a name ending _itemimage.jpg) each of them is
passed, one at a time, to its own cacheAndOrStream;
- otherwise a single image is requested from https://archive.org/services/img/IDENTIFIER via MirrorFS.cacheAndOrStream
and stored as __ia_thumb.jpg under this._namepart().
With no identifier, or an identifier that is a key of specialidentifiers, nothing is fetched and cb is called immediately.

skipFetchFile true if should skip net retrieval - used for debugging (passed through to cacheAndOrStream)
noCache true to skip reading cache (passed through to cacheAndOrStream)
wantStream true if want stream instead of ArchiveItem returned
copyDirectory passed through unchanged to cacheAndOrStream (presumably an alternate cache location - confirm against MirrorFS)
cb(err, this)||cb(err, stream) Callback on completion with self (mirroring), or on starting with stream (browser)
*/
const namepart = this.identifier; // Its also in this.metadata.identifier but only if done a fetch_metadata
if (!namepart || Object.keys(specialidentifiers).includes(namepart)) { // Skip thumbnail if no identifier, or special with no thumbnail
// Nothing to save: hand back undefined in place of a stream, or the item itself
cb(null, wantStream ? undefined : this);
} else {
// TODO-THUMBNAILS use new ArchiveItem.thumbnailFile that creates a AF for a pseudofile
const self = this; // recursable is a plain function (not an arrow) so has its own `this`; keep the item reachable
// Candidate thumbnails already listed in the item's files; empty if files have not been loaded
const thumbnailFiles = !this.files
? []
: this.files.filter(af => af.metadata.name === '__ia_thumb.jpg'
|| af.metadata.name.endsWith('_itemimage.jpg'));
if (thumbnailFiles.length) { // TODO-THUMBNAIL if more than 1, select smallest (or closest to 10k)
// noinspection JSUnusedLocalSymbols
// Loop through files using recursion (list is always short)
// TODO this could probably be replaced by async/until or similar
// recursable is both the loop body and the callback given to each cacheAndOrStream:
// each invocation handles the outcome of the previous file (if any), then starts the next one.
// thumbnailFiles is consumed with shift(), so its length is the number of files still to try.
const recursable = function (err, streamOrUndefined) {
if (err) {
debug(`saveThumbnail: failed in cacheAndOrStream for ${namepart}: %s`, err.message);
if (cb && (thumbnailFiles.length === 0)) { // cb will be undefined if cleared after calling with a stream
cb(err);
return; // Failed as no other files, (and didn't start another stream else cb would be undefined)
}
// Otherwise intentionally drops through after error and tries next file
}
if (wantStream && streamOrUndefined && cb) { // Passed back from first call to cacheOrStream if wantStream is set
// First stream obtained goes to the caller straight away; the loop below still continues with the remaining files
cb(null, streamOrUndefined);
cb = undefined;
} // Clear cb so not called when complete
const af = thumbnailFiles.shift(); // undefined once the list is exhausted
if (typeof af !== 'undefined') {
af.cacheAndOrStream({
skipFetchFile, noCache, wantStream, copyDirectory
}, recursable); // Recurse
// Exits, allowing recursable to recurse with next iteration
} else { // Completed loop
// cb will be set except in the case of wantStream in which case will have been called with first stream
if (cb) cb(null, self); // Important to cb only after saving, since other file saving might check its SHA and do not want a race condition
}
};
// Kick off the loop: no error and no stream, so this call just starts the first file
recursable(null, null);
} else { // No existing __ia_thumb.jpg or IDENTIFIER_itemimage.jpg so get from services or thumbnail
// noinspection JSUnresolvedVariable
const urls = `https://archive.org/services/img/${this.identifier}`;
const relFilePath = path.join(this._namepart(), '__ia_thumb.jpg'); // TODO-THUMBNAILS Assumes using __ia_thumb.jpg instead of IDENTIFIER_itemimage.jpg
const debugname = relFilePath; // Same string is used for logging and as the cache path
MirrorFS.cacheAndOrStream({
relFilePath, skipFetchFile, wantStream, noCache, debugname, copyDirectory, urls
},
(err, streamOrUndefined) => {
// traceStream is defined elsewhere in the file; it is called here even when err is set
// NOTE(review): the '#679' in func looks like a stale line-number label - confirm
if (wantStream) traceStream(streamOrUndefined, { func: 'saveThumbnail#679', name: relFilePath });
if (err) {
debug('Unable to cacheOrStream %s', debugname);
cb(err);
} else {
// Stream for the browser case, otherwise the item itself
cb(null, wantStream ? streamOrUndefined : this);
}
});
}
}
};
// noinspection JSUnresolvedVariable
ArchiveItem.prototype.fetch_playlist = function ({ wantStream = false, noCache = false, copyDirectory = undefined } = {}, cb) {
  /*
  Fetch the playlist for this item through the cache (IDENTIFIER/IDENTIFIER_playlist.json), TODO-CACHE-AGING
  wantStream    true if want stream, alternative is obj. obj will be run through this.processPlaylist, stream is passed back as received
  noCache       passed to MirrorFS.cacheAndOrStream; true if want to ignore local cache
  copyDirectory passed to MirrorFS.cacheAndOrStream unchanged
  cb(err, stream|obj)  Callback on completion
                - item has no identifier or no playlist: cb(null, []) or cb(null, undefined) if wantStream
                - fetch failed: cb(err, res) exactly as reported by cacheAndOrStream
                - wantStream: cb(err, stream)
                - otherwise: cb(null, this.processPlaylist(parsed json)), or cb(err) if the json is bad or processing throws
  cb is invoked exactly once; an exception thrown by cb itself propagates to the caller instead of re-entering cb.
  */
  const { identifier } = this; // Its also in this.metadata.identifier but only if done a fetch_metadata
  if (!(identifier && this.hasPlaylist())) {
    cb(null, wantStream ? undefined : []);
    return;
  }
  // noinspection JSUnresolvedVariable
  const relFilePath = path.join(this._namepart(), this._namepart() + '_playlist.json');
  // noinspection JSUnresolvedVariable
  MirrorFS.cacheAndOrStream({
    wantStream,
    relFilePath,
    noCache,
    copyDirectory,
    wantBuff: !wantStream, // Explicit because default for cacheAndOrStream if !wantStream is to return undefined
    urls: `https://archive.org/embed/${identifier}?output=json`, // Hard coded, would rather have in Util.gateway.url_playlist but complex
    debugname: identifier + '/' + identifier + '_playlist.json'
  }, (err, res) => {
    // Note that if wantStream, then not doing expansion and saving, but in most cases called will expand with next call.
    if (wantStream || err) {
      if (err) debug('fetch_playlist failed for item %s %s', identifier, err.message);
      cb(err, res);
      return;
    }
    let playlist;
    try {
      // Only parsing and processing are guarded: keeping cb outside the try means an exception
      // thrown by the callback is not mistaken for bad JSON and does not cause a second cb call.
      playlist = this.processPlaylist(canonicaljson.parse(res));
    } catch (err1) { // Catch bad JSON
      cb(err1);
      return;
    }
    cb(null, playlist);
  });
};
// noinspection JSUnresolvedVariable
ArchiveItem.prototype.relatedItems = function ({
  wantStream = false, wantMembers = false, noCache = false, copyDirectory = false
} = {}, cb) { // TODO-API noCache
  /*
  Fetch the related items for this item through the cache (IDENTIFIER/IDENTIFIER_related.json), TODO-CACHE-AGING
  wantStream    true => cb(err, stream)
  wantMembers   true => cb(err, [ArchiveMember*]) if want ArchiveMember returns, typically false in mirrorHttp as passing back to browser as is.
  !wantStream && !wantMembers => cb(err, { hits: { hits: [ {}* ] } })
  noCache       passed to MirrorFS.cacheAndOrStream; true to ignore local cache
  copyDirectory passed to MirrorFS.cacheAndOrStream unchanged (note default here is false, not undefined)
  cb(err, stream|obj)  Callback on completion with related items
                - no identifier, or identifier is a key of specialidentifiers: cb(null, []) if wantMembers else cb(null, undefined)
                - fetch failed or wantStream: cb(err, res) exactly as reported by cacheAndOrStream
                - bad json (or unexpected shape when wantMembers): cb(err)
  cb is invoked exactly once; an exception thrown by cb itself propagates to the caller instead of re-entering cb.
  */
  const { identifier } = this; // Its also in this.metadata.identifier but only if done a fetch_metadata
  if (!(identifier && !Object.keys(specialidentifiers).includes(identifier))) {
    cb(null, wantMembers ? [] : undefined);
    return;
  }
  // noinspection JSUnresolvedVariable
  const relFilePath = path.join(this._namepart(), this._namepart() + '_related.json');
  // noinspection JSUnresolvedVariable
  MirrorFS.cacheAndOrStream({
    wantStream,
    relFilePath,
    noCache,
    copyDirectory,
    wantBuff: !wantStream, // Explicit because default for cacheAndOrStream if !wantStream is to return undefined
    urls: 'https://be-api.us.archive.org/mds/v1/get_related/all/' + this.identifier,
    debugname: identifier + '/' + identifier + '_related.json'
  }, (err, res) => {
    // Note that if wantStream, then not doing expansion and saving, but in most cases called will expand with next call.
    if (wantStream || err) {
      cb(err, res); // Could be err or stream
      return;
    }
    let result;
    try {
      // Only parsing/conversion is guarded: keeping cb outside the try means an exception thrown
      // by the callback is not reported as bad JSON and does not cause a second cb call.
      const rels = canonicaljson.parse(res);
      result = wantMembers
        ? rels.hits.hits.map(r => ArchiveMember.fromRel(r))
        : rels;
    } catch (err1) { // Catch bad JSON
      debug('ERROR: Bad json in %s', relFilePath);
      cb(err1);
      return;
    }
    cb(null, result);
  });
};
ArchiveItem.addCrawlInfoRelated = function (rels, { copyDirectory, config = undefined } = {}, cb) {
  /**
   * Add .crawl and .downloaded to the _source of each result in rels, the result of the Related items API
   * rels           result of RelatedApi i.e. {hits: {hits: [ _id, _source: { FIELDS OF MEMBER }]}}, modified in place
   * copyDirectory  passed on to addDownloadedInfoFiles
   * config         object with crawlInfo({identifier}); optional, when absent no .crawl field is added
   * cb(err)        called when both passes over the hits have completed; per-hit failures are logged, not passed on
   */
  const hits = rels.hits.hits;
  parallel([
    // Crawl info: computed synchronously per hit
    cb2 => each(hits,
      (hit, cb3) => {
        if (config) { // config defaults to undefined, so must not be dereferenced unconditionally
          Object.assign(hit._source, { crawl: config.crawlInfo({ identifier: hit._id }) });
        }
        cb3(null);
      },
      cb2),
    // Downloaded info: build a throwaway ArchiveItem per hit and merge its .downloaded into the hit
    cb2 => each(hits,
      (hit, cb1) => {
        new ArchiveItem({ identifier: hit._id }).addDownloadedInfoFiles({ copyDirectory }, (err, ai) => {
          if (err) {
            // Shouldnt happen since addDownloadedInfoFiles ignores its own errors
            // (this is a static function, so there is no item identifier to report other than the hit's)
            debug('addCrawlInfoRelated -> addDownloadedInfoFiles failed for %s: %o', hit._id, err);
          } else if (!hit._source.downloaded) {
            hit._source.downloaded = ai.downloaded;
          } else {
            Object.assign(hit._source.downloaded, ai.downloaded);
          }
          cb1(null); // Do not pass on error
        });
      }, cb2),
  ], cb);
};
ArchiveItem.prototype.addDownloadedInfoFiles = function ({ copyDirectory }, cb) {
  /*
  Add .downloaded info on all files, and summary on Item.
  Note ArchiveItem might not yet have metadata, so it is first loaded with skipNet: true.
  copyDirectory  passed on to fetch_metadata, each file's isDownloaded, and _save1file
  cb(null, this) always called without an error: any failure in the steps just stops the sequence early,
                 since it could simply mean the files info has not been downloaded via the metadata API yet.
                 The item is passed back as it is needed by the callback in addDownloadedInfoMembers.
  */
  // Step 1: load metadata without going to the net
  const loadMetadata = (next) => {
    this.fetch_metadata({ skipNet: true, copyDirectory }, next);
  };
  // Step 2: make sure .downloaded and .files have usable shapes, then check each file
  const checkFiles = (unusedItem, next) => {
    // .downloaded could be undefined, a legacy boolean, or null as called for each member
    const hasDownloadedObject = (this.downloaded !== null) && (typeof this.downloaded === 'object');
    if (!hasDownloadedObject) {
      this.downloaded = {};
    }
    if (!Array.isArray(this.files)) {
      this.files = [];
    }
    // Relatively inexpensive, as result is cached on files.json at the final step, only needs to look at disk if uncached
    each(this.files, // Could be empty
      (file, fileDone) => { file.isDownloaded({ copyDirectory }, fileDone); }, // Should never throw error
      next);
  };
  // Step 3: add statistical data to item (note this.files could be empty)
  const summarize = (next) => {
    this.summarizeFiles(next);
  };
  // Step 4: save files info as it may have changed, but only for a real item that has files
  const saveFiles = (next) => {
    if (this.identifier && this.files.length) {
      _save1file('files', this.exportFiles(), this._namepart(), { copyDirectory }, next);
    } else {
      next(null);
    }
  };
  waterfall([loadMetadata, checkFiles, summarize, saveFiles], () => {
    // Error deliberately not reported and not passed on, so callers are never blocked
    cb(null, this);
  });
};
/**
 * Quantize a real-valued scale down to the nearest supported power of 2.
 *
 * @param idealScale calculated scale but real number
 * @returns {number} the largest of 32, 16, 8, 4, 2, 1 that is <= idealScale;
 *                   1 when idealScale is below 1 or is not a number, so the result is always a number
 */
ArchiveItem.prototype.pageQuantizedScale = function (idealScale) {
  const quantized = [32, 16, 8, 4, 2, 1].find(x => x <= idealScale);
  // find() gives undefined when nothing qualifies (idealScale < 1, NaN, undefined);
  // fall back to 1, the smallest supported scale, rather than handing undefined to callers.
  return (typeof quantized === 'undefined') ? 1 : quantized;
};
/**
 * Build the parameter object to pass to fetch_page for one page of a book.
 *
 * @param pageManifest one page data from manifest (IDENTIFIER_bookreader.json); its uri and width are used
 * @param fetchPageOpts {copyDirectory, wantSize, skipNet ...} any parms for fetch_page; these override the
 *        values taken from the manifest uri (zip, file, page) and the default rotate of 0
 *        scale       if present (and truthy) is quantized and used
 *        idealWidth  otherwise scale is pageManifest.width / idealWidth (800 if not given), then quantized
 *        When both are given, scale takes precedence and idealWidth is not used in the calculation.
 * @returns { parameters for fetch_page } a new object; scale is always the quantized value
 */
ArchiveItem.prototype.pageParms = function (pageManifest, fetchPageOpts) {
  const { searchParams } = new URL(pageManifest.uri);
  const targetWidth = fetchPageOpts.idealWidth || 800;
  const idealScale = fetchPageOpts.scale || (pageManifest.width / targetWidth);
  // Defaults, read from the url in pageManifest
  const fromManifest = {
    rotate: 0, // default rotation
    zip: searchParams.get('zip'),
    file: searchParams.get('file'),
    page: searchParams.get('page'), // Needed for urls like BookReaderPreview generated for lent out items
  };
  // Applied last so it wins over any scale in fetchPageOpts. SEE also checkWhereValidFileRotatedScaled
  const quantized = { scale: this.pageQuantizedScale(idealScale) };
  return Object.assign({}, fromManifest, fetchPageOpts, quantized);
};
ArchiveItem.prototype.addDownloadedInfoPages = function ({ copyDirectory = undefined }, cb) {
// For texts, Add .downloaded info on all pages, and summary on Item
// Note ArchiveItem might not yet have bookreader field loaded when this is called.
// cb(err)
this.fetch_metadata({ skipNet: true, copyDirectory }, (err, ai) => {
if (err || !ai || !ai.metadata || (ai.metadata.mediatype !== 'texts') || (this.subtype() !== 'bookreader')) {
cb(null); // Not a book - do not consider when checking if downloaded
} else {
this.fetch_bookreader({ copyDirectory, skipNet: true }, (err1, unusedAi) => {
if (err1 || !ai.bookreader) {
cb(null); // No book info, presume not downloaded
} else {
if ((typeof this.downloaded !== 'object') || (this.downloaded === null)) this.downloaded = {}; // Could be undefined (legacy boolean or null as called for each member
let cover_t_size = 0;
waterfall([
cb0 => parallel([
cb1 => this.fetch_page({
copyDirectory,
wantSize: true,
page: 'cover_t.jpg',
skipNet: true
}, (err, res) => {
cover_t_size = res;
cb1(err, res);
}), // TODO Do not currently store the cover_t size/downloaded, its minor discrepancy since usually smaller and wont have full download without it anyway
cb1 => each(