forked from matchbox/warc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
listwarccts.py
executable file
·54 lines (43 loc) · 1.52 KB
/
listwarccts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/usr/bin/env python
"""Copyright 2014 Tom Nicholls
List the records of a single Web ARChive file, read through the warctika
library: one line per record giving its type, URL, underlying content type
and payload length.
This work is available under the terms of the GNU General Public Licence.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>
"""
#####
#SETUP
#####
from __future__ import print_function
import sys
import os
import warctika
import re
def warning(*objs):
    """Write a warning line to standard error.

    The literal prefix "WARNING: " is emitted first, followed by every
    positional argument, all separated by print()'s default single space.
    """
    pieces = ("WARNING: ",) + objs
    print(*pieces, file=sys.stderr)
# The path of the WARC file to list is the one required command-line argument.
if len(sys.argv) < 2:
    sys.exit("Must give name of file to list contents of.")

# Header row naming the comma-separated columns printed for each record.
print("Type, URL, Underlying Content-Type, Payload length")

wf = warctika.WARCFile(sys.argv[1], 'rb')
try:
    for record in wf:
        rtype = record.type
        # Best effort: any failure to read the URL is tolerated and None is
        # printed in its place, so one odd record does not abort the listing.
        try:
            rurl = record.url
        except Exception:
            rurl = None
        # Same best-effort treatment for the underlying MIME type.
        try:
            rmime = record.get_underlying_mimetype()
        except Exception:
            rmime = None
        rlen = len(record.payload)
        print(rtype, rurl, rmime, rlen, sep=", ")
finally:
    # Close the archive even when reading a record fails part-way through;
    # previously close() was skipped whenever the loop raised.
    wf.close()