Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Changes to make library compatible with Python3. #26

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 20 additions & 7 deletions warc/arc.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,18 @@
:copyright: (c) 2012 Internet Archive
"""

import __builtin__
import datetime
import os
import re
import StringIO
import warnings

try:
import __builtin__
from StringIO import StringIO
except ImportError:
import builtins as __builtin__
from io import StringIO

from .utils import CaseInsensitiveDict

ARC1_HEADER_RE = re.compile('(?P<url>\S*)\s(?P<ip_address>\S*)\s(?P<date>\S*)\s(?P<content_type>\S*)\s(?P<length>\S*)')
Expand Down Expand Up @@ -135,7 +140,7 @@ def length(self):
return int(self["length"])

def __str__(self):
f = StringIO.StringIO()
f = StringIO()
self.write_to(f)
return f.getvalue()

Expand Down Expand Up @@ -184,6 +189,8 @@ def write_to(self, f, version = None):
f.write("\n") # This separates the header and the body
if isinstance(self.payload, str): #Usually used for small payloads
f.write(self.payload)
elif isinstance(self.payload, bytes):
f.write(self.payload.decode('utf-8'))
elif hasattr(self.payload, "read"): #Used for large payloads where we give a file like object
chunk_size = 10 * 1024 * 1024 # Read 10MB by 10MB
d = self.payload.read(chunk_size)
Expand All @@ -200,7 +207,7 @@ def __setitem__(self, name, value):


def __str__(self):
f = StringIO.StringIO()
f = StringIO()
self.write_to(f)
return f.getvalue()

Expand Down Expand Up @@ -318,15 +325,19 @@ def _read_file_header(self):
# print "--------------------------------------------------"
if self.version and int(self.version) != version:
raise IOError("Version mismatch. Requested version was '%s' but version in file was '%s'"%(self.version, version))
if version == '1':

if int(version) == 1:
url, ip_address, date, content_type, length = header.split()
if isinstance(date, bytes):
date = date.decode('utf-8')
self.file_headers = {"ip_address" : ip_address,
"date" : datetime.datetime.strptime(date, "%Y%m%d%H%M%S"),
"org" : organisation}
self.version = 1
elif version == '2':
elif int(version) == 2:
url, ip_address, date, content_type, result_code, checksum, location, offset, filename, length = header.split()
if isinstance(date, bytes):
date = date.decode('utf-8')
self.file_headers = {"ip_address" : ip_address,
"date" : datetime.datetime.strptime(date, "%Y%m%d%H%M%S"),
"org" : organisation}
Expand Down Expand Up @@ -355,6 +366,8 @@ def _read_arc_record(self):
elif int(self.version) == 2:
arc_header_re = ARC2_HEADER_RE

if isinstance(header, bytes):
header = header.decode('utf-8')
matches = arc_header_re.search(header)
headers = matches.groupdict()
arc_header = ARCHeader(**headers)
Expand Down
13 changes: 8 additions & 5 deletions warc/gzip2.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,19 @@ class GzipFile(BaseGzipFile):
"""
def __init__(self, filename=None, mode=None,
compresslevel=9, fileobj=None):
BaseGzipFile.__init__(self,
filename=filename,
BaseGzipFile.__init__(self,
filename=filename,
mode=mode,
compresslevel=compresslevel,
fileobj=fileobj)

if self.mode == WRITE:
# Indicates the start of a new member if the value is True.
# The BaseGzipFile constructor has already written the header for the new
# member, so mark it as False.
self._new_member = False

if not hasattr(self, '_new_member'):
self._new_member = True
# When _member_lock is True, only one member of the gzip file is read.
self._member_lock = False

Expand All @@ -49,7 +50,7 @@ def close_member(self):
self.fileobj.write(self.compress.flush())
write32u(self.fileobj, self.crc)
# self.size may exceed 2GB, or even 4GB
write32u(self.fileobj, self.size & 0xffffffffL)
write32u(self.fileobj, self.size & 0xffffffff)
self.size = 0
self.compress = zlib.compressobj(9,
zlib.DEFLATED,
Expand Down Expand Up @@ -95,6 +96,8 @@ def _read(self, size):
def read_member(self):
"""Returns a file-like object to read one member from the gzip file.
"""
if hasattr(self, '_buffer'):
return self._buffer
if self._member_lock is False:
self._member_lock = True

Expand Down
33 changes: 18 additions & 15 deletions warc/tests/test_arc.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import datetime
import hashlib
import StringIO
try:
from StringIO import StringIO
except ImportError:
from io import StringIO

from .. import arc

Expand Down Expand Up @@ -49,7 +52,7 @@ def test_arc_v1_header_creation():
location = "http://www.archive.org",
offset = "300",
filename = "sample.arc.gz")
f = StringIO.StringIO()
f = StringIO()
header.write_to(f, 1)
header_v1_string = f.getvalue()
assert header_v1_string == "http://archive.org 127.0.0.1 20120301093000 text/html 500"
Expand All @@ -67,7 +70,7 @@ def test_arc_v2_header_creation():
location = "http://www.archive.org",
offset = "300",
filename = "sample.arc.gz")
f = StringIO.StringIO()
f = StringIO()
header.write_to(f)
header_v2_string = f.getvalue()
assert header_v2_string == "http://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500"
Expand All @@ -86,7 +89,7 @@ def test_arc_v1_record_creation():
offset = "300",
filename = "sample.arc.gz")
record_v1 = arc.ARCRecord(header, "BlahBlah")
f = StringIO.StringIO()
f = StringIO()
record_v1.write_to(f, 1)
record_v1_string = f.getvalue()
assert record_v1_string == "http://archive.org 127.0.0.1 20120301093000 text/html 500\nBlahBlah\n"
Expand All @@ -104,7 +107,7 @@ def test_arc_v2_record_creation():
offset = "300",
filename = "sample.arc.gz")
record_v2 = arc.ARCRecord(payload = "BlahBlah", headers = header)
f = StringIO.StringIO()
f = StringIO()
record_v2.write_to(f)
record_v2_string = f.getvalue()
assert record_v2_string == "http://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\nBlahBlah\n"
Expand All @@ -116,7 +119,7 @@ def test_arc_v1_writer():
date = now,
org = "Internet Archive")

opfile = StringIO.StringIO()
opfile = StringIO()
opfile.name = "sample.arc" # Necessary since only file objects in Python have names.

f = arc.ARCFile(fileobj = opfile, version = 1, file_headers = file_headers)
Expand All @@ -137,7 +140,7 @@ def test_arc1_v1_writer_default_headers():
now = datetime.datetime(year = 2012, month = 3, day = 2, hour = 19, minute = 32, second = 10)
file_headers = dict(date = now)

opfile = StringIO.StringIO()
opfile = StringIO()
opfile.name = "sample.arc" # Necessary since only file objects in Python have names.

f = arc.ARCFile(fileobj = opfile, version = 1, file_headers = file_headers)
Expand All @@ -161,7 +164,7 @@ def test_arc_v2_writer():
date = now,
org = "Internet Archive")

opfile = StringIO.StringIO()
opfile = StringIO()
opfile.name = "sample.arc" # Necessary since only file objects in Python have names.

f = arc.ARCFile(fileobj = opfile, file_headers = file_headers)
Expand All @@ -183,8 +186,8 @@ def test_arc_v2_writer():

def test_arc_reader_guess_version():
"Make sure that the ARCFile object automatically detects the file version"
v1 = StringIO.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 68\n1 0 Unknown\nURL IP-address Archive-date Content-type Archive-length\n\n\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\n\nPayload1\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\n\nPayload2")
v2 = StringIO.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload2")
v1 = StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 68\n1 0 Unknown\nURL IP-address Archive-date Content-type Archive-length\n\n\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\n\nPayload1\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\n\nPayload2")
v2 = StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload2")

arc_v1 = arc.ARCFile(fileobj = v1)
arc_v2 = arc.ARCFile(fileobj = v2)
Expand All @@ -197,7 +200,7 @@ def test_arc_reader_guess_version():

def test_arc_reader_read_file_headers():
"Make sure that the parser is reading file headers properly"
ip = StringIO.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload2")
ip = StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload2")
arc_file = arc.ARCFile(fileobj = ip)
arc_file.read()
arc_file.file_headers['ip_address'] == "127.0.0.1"
Expand All @@ -207,7 +210,7 @@ def test_arc_reader_read_file_headers():

def test_arc_reader_v1():
"Make sure that the parser reads out V1 ARC records. (Also tests iterator behaviour)"
v1 = StringIO.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 68\n1 0 Unknown\nURL IP-address Archive-date Content-type Archive-length\n\n\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\nPayload1\nhttp://archive.org 127.0.0.1 20120302193211 text/plain 8\nPayload2")
v1 = StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 68\n1 0 Unknown\nURL IP-address Archive-date Content-type Archive-length\n\n\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\nPayload1\nhttp://archive.org 127.0.0.1 20120302193211 text/plain 8\nPayload2")
arc_file = arc.ARCFile(fileobj = v1)

r1 = arc_file.read()
Expand All @@ -230,7 +233,7 @@ def test_arc_reader_v1():

def test_arc_reader_v2():
"Make sure that the parser reads out V2 ARC records. (Also tests iterator behaviour)"
v2 = StringIO.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 8\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 8\nPayload2")
v2 = StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 8\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 8\nPayload2")
arc_file = arc.ARCFile(fileobj = v2)
r1, r2 = list(arc_file)

Expand Down Expand Up @@ -288,12 +291,12 @@ def test_arc_record_versions():
filename = "sample.arc.gz")
record_1 = arc.ARCRecord(payload = "BlahBlah", headers = header, version = 1)
record_2 = arc.ARCRecord(payload = "BlahBlah", headers = header, version = 2)
f = StringIO.StringIO()
f = StringIO()
record_1.write_to(f)
record_string = f.getvalue()
assert record_string == "http://archive.org 127.0.0.1 20120301093000 text/html 500\nBlahBlah\n"

f = StringIO.StringIO()
f = StringIO()
record_2.write_to(f)
record_string = f.getvalue()
assert record_string == "http://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\nBlahBlah\n"
Expand Down
4 changes: 0 additions & 4 deletions warc/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,3 @@ def test_sample_data():
expected = """http://www.killerjo.net:80/robots.txt 211.111.217.29 20110804181142 39
SSH-2.0-OpenSSH_5.3p1 Debian-3ubuntu3\r\n\n"""
assert record == expected




7 changes: 5 additions & 2 deletions warc/tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from ..utils import FilePart, CaseInsensitiveDict
from cStringIO import StringIO
try:
from cStringIO import StringIO
except ImportError:
from io import StringIO

class TestCaseInsensitiveDict:
def test_all(self):
Expand Down Expand Up @@ -53,4 +56,4 @@ def test_readline(self):

def test_iter(self):
part = FilePart(StringIO(self.text), 11)
assert list(part) == ["aaaa\n", "bbbb\n", "c"]
assert list(part) == ["aaaa\n", "bbbb\n", "c"]
14 changes: 9 additions & 5 deletions warc/tests/test_warc.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
from ..warc import WARCReader, WARCHeader, WARCRecord, WARCFile

from StringIO import StringIO
try:
from cStringIO import StringIO
except ImportError:
from io import StringIO

class TestWARCHeader:
def test_attrs(self):
Expand Down Expand Up @@ -99,13 +102,14 @@ def test_read(self):

def test_write_gz(self):
"""Test writing multiple member gzip file."""
buffer = StringIO()
f = WARCFile(fileobj=buffer, mode="w", compress=True)
from io import BytesIO
buffer = BytesIO()
f = WARCFile(fileobj=buffer, mode="wb", compress=True)
for i in range(10):
record = WARCRecord(payload="hello %d" % i)
record = WARCRecord(payload=b"hello %d" % i)
f.write_record(record)

GZIP_MAGIC_NUMBER = '\037\213'
GZIP_MAGIC_NUMBER = b'\037\213'
assert buffer.getvalue().count(GZIP_MAGIC_NUMBER) == 10

def test_long_header(self):
Expand Down
18 changes: 14 additions & 4 deletions warc/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@
:copyright: (c) 2012 Internet Archive
"""

from UserDict import DictMixin
try:
from UserDict import DictMixin
except ImportError:
from collections import MutableMapping as DictMixin

class CaseInsensitiveDict(DictMixin):
"""Almost like a dictionary, but keys are case-insensitive.
Expand All @@ -23,9 +26,9 @@ class CaseInsensitiveDict(DictMixin):
>>> d.keys()
["foo", "bar"]
"""
def __init__(self, mapping=None, **kwargs):
def __init__(self, *args, **kwargs):
self._d = {}
self.update(mapping, **kwargs)
self.update(*args, **kwargs)

def __setitem__(self, name, value):
self._d[name.lower()] = value
Expand All @@ -38,7 +41,14 @@ def __delitem__(self, name):

def __eq__(self, other):
return isinstance(other, CaseInsensitiveDict) and other._d == self._d


def __len__(self):
return len(self._d)

def __iter__(self):
for i in self._d:
yield i

def keys(self):
return self._d.keys()

Expand Down
Loading