From 90cc0e5c6bbceb90628341b3e3b725798c90be93 Mon Sep 17 00:00:00 2001 From: Ben Homnick Date: Thu, 17 Oct 2013 21:47:44 +0800 Subject: [PATCH] log warning on bad header read --- warc/tests/test_warc.py | 2 ++ warc/warc.py | 11 +++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/warc/tests/test_warc.py b/warc/tests/test_warc.py index 029a899..1d88904 100644 --- a/warc/tests/test_warc.py +++ b/warc/tests/test_warc.py @@ -59,6 +59,7 @@ def f(type): "Content-Type: application/http; msgtype=response\r\n" + "P3P: policyref=\"http://www.w3.org/2001/05/P3P/p3p.xml\"\r\n" + "Page.Ly: v4.1\r\n" + + "BadHeader: \n" + "WARC-Type: response\r\n" + "WARC-Record-ID: \r\n" + "WARC-Target-URI: http://example.com/\r\n" + @@ -75,6 +76,7 @@ def test_read_header1(self): assert h.record_id == "" assert h.type == "response" assert h.content_length == 10 + assert 'BadHeader' not in h def test_empty(self): reader = WARCReader(StringIO("")) diff --git a/warc/warc.py b/warc/warc.py index 2908006..c58ea1c 100644 --- a/warc/warc.py +++ b/warc/warc.py @@ -18,6 +18,8 @@ from . import gzip2 from .utils import CaseInsensitiveDict, FilePart +logger = logging.getLogger(__name__) + class WARCHeader(CaseInsensitiveDict): """The WARC Header object represents the headers of a WARC record. @@ -340,10 +342,11 @@ def read_header(self, fileobj): if line == "\r\n": # end of headers break m = self.RE_HEADER.match(line) - if not m: - raise IOError("Bad header line: %r" % line) - name, value = m.groups() - headers[name] = value + if m: + name, value = m.groups() + headers[name] = value + else: + logger.warning("Bad header line: %r" % line) return WARCHeader(headers) def expect(self, fileobj, expected_line, message=None):