# coding: utf-8

from __future__ import unicode_literals, absolute_import

import re
import textwrap
import logging

log = logging.getLogger(__name__)


class LineBuffer(object):
    r"""
    Buffer bytes read in from a connection and serve complete lines back.

    >>> b = LineBuffer()
    >>> len(b)
    0

    >>> b.feed(b'foo\nbar')
    >>> len(b)
    7
    >>> list(b.lines()) == [b'foo']
    True
    >>> len(b)
    3

    >>> b.feed(b'bar\r\nbaz\n')
    >>> list(b.lines()) == [b'barbar', b'baz']
    True
    >>> len(b)
    0

    The buffer will not perform any decoding.

    >>> b.feed(b'Ol\xe9\n')
    >>> list(b.lines()) == [b'Ol\xe9']
    True

    The LineBuffer should also act as an iterable.

    >>> b.feed(b'iterate\nthis\n')
    >>> for line, expected in zip(b, [b'iterate', b'this']):
    ...    assert line == expected
    """

    # A line ends at a newline, optionally preceded by a carriage return.
    line_sep_exp = re.compile(b'\r?\n')

    def __init__(self):
        """Start with an empty byte buffer."""
        self.buffer = b''

    def feed(self, bytes):
        """
        Append ``bytes`` to the data awaiting line splitting.

        The parameter name shadows the builtin; it is kept unchanged so
        callers passing it by keyword keep working.
        """
        self.buffer += bytes

    def lines(self):
        """
        Return an iterator over the complete lines currently buffered.

        The buffer is split on ``line_sep_exp``. Everything after the last
        separator stays in the buffer for a later call.
        """
        lines = self.line_sep_exp.split(self.buffer)
        # save the last, unfinished, possibly empty line
        self.buffer = lines.pop()
        return iter(lines)

    def __iter__(self):
        """Iterate over the complete lines, consuming them from the buffer."""
        return self.lines()

    def __len__(self):
        """Return the number of bytes still held in the buffer."""
        return len(self.buffer)


class DecodingLineBuffer(LineBuffer):
    r"""
    Like LineBuffer, but decode the output (default assumes UTF-8).

    >>> utf8_word = b'Ol\xc3\xa9'
    >>> b = DecodingLineBuffer()
    >>> b.feed(b'bar\r\nbaz\n' + utf8_word + b'\n')
    >>> list(b.lines()) == ['bar', 'baz', 'Ol\xe9']
    True
    >>> len(b)
    0

    Some clients will feed latin-1 or other encodings. If your client should
    support decoding from these clients (and not raise a UnicodeDecodeError),
    set errors='replace':

    >>> b = DecodingLineBuffer()
    >>> b.errors = 'replace'
    >>> b.feed(b'Ol\xe9\n')
    >>> list(b.lines()) == ['Ol\ufffd']
    True

    >>> b = DecodingLineBuffer()
    >>> b.feed(b'Ol\xe9\n')
    >>> list(b.lines())
    Traceback (most recent call last):
    ...
    UnicodeDecodeError: ...
    """

    # Codec name and error policy handed to ``bytes.decode`` for every line.
    encoding = 'utf-8'
    errors = 'strict'

    def lines(self):
        """
        Yield each complete buffered line decoded to text.

        This is a generator, so nothing is taken from the buffer until
        iteration starts. A line that fails to decode is passed to
        ``handle_exception``; if that hook returns instead of raising, the
        line is skipped.
        """
        for line in super(DecodingLineBuffer, self).lines():
            # Only the decode call is guarded: an exception thrown into the
            # generator at the ``yield`` must not be mistaken for a decode
            # failure.
            try:
                decoded = line.decode(self.encoding, self.errors)
            except UnicodeDecodeError:
                self.handle_exception()
            else:
                yield decoded

    def handle_exception(self):
        """
        Log a warning about the undecodable input and re-raise the error.

        Must be called while an exception is being handled, because it ends
        with a bare ``raise``.
        """
        msg = textwrap.dedent("""
            Unknown encoding encountered. See 'Decoding Input'
            in https://pypi.python.org/pypi/irc for details.
            """)
        log.warning(msg)
        raise


class LenientDecodingLineBuffer(LineBuffer):
    r"""
    Like LineBuffer, but decode the output. First try UTF-8 and if that
    fails, use latin-1, which decodes all byte strings.

    >>> b = LenientDecodingLineBuffer()
    >>> utf8_word = b'Ol\xc3\xa9'
    >>> b.feed(utf8_word + b'\n')
    >>> b.feed(b'Ol\xe9\n')
    >>> list(b.lines()) == ['Ol\xe9', 'Ol\xe9']
    True
    """

    def lines(self):
        """
        Yield each complete buffered line decoded to text.

        UTF-8 is tried first, with latin-1 as the fallback. This is a
        generator, so nothing is taken from the buffer until iteration
        starts.
        """
        for line in super(LenientDecodingLineBuffer, self).lines():
            # Guard only the UTF-8 attempt so that an exception thrown into
            # the generator at the ``yield`` cannot cause the same line to be
            # emitted a second time via the fallback.
            try:
                decoded = line.decode('utf-8', 'strict')
            except UnicodeDecodeError:
                decoded = line.decode('latin-1')
            yield decoded