Checkpoint so I can continue to work on this at a different box.
There is somewhat working (but slow) code supporting seek/tell for text files, but extensive testing exposes a bug I can't nail down.
This commit is contained in:
parent
8742977b33
commit
9b76da6a8f
170
Lib/io.py
170
Lib/io.py
@ -13,8 +13,9 @@ variable are part of the specification.
|
||||
|
||||
XXX need to default buffer size to 1 if isatty()
|
||||
XXX need to support 1 meaning line-buffered
|
||||
XXX change behavior of blocking I/O
|
||||
XXX don't use assert to validate input requirements
|
||||
XXX whenever an argument is None, use the default value
|
||||
XXX read/write ops should check readable/writable
|
||||
"""
|
||||
|
||||
__author__ = ("Guido van Rossum <guido@python.org>, "
|
||||
@ -29,9 +30,11 @@ __all__ = ["BlockingIOError", "open", "IOBase", "RawIOBase", "FileIO",
|
||||
import os
|
||||
import sys
|
||||
import codecs
|
||||
import pickle
|
||||
import _fileio
|
||||
import warnings
|
||||
|
||||
# XXX Shouldn't we use st_blksize whenever we can?
|
||||
DEFAULT_BUFFER_SIZE = 8 * 1024 # bytes
|
||||
|
||||
|
||||
@ -44,18 +47,22 @@ class BlockingIOError(IOError):
|
||||
self.characters_written = characters_written
|
||||
|
||||
|
||||
def open(file, mode="r", buffering=None, *, encoding=None):
|
||||
def open(file, mode="r", buffering=None, *, encoding=None, newline=None):
|
||||
"""Replacement for the built-in open function.
|
||||
|
||||
Args:
|
||||
file: string giving the name of the file to be opened;
|
||||
or integer file descriptor of the file to be wrapped (*)
|
||||
mode: optional mode string; see below
|
||||
or integer file descriptor of the file to be wrapped (*).
|
||||
mode: optional mode string; see below.
|
||||
buffering: optional int >= 0 giving the buffer size; values
|
||||
can be: 0 = unbuffered, 1 = line buffered,
|
||||
larger = fully buffered
|
||||
encoding: optional string giving the text encoding (*must* be given
|
||||
as a keyword argument)
|
||||
larger = fully buffered.
|
||||
Keywords (for text modes only; *must* be given as keyword arguments):
|
||||
encoding: optional string giving the text encoding.
|
||||
newline: optional newlines specifier; must be None, '\n' or '\r\n';
|
||||
specifies the line ending expected on input and written on
|
||||
output. If None, use universal newlines on input and
|
||||
use os.linesep on output.
|
||||
|
||||
(*) If a file descriptor is given, it is closed when the returned
|
||||
I/O object is closed. If you don't want this to happen, use
|
||||
@ -79,6 +86,7 @@ def open(file, mode="r", buffering=None, *, encoding=None):
|
||||
binary stream, a buffered binary stream, or a buffered text
|
||||
stream, open for reading and/or writing.
|
||||
"""
|
||||
# XXX Don't use asserts for these checks; raise TypeError or ValueError
|
||||
assert isinstance(file, (basestring, int)), repr(file)
|
||||
assert isinstance(mode, basestring), repr(mode)
|
||||
assert buffering is None or isinstance(buffering, int), repr(buffering)
|
||||
@ -101,7 +109,9 @@ def open(file, mode="r", buffering=None, *, encoding=None):
|
||||
if not (reading or writing or appending):
|
||||
raise ValueError("must have exactly one of read/write/append mode")
|
||||
if binary and encoding is not None:
|
||||
raise ValueError("binary mode doesn't take an encoding")
|
||||
raise ValueError("binary mode doesn't take an encoding argument")
|
||||
if binary and newline is not None:
|
||||
raise ValueError("binary mode doesn't take a newline argument")
|
||||
raw = FileIO(file,
|
||||
(reading and "r" or "") +
|
||||
(writing and "w" or "") +
|
||||
@ -132,9 +142,7 @@ def open(file, mode="r", buffering=None, *, encoding=None):
|
||||
buffer = BufferedReader(raw, buffering)
|
||||
if binary:
|
||||
return buffer
|
||||
# XXX What about newline conventions?
|
||||
textio = TextIOWrapper(buffer, encoding)
|
||||
return textio
|
||||
return TextIOWrapper(buffer, encoding, newline)
|
||||
|
||||
|
||||
class IOBase:
|
||||
@ -795,6 +803,8 @@ class TextIOBase(IOBase):
|
||||
"""Base class for text I/O.
|
||||
|
||||
This class provides a character and line based interface to stream I/O.
|
||||
|
||||
There is no readinto() method, as character strings are immutable.
|
||||
"""
|
||||
|
||||
def read(self, n: int = -1) -> str:
|
||||
@ -805,10 +815,18 @@ class TextIOBase(IOBase):
|
||||
"""
|
||||
self._unsupported("read")
|
||||
|
||||
def write(self, s: str):
|
||||
"""write(s: str) -> None. Write string s to stream."""
|
||||
def write(self, s: str) -> int:
|
||||
"""write(s: str) -> int. Write string s to stream."""
|
||||
self._unsupported("write")
|
||||
|
||||
def truncate(self, pos: int = None) -> int:
|
||||
"""truncate(pos: int = None) -> int. Truncate size to pos."""
|
||||
self.flush()
|
||||
if pos is None:
|
||||
pos = self.tell()
|
||||
self.seek(pos)
|
||||
return self.buffer.truncate()
|
||||
|
||||
def readline(self) -> str:
|
||||
"""readline() -> str. Read until newline or EOF.
|
||||
|
||||
@ -816,12 +834,12 @@ class TextIOBase(IOBase):
|
||||
"""
|
||||
self._unsupported("readline")
|
||||
|
||||
def __iter__(self):
|
||||
def __iter__(self) -> "TextIOBase": # That's a forward reference
|
||||
"""__iter__() -> Iterator. Return line iterator (actually just self).
|
||||
"""
|
||||
return self
|
||||
|
||||
def next(self):
|
||||
def next(self) -> str:
|
||||
"""Same as readline() except raises StopIteration on immediate EOF."""
|
||||
line = self.readline()
|
||||
if not line:
|
||||
@ -855,11 +873,11 @@ class TextIOWrapper(TextIOBase):
|
||||
Character and line based layer over a BufferedIOBase object.
|
||||
"""
|
||||
|
||||
# XXX tell(), seek()
|
||||
_CHUNK_SIZE = 64
|
||||
|
||||
def __init__(self, buffer, encoding=None, newline=None):
|
||||
if newline not in (None, '\n', '\r\n'):
|
||||
raise IOError("illegal newline %s" % newline) # XXX: ValueError?
|
||||
if newline not in (None, "\n", "\r\n"):
|
||||
raise ValueError("illegal newline value: %r" % (newline,))
|
||||
if encoding is None:
|
||||
# XXX This is questionable
|
||||
encoding = sys.getfilesystemencoding() or "latin-1"
|
||||
@ -869,7 +887,20 @@ class TextIOWrapper(TextIOBase):
|
||||
self._newline = newline or os.linesep
|
||||
self._fix_newlines = newline is None
|
||||
self._decoder = None
|
||||
self._pending = ''
|
||||
self._decoder_in_rest_pickle = None
|
||||
self._pending = ""
|
||||
self._snapshot = None
|
||||
self._seekable = self.buffer.seekable()
|
||||
|
||||
# A word about _snapshot. This attribute is either None, or a
|
||||
# tuple (position, decoder_pickle, readahead) where position is a
|
||||
# position of the underlying buffer, decoder_pickle is a pickled
|
||||
# decoder state, and readahead is the chunk of bytes that was read
|
||||
# from that position. We use this to reconstruct intermediate
|
||||
# decoder states in tell().
|
||||
|
||||
def _seekable(self):
|
||||
return self._seekable
|
||||
|
||||
def flush(self):
|
||||
self.buffer.flush()
|
||||
@ -886,35 +917,124 @@ class TextIOWrapper(TextIOBase):
|
||||
return self.buffer.fileno()
|
||||
|
||||
def write(self, s: str):
|
||||
# XXX What if we were just reading?
|
||||
b = s.encode(self._encoding)
|
||||
if isinstance(b, str):
|
||||
b = bytes(b)
|
||||
n = self.buffer.write(b)
|
||||
if "\n" in s:
|
||||
self.flush()
|
||||
return n
|
||||
self._snapshot = self._decoder = None
|
||||
return len(s)
|
||||
|
||||
def _get_decoder(self):
|
||||
make_decoder = codecs.getincrementaldecoder(self._encoding)
|
||||
if make_decoder is None:
|
||||
raise IOError(".readline() not supported for encoding %s" %
|
||||
raise IOError("Can't find an incremental decoder for encoding %s" %
|
||||
self._encoding)
|
||||
decoder = self._decoder = make_decoder() # XXX: errors
|
||||
if isinstance(decoder, codecs.BufferedIncrementalDecoder):
|
||||
# XXX Hack: make the codec use bytes instead of strings
|
||||
decoder.buffer = b""
|
||||
self._decoder_in_rest_pickle = pickle.dumps(decoder, 2) # For tell()
|
||||
return decoder
|
||||
|
||||
def _read_chunk(self):
|
||||
if not self._seekable:
|
||||
return self.buffer.read(self._CHUNK_SIZE)
|
||||
assert self._decoder is not None
|
||||
position = self.buffer.tell()
|
||||
decoder_state = pickle.dumps(self._decoder, 2)
|
||||
readahead = self.buffer.read(self._CHUNK_SIZE)
|
||||
self._snapshot = (position, decoder_state, readahead)
|
||||
return readahead
|
||||
|
||||
def _encode_decoder_state(self, ds, pos):
|
||||
if ds == self._decoder_in_rest_pickle:
|
||||
return pos
|
||||
x = 0
|
||||
for i in bytes(ds):
|
||||
x = x<<8 | i
|
||||
return (x<<64) | pos
|
||||
|
||||
def _decode_decoder_state(self, pos):
|
||||
x, pos = divmod(pos, 1<<64)
|
||||
if not x:
|
||||
return None, pos
|
||||
b = b""
|
||||
while x:
|
||||
b.append(x&0xff)
|
||||
x >>= 8
|
||||
return str(b[::-1]), pos
|
||||
|
||||
def tell(self):
|
||||
if not self._seekable:
|
||||
raise IOError("Underlying stream is not seekable")
|
||||
self.flush()
|
||||
if self._decoder is None or self._snapshot is None:
|
||||
assert self._pending == ""
|
||||
return self.buffer.tell()
|
||||
position, decoder_state, readahead = self._snapshot
|
||||
decoder = pickle.loads(decoder_state)
|
||||
characters = ""
|
||||
sequence = []
|
||||
for i, b in enumerate(readahead):
|
||||
c = decoder.decode(bytes([b]))
|
||||
if c:
|
||||
characters += c
|
||||
sequence.append((characters, i+1, pickle.dumps(decoder, 2)))
|
||||
for ch, i, st in sequence:
|
||||
if ch + self._pending == characters:
|
||||
return self._encode_decoder_state(st, position + i)
|
||||
raise IOError("Can't reconstruct logical file position")
|
||||
|
||||
def seek(self, pos, whence=0):
|
||||
if not self._seekable:
|
||||
raise IOError("Underlying stream is not seekable")
|
||||
if whence == 1:
|
||||
if pos != 0:
|
||||
raise IOError("Can't do nonzero cur-relative seeks")
|
||||
return self.tell()
|
||||
if whence == 2:
|
||||
if pos != 0:
|
||||
raise IOError("Can't do nonzero end-relative seeks")
|
||||
self.flush()
|
||||
pos = self.buffer.seek(0, 2)
|
||||
self._snapshot = None
|
||||
self._pending = ""
|
||||
self._decoder = None
|
||||
return pos
|
||||
if whence != 0:
|
||||
raise ValueError("Invalid whence (%r, should be 0, 1 or 2)" %
|
||||
(whence,))
|
||||
if pos < 0:
|
||||
raise ValueError("Negative seek position %r" % (pos,))
|
||||
orig_pos = pos
|
||||
ds, pos = self._decode_decoder_state(pos)
|
||||
if not ds:
|
||||
self.buffer.seek(pos)
|
||||
self._snapshot = None
|
||||
self._pending = ""
|
||||
self._decoder = None
|
||||
return pos
|
||||
decoder = pickle.loads(ds)
|
||||
self.buffer.seek(pos)
|
||||
self._snapshot = (pos, ds, "")
|
||||
self._pending = ""
|
||||
self._decoder = None
|
||||
return orig_pos
|
||||
|
||||
def read(self, n: int = -1):
|
||||
decoder = self._decoder or self._get_decoder()
|
||||
res = self._pending
|
||||
if n < 0:
|
||||
res += decoder.decode(self.buffer.read(), True)
|
||||
self._pending = ""
|
||||
self._snapshot = None
|
||||
return res
|
||||
else:
|
||||
while len(res) < n:
|
||||
data = self.buffer.read(64)
|
||||
data = self._read_chunk()
|
||||
res += decoder.decode(data, not data)
|
||||
if not data:
|
||||
break
|
||||
@ -923,7 +1043,7 @@ class TextIOWrapper(TextIOBase):
|
||||
|
||||
def readline(self, limit=None):
|
||||
if limit is not None:
|
||||
# XXX Hack to support limit arg
|
||||
# XXX Hack to support limit argument, for backwards compatibility
|
||||
line = self.readline()
|
||||
if len(line) <= limit:
|
||||
return line
|
||||
@ -951,7 +1071,7 @@ class TextIOWrapper(TextIOBase):
|
||||
|
||||
# We've seen \r - is it standalone, \r\n or \r at end of line?
|
||||
if endpos + 1 < len(line):
|
||||
if line[endpos+1] == '\n':
|
||||
if line[endpos+1] == "\n":
|
||||
ending = "\r\n"
|
||||
else:
|
||||
ending = "\r"
|
||||
@ -963,7 +1083,7 @@ class TextIOWrapper(TextIOBase):
|
||||
|
||||
# No line ending seen yet - get more data
|
||||
while True:
|
||||
data = self.buffer.read(64)
|
||||
data = self._read_chunk()
|
||||
more_line = decoder.decode(data, not data)
|
||||
if more_line or not data:
|
||||
break
|
||||
|
@ -93,6 +93,32 @@ class IOTest(unittest.TestCase):
|
||||
self.assertEqual(f.truncate(12), 12)
|
||||
self.assertEqual(f.tell(), 12)
|
||||
|
||||
def read_ops(self, f, buffered=False):
|
||||
data = f.read(5)
|
||||
self.assertEqual(data, b"hello")
|
||||
self.assertEqual(f.readinto(data), 5)
|
||||
self.assertEqual(data, b" worl")
|
||||
self.assertEqual(f.readinto(data), 2)
|
||||
self.assertEqual(len(data), 5)
|
||||
self.assertEqual(data[:2], b"d\n")
|
||||
self.assertEqual(f.seek(0), 0)
|
||||
self.assertEqual(f.read(20), b"hello world\n")
|
||||
self.assertEqual(f.read(1), b"")
|
||||
self.assertEqual(f.readinto(b"x"), 0)
|
||||
self.assertEqual(f.seek(-6, 2), 6)
|
||||
self.assertEqual(f.read(5), b"world")
|
||||
self.assertEqual(f.read(0), b"")
|
||||
self.assertEqual(f.readinto(b""), 0)
|
||||
self.assertEqual(f.seek(-6, 1), 5)
|
||||
self.assertEqual(f.read(5), b" worl")
|
||||
self.assertEqual(f.tell(), 10)
|
||||
if buffered:
|
||||
f.seek(0)
|
||||
self.assertEqual(f.read(), b"hello world\n")
|
||||
f.seek(6)
|
||||
self.assertEqual(f.read(), b"world\n")
|
||||
self.assertEqual(f.read(), b"")
|
||||
|
||||
LARGE = 2**31
|
||||
|
||||
def large_file_ops(self, f):
|
||||
@ -112,24 +138,6 @@ class IOTest(unittest.TestCase):
|
||||
self.assertEqual(f.seek(-1, 2), self.LARGE)
|
||||
self.assertEqual(f.read(2), b"x")
|
||||
|
||||
def read_ops(self, f):
|
||||
data = f.read(5)
|
||||
self.assertEqual(data, b"hello")
|
||||
n = f.readinto(data)
|
||||
self.assertEqual(n, 5)
|
||||
self.assertEqual(data, b" worl")
|
||||
n = f.readinto(data)
|
||||
self.assertEqual(n, 2)
|
||||
self.assertEqual(len(data), 5)
|
||||
self.assertEqual(data[:2], b"d\n")
|
||||
f.seek(0)
|
||||
self.assertEqual(f.read(20), b"hello world\n")
|
||||
f.seek(-6, 2)
|
||||
self.assertEqual(f.read(5), b"world")
|
||||
f.seek(-6, 1)
|
||||
self.assertEqual(f.read(5), b" worl")
|
||||
self.assertEqual(f.tell(), 10)
|
||||
|
||||
def test_raw_file_io(self):
|
||||
f = io.open(test_support.TESTFN, "wb", buffering=0)
|
||||
self.assertEqual(f.readable(), False)
|
||||
@ -155,7 +163,7 @@ class IOTest(unittest.TestCase):
|
||||
self.assertEqual(f.readable(), True)
|
||||
self.assertEqual(f.writable(), False)
|
||||
self.assertEqual(f.seekable(), True)
|
||||
self.read_ops(f)
|
||||
self.read_ops(f, True)
|
||||
f.close()
|
||||
|
||||
def test_raw_bytes_io(self):
|
||||
@ -164,7 +172,7 @@ class IOTest(unittest.TestCase):
|
||||
data = f.getvalue()
|
||||
self.assertEqual(data, b"hello world\n")
|
||||
f = io.BytesIO(data)
|
||||
self.read_ops(f)
|
||||
self.read_ops(f, True)
|
||||
|
||||
def test_large_file_ops(self):
|
||||
# On Windows and Mac OSX this test comsumes large resources; It takes
|
||||
@ -445,6 +453,10 @@ class BufferedRandomTest(unittest.TestCase):
|
||||
|
||||
|
||||
class TextIOWrapperTest(unittest.TestCase):
|
||||
|
||||
## def tearDown(self):
|
||||
## test_support.unlink(test_support.TESTFN)
|
||||
|
||||
def testNewlines(self):
|
||||
input_lines = [ "unix\n", "windows\r\n", "os9\r", "last\n", "nonl" ]
|
||||
|
||||
@ -486,6 +498,62 @@ class TextIOWrapperTest(unittest.TestCase):
|
||||
self.assertEquals(got_line, exp_line)
|
||||
self.assertEquals(len(got_lines), len(exp_lines))
|
||||
|
||||
# Systematic tests of the text I/O API
|
||||
|
||||
def testBasicIO(self):
|
||||
for chunksize in (1, 2, 3, 4, 5, 15, 16, 17, 31, 32, 33, 63, 64, 65):
|
||||
for enc in "ascii", "latin1", "utf8" :# , "utf-16-be", "utf-16-le":
|
||||
f = io.open(test_support.TESTFN, "w+", encoding=enc)
|
||||
f._CHUNK_SIZE = chunksize
|
||||
self.assertEquals(f.write("abc"), 3)
|
||||
f.close()
|
||||
f = io.open(test_support.TESTFN, "r+", encoding=enc)
|
||||
f._CHUNK_SIZE = chunksize
|
||||
self.assertEquals(f.tell(), 0)
|
||||
self.assertEquals(f.read(), "abc")
|
||||
cookie = f.tell()
|
||||
self.assertEquals(f.seek(0), 0)
|
||||
self.assertEquals(f.read(2), "ab")
|
||||
self.assertEquals(f.read(1), "c")
|
||||
self.assertEquals(f.read(1), "")
|
||||
self.assertEquals(f.read(), "")
|
||||
self.assertEquals(f.tell(), cookie)
|
||||
self.assertEquals(f.seek(0), 0)
|
||||
self.assertEquals(f.seek(0, 2), cookie)
|
||||
self.assertEquals(f.write("def"), 3)
|
||||
self.assertEquals(f.seek(cookie), cookie)
|
||||
self.assertEquals(f.read(), "def")
|
||||
if enc.startswith("utf"):
|
||||
self.multi_line_test(f, enc)
|
||||
f.close()
|
||||
|
||||
def multi_line_test(self, f, enc):
|
||||
f.seek(0)
|
||||
f.truncate()
|
||||
sample = u"s\xff\u0fff\uffff"
|
||||
wlines = []
|
||||
for size in (0, 1, 2, 3, 4, 5, 15, 16, 17, 31, 32, 33, 63, 64, 65,
|
||||
100, 200, 300, 400, 500, 1000):
|
||||
chars = []
|
||||
for i in xrange(size):
|
||||
chars.append(sample[i % len(sample)])
|
||||
line = u"".join(chars) + "\n"
|
||||
wlines.append((f.tell(), line))
|
||||
f.write(line)
|
||||
wendpos = f.tell()
|
||||
f.seek(0)
|
||||
rlines = []
|
||||
while True:
|
||||
pos = f.tell()
|
||||
line = f.readline()
|
||||
if not line:
|
||||
rendpos = pos
|
||||
break
|
||||
rlines.append((pos, line))
|
||||
self.assertEquals(rendpos, wendpos)
|
||||
self.assertEquals(rlines, wlines)
|
||||
|
||||
|
||||
# XXX Tests for open()
|
||||
|
||||
def test_main():
|
||||
|
Loading…
x
Reference in New Issue
Block a user