Skip to content

Commit d7a0ad7

Browse files
bpo-34010: Fix tarfile read performance regression (GH-8020)
During buffered reads, use a list followed by a join instead of extending a bytes object. This is how it was done before, but it was changed in commit b506dc3. (cherry picked from commit 12a08c4) Co-authored-by: hajoscher <[email protected]>
1 parent de6a2de commit d7a0ad7

File tree

2 files changed

+13
-9
lines changed

2 files changed

+13
-9
lines changed

‎Lib/tarfile.py‎

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -534,7 +534,7 @@ def read(self, size=None):
534534
if not buf:
535535
break
536536
t.append(buf)
537-
buf = "".join(t)
537+
buf = b"".join(t)
538538
else:
539539
buf = self._read(size)
540540
self.pos += len(buf)
@@ -547,6 +547,7 @@ def _read(self, size):
547547
return self.__read(size)
548548

549549
c = len(self.dbuf)
550+
t = [self.dbuf]
550551
while c < size:
551552
buf = self.__read(self.bufsize)
552553
if not buf:
@@ -555,26 +556,27 @@ def _read(self, size):
555556
buf = self.cmp.decompress(buf)
556557
except self.exception:
557558
raise ReadError("invalid compressed data")
558-
self.dbuf += buf
559+
t.append(buf)
559560
c += len(buf)
560-
buf = self.dbuf[:size]
561-
self.dbuf = self.dbuf[size:]
562-
return buf
561+
t = b"".join(t)
562+
self.dbuf = t[size:]
563+
return t[:size]
563564

564565
def __read(self, size):
565566
"""Return size bytes from stream. If internal buffer is empty,
566567
read another block from the stream.
567568
"""
568569
c = len(self.buf)
570+
t = [self.buf]
569571
while c < size:
570572
buf = self.fileobj.read(self.bufsize)
571573
if not buf:
572574
break
573-
self.buf += buf
575+
t.append(buf)
574576
c += len(buf)
575-
buf = self.buf[:size]
576-
self.buf = self.buf[size:]
577-
return buf
577+
t = b"".join(t)
578+
self.buf = t[size:]
579+
return t[:size]
578580
# class _Stream
579581

580582
class _StreamProxy(object):
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fixed a performance regression for reading streams with tarfile. The
2+
buffered read now uses a list instead of appending to a bytes object.

0 commit comments

Comments
 (0)