Skip to content

Commit 12a08c4

Browse files
hajoschermethane
authored and committed
bpo-34010: Fix tarfile read performance regression (GH-8020)
During a buffered read, use a list followed by a join instead of extending a bytes object. This is how it was done before, but it was changed in commit b506dc3.
1 parent 97ae32c commit 12a08c4

File tree

2 files changed

+13
-9
lines changed

2 files changed

+13
-9
lines changed

‎Lib/tarfile.py‎

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -525,7 +525,7 @@ def read(self, size=None):
525525
if not buf:
526526
break
527527
t.append(buf)
528-
buf = "".join(t)
528+
buf = b"".join(t)
529529
else:
530530
buf = self._read(size)
531531
self.pos += len(buf)
@@ -538,6 +538,7 @@ def _read(self, size):
538538
return self.__read(size)
539539

540540
c = len(self.dbuf)
541+
t = [self.dbuf]
541542
while c < size:
542543
buf = self.__read(self.bufsize)
543544
if not buf:
@@ -546,26 +547,27 @@ def _read(self, size):
546547
buf = self.cmp.decompress(buf)
547548
except self.exception:
548549
raise ReadError("invalid compressed data")
549-
self.dbuf += buf
550+
t.append(buf)
550551
c += len(buf)
551-
buf = self.dbuf[:size]
552-
self.dbuf = self.dbuf[size:]
553-
return buf
552+
t = b"".join(t)
553+
self.dbuf = t[size:]
554+
return t[:size]
554555

555556
def __read(self, size):
556557
"""Return size bytes from stream. If internal buffer is empty,
557558
read another block from the stream.
558559
"""
559560
c = len(self.buf)
561+
t = [self.buf]
560562
while c < size:
561563
buf = self.fileobj.read(self.bufsize)
562564
if not buf:
563565
break
564-
self.buf += buf
566+
t.append(buf)
565567
c += len(buf)
566-
buf = self.buf[:size]
567-
self.buf = self.buf[size:]
568-
return buf
568+
t = b"".join(t)
569+
self.buf = t[size:]
570+
return t[:size]
569571
# class _Stream
570572

571573
class _StreamProxy(object):
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fixed a performance regression for reading streams with tarfile. The
2+
buffered read should use a list, instead of appending to a bytes object.

0 commit comments

Comments
 (0)