Skip to content

Commit c1b75b5

Browse files
bpo-34010: Fix tarfile read performance regression (GH-8020)
During buffered reads, collect the chunks in a list and join them once at the end instead of repeatedly extending a bytes object. This is how it was done before commit b506dc3 changed it. (cherry picked from commit 12a08c4) Co-authored-by: hajoscher <[email protected]>
1 parent 2cbd1bb commit c1b75b5

File tree

2 files changed

+13
-9
lines changed

2 files changed

+13
-9
lines changed

‎Lib/tarfile.py‎

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -532,7 +532,7 @@ def read(self, size=None):
532532
if not buf:
533533
break
534534
t.append(buf)
535-
buf = "".join(t)
535+
buf = b"".join(t)
536536
else:
537537
buf = self._read(size)
538538
self.pos += len(buf)
@@ -545,6 +545,7 @@ def _read(self, size):
545545
return self.__read(size)
546546

547547
c = len(self.dbuf)
548+
t = [self.dbuf]
548549
while c < size:
549550
buf = self.__read(self.bufsize)
550551
if not buf:
@@ -553,26 +554,27 @@ def _read(self, size):
553554
buf = self.cmp.decompress(buf)
554555
except self.exception:
555556
raise ReadError("invalid compressed data")
556-
self.dbuf += buf
557+
t.append(buf)
557558
c += len(buf)
558-
buf = self.dbuf[:size]
559-
self.dbuf = self.dbuf[size:]
560-
return buf
559+
t = b"".join(t)
560+
self.dbuf = t[size:]
561+
return t[:size]
561562

562563
def __read(self, size):
563564
"""Return size bytes from stream. If internal buffer is empty,
564565
read another block from the stream.
565566
"""
566567
c = len(self.buf)
568+
t = [self.buf]
567569
while c < size:
568570
buf = self.fileobj.read(self.bufsize)
569571
if not buf:
570572
break
571-
self.buf += buf
573+
t.append(buf)
572574
c += len(buf)
573-
buf = self.buf[:size]
574-
self.buf = self.buf[size:]
575-
return buf
575+
t = b"".join(t)
576+
self.buf = t[size:]
577+
return t[:size]
576578
# class _Stream
577579

578580
class _StreamProxy(object):
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fixed a performance regression for reading streams with tarfile. The
2+
buffered read should use a list instead of appending to a bytes object.

0 commit comments

Comments
 (0)