changeset:   96714:f2f5d1c928eb
parent:      96705:0e1d9018e74b
parent:      96713:98380a6e037c
user:        Jason R. Coombs
date:        Sun Jun 28 11:15:13 2015 -0400
files:       Misc/NEWS
description:
Issue #20387: Merge with 3.5


diff -r 0e1d9018e74b -r f2f5d1c928eb Lib/test/test_tokenize.py
--- a/Lib/test/test_tokenize.py	Sun Jun 28 17:56:28 2015 +0300
+++ b/Lib/test/test_tokenize.py	Sun Jun 28 11:15:13 2015 -0400
@@ -5,6 +5,8 @@
 code, print out a table with tokens. The ENDMARKER is omitted for
 brevity.
 
+    >>> import glob
+
     >>> dump_tokens("1 + 1")
     ENCODING   'utf-8'       (0, 0) (0, 0)
     NUMBER     '1'           (1, 0) (1, 1)
@@ -835,7 +837,7 @@
                      open as tokenize_open, Untokenizer)
 from io import BytesIO
 from unittest import TestCase, mock
-import os, sys, glob
+import os
 import token
 
 def dump_tokens(s):
@@ -1427,6 +1429,22 @@
         self.assertEqual(untokenize(iter(tokens)), b'Hello ')
 
 
+class TestRoundtrip(TestCase):
+    def roundtrip(self, code):
+        if isinstance(code, str):
+            code = code.encode('utf-8')
+        return untokenize(tokenize(BytesIO(code).readline)).decode('utf-8')
+
+    def test_indentation_semantics_retained(self):
+        """
+        Ensure that although whitespace might be mutated in a roundtrip,
+        the semantic meaning of the indentation remains consistent.
+        """
+        code = "if False:\n\tx=3\n\tx=3\n"
+        codelines = self.roundtrip(code).split('\n')
+        self.assertEqual(codelines[1], codelines[2])
+
+
 __test__ = {"doctests" : doctests, 'decistmt': decistmt}
 
 def test_main():
@@ -1437,6 +1455,7 @@
     support.run_unittest(TestDetectEncoding)
     support.run_unittest(TestTokenize)
     support.run_unittest(UntokenizeTest)
+    support.run_unittest(TestRoundtrip)
 
 if __name__ == "__main__":
     test_main()
diff -r 0e1d9018e74b -r f2f5d1c928eb Lib/tokenize.py
--- a/Lib/tokenize.py	Sun Jun 28 17:56:28 2015 +0300
+++ b/Lib/tokenize.py	Sun Jun 28 11:15:13 2015 -0400
@@ -244,6 +244,8 @@
 
     def untokenize(self, iterable):
         it = iter(iterable)
+        indents = []
+        startline = False
         for t in it:
             if len(t) == 2:
                 self.compat(t, it)
@@ -254,6 +256,21 @@
                 continue
             if tok_type == ENDMARKER:
                 break
+            if tok_type == INDENT:
+                indents.append(token)
+                continue
+            elif tok_type == DEDENT:
+                indents.pop()
+                self.prev_row, self.prev_col = end
+                continue
+            elif tok_type in (NEWLINE, NL):
+                startline = True
+            elif startline and indents:
+                indent = indents[-1]
+                if start[1] >= len(indent):
+                    self.tokens.append(indent)
+                    self.prev_col = len(indent)
+                startline = False
             self.add_whitespace(start)
             self.tokens.append(token)
             self.prev_row, self.prev_col = end
diff -r 0e1d9018e74b -r f2f5d1c928eb Misc/NEWS
--- a/Misc/NEWS	Sun Jun 28 17:56:28 2015 +0300
+++ b/Misc/NEWS	Sun Jun 28 11:15:13 2015 -0400
@@ -47,6 +47,9 @@
 Library
 -------
 
+- Issue #20387: Restore semantic round-trip correctness in tokenize/untokenize
+  for tab-indented blocks.
+
 - Issue #24456: Fixed possible buffer over-read in adpcm2lin() and lin2adpcm()
   functions of the audioop module.
 