Commit 76c1b4d

bpo-40334: Improve column offsets for thrown syntax errors by Pegen (GH-19782)
1 parent 719e14d commit 76c1b4d

6 files changed: +79 -111 lines changed

‎Grammar/python.gram‎

Lines changed: 1 addition & 1 deletion
@@ -609,7 +609,7 @@ invalid_assignment:
     | expression ':' expression ['=' annotated_rhs] {
         RAISE_SYNTAX_ERROR("illegal target for annotation") }
     | a=expression ('=' | augassign) (yield_expr | star_expressions) {
-        RAISE_SYNTAX_ERROR("cannot assign to %s", _PyPegen_get_expr_name(a)) }
+        RAISE_SYNTAX_ERROR_NO_COL_OFFSET("cannot assign to %s", _PyPegen_get_expr_name(a)) }
 invalid_block:
     | NEWLINE !INDENT { RAISE_INDENTATION_ERROR("expected an indented block") }
 invalid_comprehension:
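
The new RAISE_SYNTAX_ERROR_NO_COL_OFFSET action means the "cannot assign to ..." message no longer carries a token-based column offset. The effect is observable from plain Python by inspecting the SyntaxError attributes; the sketch below is illustrative only, and the exact lineno/offset values depend on which parser is active.

# Minimal sketch: print the SyntaxError location attributes this commit affects.
# Exact values differ between the old parser and pegen.
def show_syntax_error(src):
    try:
        compile(src, "<fragment>", "exec")
    except SyntaxError as e:
        print(f"{e.msg}: line {e.lineno}, offset {e.offset}")
        if e.text and e.offset:
            print("    " + e.text.rstrip("\n"))
            print("    " + " " * (e.offset - 1) + "^")

show_syntax_error("(yield i) = 2")   # goes through the invalid_assignment rule above
show_syntax_error("1 +\n")           # ordinary syntax error, for comparison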

‎Lib/test/test_cmd_line_script.py‎

Lines changed: 3 additions & 3 deletions
@@ -599,7 +599,7 @@ def test_syntaxerror_unindented_caret_position(self):
             exitcode, stdout, stderr = assert_python_failure(script_name)
             text = io.TextIOWrapper(io.BytesIO(stderr), 'ascii').read()
             # Confirm that the caret is located under the first 1 character
-            self.assertIn("\n 1 + 1 = 2\n ^", text)
+            self.assertIn("\n 1 + 1 = 2\n ^", text)

     def test_syntaxerror_indented_caret_position(self):
         script = textwrap.dedent("""\
@@ -611,7 +611,7 @@ def test_syntaxerror_indented_caret_position(self):
             exitcode, stdout, stderr = assert_python_failure(script_name)
             text = io.TextIOWrapper(io.BytesIO(stderr), 'ascii').read()
             # Confirm that the caret is located under the first 1 character
-            self.assertIn("\n 1 + 1 = 2\n ^", text)
+            self.assertIn("\n 1 + 1 = 2\n ^", text)

             # Try the same with a form feed at the start of the indented line
             script = (
@@ -622,7 +622,7 @@ def test_syntaxerror_indented_caret_position(self):
             exitcode, stdout, stderr = assert_python_failure(script_name)
             text = io.TextIOWrapper(io.BytesIO(stderr), "ascii").read()
             self.assertNotIn("\f", text)
-            self.assertIn("\n 1 + 1 = 2\n ^", text)
+            self.assertIn("\n 1 + 1 = 2\n ^", text)

     def test_syntaxerror_multi_line_fstring(self):
         script = 'foo = f"""{}\nfoo"""\n'
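
Only the caret column inside the expected strings changes in these assertions (the exact run of spaces before the ^ is collapsed in this rendering). A rough way to reproduce the same check outside the test suite, using only the standard library; the caret column you see depends on the interpreter's parser:

# Run a script with a bad assignment and show where the traceback puts the caret.
import os
import subprocess
import sys
import tempfile

with tempfile.TemporaryDirectory() as tmp:
    script = os.path.join(tmp, "script.py")
    with open(script, "w") as f:
        f.write("1 + 1 = 2\n")
    proc = subprocess.run([sys.executable, script], capture_output=True, text=True)
    # The last lines of stderr contain the offending line and the caret under it.
    print("\n".join(proc.stderr.splitlines()[-3:]))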

‎Lib/test/test_exceptions.py‎

Lines changed: 21 additions & 21 deletions
@@ -178,19 +178,19 @@ def ckmsg(src, msg, exception=SyntaxError):
         s = '''if True:\n print()\n\texec "mixed tabs and spaces"'''
         ckmsg(s, "inconsistent use of tabs and spaces in indentation", TabError)

-    @support.skip_if_new_parser("Pegen column offsets might be different")
-    def testSyntaxErrorOffset(self):
-        def check(src, lineno, offset, encoding='utf-8'):
-            with self.assertRaises(SyntaxError) as cm:
-                compile(src, '<fragment>', 'exec')
-            self.assertEqual(cm.exception.lineno, lineno)
-            self.assertEqual(cm.exception.offset, offset)
-            if cm.exception.text is not None:
-                if not isinstance(src, str):
-                    src = src.decode(encoding, 'replace')
-                line = src.split('\n')[lineno-1]
-                self.assertIn(line, cm.exception.text)
+    def check(self, src, lineno, offset, encoding='utf-8'):
+        with self.assertRaises(SyntaxError) as cm:
+            compile(src, '<fragment>', 'exec')
+        self.assertEqual(cm.exception.lineno, lineno)
+        self.assertEqual(cm.exception.offset, offset)
+        if cm.exception.text is not None:
+            if not isinstance(src, str):
+                src = src.decode(encoding, 'replace')
+            line = src.split('\n')[lineno-1]
+            self.assertIn(line, cm.exception.text)

+    def testSyntaxErrorOffset(self):
+        check = self.check
         check('def fact(x):\n\treturn x!\n', 2, 10)
         check('1 +\n', 1, 4)
         check('def spam():\n print(1)\n print(2)', 3, 10)
@@ -238,20 +238,20 @@ def baz():
         check('nonlocal x', 1, 1)
         check('def f():\n global x\n nonlocal x', 2, 3)

-        # Errors thrown by ast.c
-        check('for 1 in []: pass', 1, 5)
-        check('def f(*):\n pass', 1, 7)
-        check('[*x for x in xs]', 1, 2)
-        check('def f():\n x, y: int', 2, 3)
-        check('(yield i) = 2', 1, 1)
-        check('foo(x for x in range(10), 100)', 1, 5)
-        check('foo(1=2)', 1, 5)
-
         # Errors thrown by future.c
         check('from __future__ import doesnt_exist', 1, 1)
         check('from __future__ import braces', 1, 1)
         check('x=1\nfrom __future__ import division', 2, 1)

+    @support.skip_if_new_parser("Pegen column offsets might be different")
+    def testSyntaxErrorOffsetCustom(self):
+        self.check('for 1 in []: pass', 1, 5)
+        self.check('def f(*):\n pass', 1, 7)
+        self.check('[*x for x in xs]', 1, 2)
+        self.check('def f():\n x, y: int', 2, 3)
+        self.check('(yield i) = 2', 1, 1)
+        self.check('foo(x for x in range(10), 100)', 1, 5)
+        self.check('foo(1=2)', 1, 5)

     @cpython_only
     def testSettingException(self):
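
The ast.c-era offset expectations move into their own test because pegen can report different columns for these constructs. A quick, assertion-free way to see what your interpreter reports for the same snippets:

# Print the locations reported for the cases split out above; values can
# differ between the old parser and pegen, so nothing is asserted here.
cases = [
    'for 1 in []: pass',
    '[*x for x in xs]',
    '(yield i) = 2',
    'foo(1=2)',
]
for src in cases:
    try:
        compile(src, '<fragment>', 'exec')
    except SyntaxError as e:
        print(f"{src!r:25} -> line {e.lineno}, offset {e.offset}")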

‎Parser/pegen/parse.c‎

Lines changed: 1 addition & 1 deletion
@@ -10515,7 +10515,7 @@ invalid_assignment_rule(Parser *p)
         (_tmp_132_var = _tmp_132_rule(p))
     )
     {
-        res = RAISE_SYNTAX_ERROR ( "cannot assign to %s" , _PyPegen_get_expr_name ( a ) );
+        res = RAISE_SYNTAX_ERROR_NO_COL_OFFSET ( "cannot assign to %s" , _PyPegen_get_expr_name ( a ) );
         if (res == NULL && PyErr_Occurred()) {
             p->error_indicator = 1;
             return NULL;

‎Parser/pegen/pegen.c‎

Lines changed: 47 additions & 80 deletions
@@ -145,11 +145,15 @@ byte_offset_to_character_offset(PyObject *line, int col_offset)
     if (!str) {
         return 0;
     }
-    PyObject *text = PyUnicode_DecodeUTF8(str, col_offset, NULL);
+    PyObject *text = PyUnicode_DecodeUTF8(str, col_offset, "replace");
     if (!text) {
         return 0;
     }
     Py_ssize_t size = PyUnicode_GET_LENGTH(text);
+    str = PyUnicode_AsUTF8(text);
+    if (str != NULL && (int)strlen(str) == col_offset) {
+        size = strlen(str);
+    }
     Py_DECREF(text);
     return size;
 }
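
byte_offset_to_character_offset() converts the tokenizer's byte-based column into a character index so the caret lands correctly on lines containing multi-byte UTF-8; decoding with "replace" keeps error reporting from failing on malformed input. A rough Python model of the conversion (not the C code itself):

# Rough model: count how many characters the first `col_offset` bytes of the
# line's UTF-8 encoding represent.
def byte_offset_to_character_offset(line: str, col_offset: int) -> int:
    return len(line.encode("utf-8")[:col_offset].decode("utf-8", "replace"))

print(byte_offset_to_character_offset("x = 'á' +", 9))  # -> 8 (the á takes two bytes)
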
@@ -297,66 +301,21 @@ raise_tokenizer_init_error(PyObject *filename)
 }

 static inline PyObject *
-get_error_line(char *buffer)
-{
-    char *newline = strchr(buffer, '\n');
-    if (newline) {
-        return PyUnicode_FromStringAndSize(buffer, newline - buffer);
-    }
-    else {
-        return PyUnicode_FromString(buffer);
-    }
-}
-
-static int
-tokenizer_error_with_col_offset(Parser *p, PyObject *errtype, const char *errmsg)
+get_error_line(char *buffer, int is_file)
 {
-    PyObject *errstr = NULL;
-    PyObject *value = NULL;
-    size_t col_number = -1;
-
-    errstr = PyUnicode_FromString(errmsg);
-    if (!errstr) {
-        return -1;
-    }
-
-    PyObject *loc = NULL;
-    if (p->start_rule == Py_file_input) {
-        loc = PyErr_ProgramTextObject(p->tok->filename, p->tok->lineno);
-    }
-    if (!loc) {
-        loc = get_error_line(p->tok->buf);
+    const char *newline;
+    if (is_file) {
+        newline = strrchr(buffer, '\n');
+    } else {
+        newline = strchr(buffer, '\n');
     }

-    if (loc) {
-        col_number = p->tok->cur - p->tok->buf;
+    if (newline) {
+        return PyUnicode_DecodeUTF8(buffer, newline - buffer, "replace");
     }
     else {
-        Py_INCREF(Py_None);
-        loc = Py_None;
+        return PyUnicode_DecodeUTF8(buffer, strlen(buffer), "replace");
     }
-
-    PyObject *tmp = Py_BuildValue("(OiiN)", p->tok->filename, p->tok->lineno,
-                                  col_number, loc);
-    if (!tmp) {
-        goto error;
-    }
-
-    value = PyTuple_Pack(2, errstr, tmp);
-    Py_DECREF(tmp);
-    if (!value) {
-        goto error;
-    }
-    PyErr_SetObject(errtype, value);
-
-    Py_XDECREF(value);
-    Py_XDECREF(errstr);
-    return -1;
-
-error:
-    Py_XDECREF(errstr);
-    Py_XDECREF(loc);
-    return -1;
 }

 static int
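
get_error_line() now takes an is_file flag: for file input it cuts the tokenizer buffer at the last newline, otherwise at the first, and decodes with "replace" so bad UTF-8 cannot break error reporting. A rough Python model, where buffer stands in for the raw p->tok->buf bytes:

# Rough model of the new get_error_line(); mirrors the C logic, nothing more.
def get_error_line(buffer: bytes, is_file: bool) -> str:
    cut = buffer.rfind(b"\n") if is_file else buffer.find(b"\n")
    if cut == -1:
        cut = len(buffer)
    return buffer[:cut].decode("utf-8", "replace")

print(get_error_line(b"x = 1 +\n", is_file=False))  # -> "x = 1 +"
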
@@ -376,20 +335,20 @@ tokenizer_error(Parser *p)
             msg = "invalid character in identifier";
             break;
         case E_BADPREFIX:
-            return tokenizer_error_with_col_offset(p,
-                errtype, "invalid string prefix");
+            RAISE_SYNTAX_ERROR("invalid string prefix");
+            return -1;
         case E_EOFS:
-            return tokenizer_error_with_col_offset(p,
-                errtype, "EOF while scanning triple-quoted string literal");
+            RAISE_SYNTAX_ERROR("EOF while scanning triple-quoted string literal");
+            return -1;
         case E_EOLS:
-            return tokenizer_error_with_col_offset(p,
-                errtype, "EOL while scanning string literal");
+            RAISE_SYNTAX_ERROR("EOL while scanning string literal");
+            return -1;
         case E_EOF:
-            return tokenizer_error_with_col_offset(p,
-                errtype, "unexpected EOF while parsing");
+            RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
+            return -1;
         case E_DEDENT:
-            return tokenizer_error_with_col_offset(p,
-                PyExc_IndentationError, "unindent does not match any outer indentation level");
+            RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
+            return -1;
         case E_INTR:
             if (!PyErr_Occurred()) {
                 PyErr_SetNone(PyExc_KeyboardInterrupt);
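
These tokenizer-level failures now go through the regular RAISE_SYNTAX_ERROR / RAISE_INDENTATION_ERROR path, so they get the same location handling as parser errors. A small way to see what your interpreter reports for a couple of them; messages and offsets vary between Python versions and parsers, so nothing specific is asserted:

# Observe tokenizer-level errors from Python; IndentationError is a SyntaxError
# subclass, so one except clause covers both.
for src in ['s = "abc', 's = """abc', 'if True:\n  pass\n pass\n']:
    try:
        compile(src, '<fragment>', 'exec')
    except SyntaxError as e:
        print(f"{type(e).__name__}: {e.msg} (line {e.lineno}, offset {e.offset})")
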
@@ -421,14 +380,14 @@ tokenizer_error(Parser *p)
 }

 void *
-_PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...)
+_PyPegen_raise_error(Parser *p, PyObject *errtype, int with_col_number, const char *errmsg, ...)
 {
     PyObject *value = NULL;
     PyObject *errstr = NULL;
     PyObject *loc = NULL;
     PyObject *tmp = NULL;
     Token *t = p->tokens[p->fill - 1];
-    Py_ssize_t col_number = 0;
+    Py_ssize_t col_number = !with_col_number;
     va_list va;

     va_start(va, errmsg);
@@ -443,14 +402,20 @@ _PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...)
     }

     if (!loc) {
-        loc = get_error_line(p->tok->buf);
+        loc = get_error_line(p->tok->buf, p->start_rule == Py_file_input);
     }

-    if (loc) {
-        int col_offset = t->col_offset == -1 ? 0 : t->col_offset;
-        col_number = byte_offset_to_character_offset(loc, col_offset) + 1;
+    if (loc && with_col_number) {
+        int col_offset;
+        if (t->col_offset == -1) {
+            col_offset = Py_SAFE_DOWNCAST(p->tok->cur - p->tok->buf,
+                                          intptr_t, int);
+        } else {
+            col_offset = t->col_offset + 1;
+        }
+        col_number = byte_offset_to_character_offset(loc, col_offset);
     }
-    else {
+    else if (!loc) {
         Py_INCREF(Py_None);
         loc = Py_None;
     }
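
Taken together, _PyPegen_raise_error now picks the reported column like this: with_col_number == 0 (the NO_COL_OFFSET macro) pins the offset to 1; otherwise it uses the failing token's column when known, falls back to the tokenizer's current position, and converts the byte offset to characters. A simplified Python model of that decision (not the C code; the fallback's pointer arithmetic is reduced to a plain argument):

# Simplified model of the offset selection above. `token_col` is the 0-based
# byte column of the failing token, or None when unknown; `fallback_col` stands
# in for the tokenizer's current byte position.
def reported_offset(line: str, token_col, with_col_number=True, fallback_col=0):
    if not with_col_number:                 # RAISE_SYNTAX_ERROR_NO_COL_OFFSET
        return 1
    byte_col = fallback_col if token_col is None else token_col + 1
    return len(line.encode("utf-8")[:byte_col].decode("utf-8", "replace"))

print(reported_offset("x = 'á' +", token_col=8))                          # -> 8
print(reported_offset("(yield i) = 2", token_col=0, with_col_number=False))  # -> 1
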
@@ -632,14 +597,6 @@ _PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...)
         type = PyTokenizer_Get(p->tok, &start, &end);
     }

-    if (type == ERRORTOKEN) {
-        if (p->tok->done == E_DECODE) {
-            return raise_decode_error(p);
-        }
-        else {
-            return tokenizer_error(p);
-        }
-    }
     if (type == ENDMARKER && p->start_rule == Py_single_input && p->parsing_started) {
         type = NEWLINE; /* Add an extra newline */
         p->parsing_started = 0;
@@ -700,6 +657,16 @@ _PyPegen_fill_token(Parser *p)
     t->end_col_offset = p->tok->lineno == 1 ? p->starting_col_offset + end_col_offset : end_col_offset;

     p->fill += 1;
+
+    if (type == ERRORTOKEN) {
+        if (p->tok->done == E_DECODE) {
+            return raise_decode_error(p);
+        }
+        else {
+            return tokenizer_error(p);
+        }
+    }
+
     return 0;
 }

‎Parser/pegen/pegen.h‎

Lines changed: 6 additions & 5 deletions
@@ -126,14 +126,15 @@ expr_ty _PyPegen_name_token(Parser *p);
 expr_ty _PyPegen_number_token(Parser *p);
 void *_PyPegen_string_token(Parser *p);
 const char *_PyPegen_get_expr_name(expr_ty);
-void *_PyPegen_raise_error(Parser *p, PyObject *, const char *errmsg, ...);
+void *_PyPegen_raise_error(Parser *p, PyObject *errtype, int with_col_number, const char *errmsg, ...);
 void *_PyPegen_dummy_name(Parser *p, ...);

 #define UNUSED(expr) do { (void)(expr); } while (0)
 #define EXTRA_EXPR(head, tail) head->lineno, head->col_offset, tail->end_lineno, tail->end_col_offset, p->arena
 #define EXTRA start_lineno, start_col_offset, end_lineno, end_col_offset, p->arena
-#define RAISE_SYNTAX_ERROR(msg, ...) _PyPegen_raise_error(p, PyExc_SyntaxError, msg, ##__VA_ARGS__)
-#define RAISE_INDENTATION_ERROR(msg, ...) _PyPegen_raise_error(p, PyExc_IndentationError, msg, ##__VA_ARGS__)
+#define RAISE_SYNTAX_ERROR(msg, ...) _PyPegen_raise_error(p, PyExc_SyntaxError, 1, msg, ##__VA_ARGS__)
+#define RAISE_INDENTATION_ERROR(msg, ...) _PyPegen_raise_error(p, PyExc_IndentationError, 1, msg, ##__VA_ARGS__)
+#define RAISE_SYNTAX_ERROR_NO_COL_OFFSET(msg, ...) _PyPegen_raise_error(p, PyExc_SyntaxError, 0, msg, ##__VA_ARGS__)

 Py_LOCAL_INLINE(void *)
 CHECK_CALL(Parser *p, void *result)
@@ -190,8 +191,8 @@ INVALID_VERSION_CHECK(Parser *p, int version, char *msg, void *node)
     }
     if (p->feature_version < version) {
         p->error_indicator = 1;
-        return _PyPegen_raise_error(p, PyExc_SyntaxError, "%s only supported in Python 3.%i and greater",
-                                    msg, version);
+        return RAISE_SYNTAX_ERROR("%s only supported in Python 3.%i and greater",
+                                  msg, version);
     }
     return node;
 }
