@@ -145,11 +145,15 @@ byte_offset_to_character_offset(PyObject *line, int col_offset)
145145 if (!str ) {
146146 return 0 ;
147147 }
148- PyObject * text = PyUnicode_DecodeUTF8 (str , col_offset , NULL );
148+ PyObject * text = PyUnicode_DecodeUTF8 (str , col_offset , "replace" );
149149 if (!text ) {
150150 return 0 ;
151151 }
152152 Py_ssize_t size = PyUnicode_GET_LENGTH (text );
153+ str = PyUnicode_AsUTF8 (text );
154+ if (str != NULL && (int )strlen (str ) == col_offset ) {
155+ size = strlen (str );
156+ }
153157 Py_DECREF (text );
154158 return size ;
155159}
@@ -297,66 +301,21 @@ raise_tokenizer_init_error(PyObject *filename)
297301}
298302
299303static inline PyObject *
300- get_error_line (char * buffer )
301- {
302- char * newline = strchr (buffer , '\n' );
303- if (newline ) {
304- return PyUnicode_FromStringAndSize (buffer , newline - buffer );
305- }
306- else {
307- return PyUnicode_FromString (buffer );
308- }
309- }
310-
311- static int
312- tokenizer_error_with_col_offset (Parser * p , PyObject * errtype , const char * errmsg )
304+ get_error_line (char * buffer , int is_file )
313305{
314- PyObject * errstr = NULL ;
315- PyObject * value = NULL ;
316- size_t col_number = -1 ;
317-
318- errstr = PyUnicode_FromString (errmsg );
319- if (!errstr ) {
320- return -1 ;
321- }
322-
323- PyObject * loc = NULL ;
324- if (p -> start_rule == Py_file_input ) {
325- loc = PyErr_ProgramTextObject (p -> tok -> filename , p -> tok -> lineno );
326- }
327- if (!loc ) {
328- loc = get_error_line (p -> tok -> buf );
306+ const char * newline ;
307+ if (is_file ) {
308+ newline = strrchr (buffer , '\n' );
309+ } else {
310+ newline = strchr (buffer , '\n' );
329311 }
330312
331- if (loc ) {
332- col_number = p -> tok -> cur - p -> tok -> buf ;
313+ if (newline ) {
314+ return PyUnicode_DecodeUTF8 ( buffer , newline - buffer , "replace" ) ;
333315 }
334316 else {
335- Py_INCREF (Py_None );
336- loc = Py_None ;
317+ return PyUnicode_DecodeUTF8 (buffer , strlen (buffer ), "replace" );
337318 }
338-
339- PyObject * tmp = Py_BuildValue ("(OiiN)" , p -> tok -> filename , p -> tok -> lineno ,
340- col_number , loc );
341- if (!tmp ) {
342- goto error ;
343- }
344-
345- value = PyTuple_Pack (2 , errstr , tmp );
346- Py_DECREF (tmp );
347- if (!value ) {
348- goto error ;
349- }
350- PyErr_SetObject (errtype , value );
351-
352- Py_XDECREF (value );
353- Py_XDECREF (errstr );
354- return -1 ;
355-
356- error :
357- Py_XDECREF (errstr );
358- Py_XDECREF (loc );
359- return -1 ;
360319}
361320
362321static int
@@ -376,20 +335,20 @@ tokenizer_error(Parser *p)
376335 msg = "invalid character in identifier" ;
377336 break ;
378337 case E_BADPREFIX :
379- return tokenizer_error_with_col_offset ( p ,
380- errtype , "invalid string prefix" ) ;
338+ RAISE_SYNTAX_ERROR ( "invalid string prefix" );
339+ return -1 ;
381340 case E_EOFS :
382- return tokenizer_error_with_col_offset ( p ,
383- errtype , "EOF while scanning triple-quoted string literal" ) ;
341+ RAISE_SYNTAX_ERROR ( "EOF while scanning triple-quoted string literal" );
342+ return -1 ;
384343 case E_EOLS :
385- return tokenizer_error_with_col_offset ( p ,
386- errtype , "EOL while scanning string literal" ) ;
344+ RAISE_SYNTAX_ERROR ( "EOL while scanning string literal" );
345+ return -1 ;
387346 case E_EOF :
388- return tokenizer_error_with_col_offset ( p ,
389- errtype , "unexpected EOF while parsing" ) ;
347+ RAISE_SYNTAX_ERROR ( "unexpected EOF while parsing" );
348+ return -1 ;
390349 case E_DEDENT :
391- return tokenizer_error_with_col_offset ( p ,
392- PyExc_IndentationError , "unindent does not match any outer indentation level" ) ;
350+ RAISE_INDENTATION_ERROR ( "unindent does not match any outer indentation level" );
351+ return -1 ;
393352 case E_INTR :
394353 if (!PyErr_Occurred ()) {
395354 PyErr_SetNone (PyExc_KeyboardInterrupt );
@@ -421,14 +380,14 @@ tokenizer_error(Parser *p)
421380}
422381
423382void *
424- _PyPegen_raise_error (Parser * p , PyObject * errtype , const char * errmsg , ...)
383+ _PyPegen_raise_error (Parser * p , PyObject * errtype , int with_col_number , const char * errmsg , ...)
425384{
426385 PyObject * value = NULL ;
427386 PyObject * errstr = NULL ;
428387 PyObject * loc = NULL ;
429388 PyObject * tmp = NULL ;
430389 Token * t = p -> tokens [p -> fill - 1 ];
431- Py_ssize_t col_number = 0 ;
390+ Py_ssize_t col_number = ! with_col_number ;
432391 va_list va ;
433392
434393 va_start (va , errmsg );
@@ -443,14 +402,20 @@ _PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...)
443402 }
444403
445404 if (!loc ) {
446- loc = get_error_line (p -> tok -> buf );
405+ loc = get_error_line (p -> tok -> buf , p -> start_rule == Py_file_input );
447406 }
448407
449- if (loc ) {
450- int col_offset = t -> col_offset == -1 ? 0 : t -> col_offset ;
451- col_number = byte_offset_to_character_offset (loc , col_offset ) + 1 ;
408+ if (loc && with_col_number ) {
409+ int col_offset ;
410+ if (t -> col_offset == -1 ) {
411+ col_offset = Py_SAFE_DOWNCAST (p -> tok -> cur - p -> tok -> buf ,
412+ intptr_t , int );
413+ } else {
414+ col_offset = t -> col_offset + 1 ;
415+ }
416+ col_number = byte_offset_to_character_offset (loc , col_offset );
452417 }
453- else {
418+ else if (! loc ) {
454419 Py_INCREF (Py_None );
455420 loc = Py_None ;
456421 }
@@ -632,14 +597,6 @@ _PyPegen_fill_token(Parser *p)
632597 type = PyTokenizer_Get (p -> tok , & start , & end );
633598 }
634599
635- if (type == ERRORTOKEN ) {
636- if (p -> tok -> done == E_DECODE ) {
637- return raise_decode_error (p );
638- }
639- else {
640- return tokenizer_error (p );
641- }
642- }
643600 if (type == ENDMARKER && p -> start_rule == Py_single_input && p -> parsing_started ) {
644601 type = NEWLINE ; /* Add an extra newline */
645602 p -> parsing_started = 0 ;
@@ -700,6 +657,16 @@ _PyPegen_fill_token(Parser *p)
700657 t -> end_col_offset = p -> tok -> lineno == 1 ? p -> starting_col_offset + end_col_offset : end_col_offset ;
701658
702659 p -> fill += 1 ;
660+
661+ if (type == ERRORTOKEN ) {
662+ if (p -> tok -> done == E_DECODE ) {
663+ return raise_decode_error (p );
664+ }
665+ else {
666+ return tokenizer_error (p );
667+ }
668+ }
669+
703670 return 0 ;
704671}
705672