From 91d387de7df9e19bb5b00e6ad4c94790eb3422e3 Mon Sep 17 00:00:00 2001
From: Damien <damien.p.george@gmail.com>
Date: Wed, 9 Oct 2013 15:09:52 +0100
Subject: [PATCH] Improve indent/dedent error checking and reporting.

---
 py/compile.c |  9 ++++++++-
 py/lexer.c   | 22 ++++++++++++++++------
 py/lexer.h   | 28 +++++++++++++++-------------
 py/main.c    | 12 +++++++-----
 py/parse.c   | 18 +++++++++++++-----
 5 files changed, 59 insertions(+), 30 deletions(-)

diff --git a/py/compile.c b/py/compile.c
index f4a5886b0..3d5a29a19 100644
--- a/py/compile.c
+++ b/py/compile.c
@@ -2192,7 +2192,14 @@ void compile_node(compiler_t *comp, py_parse_node_t pn) {
             case PY_PARSE_NODE_DECIMAL: EMIT(load_const_dec, arg); break;
             case PY_PARSE_NODE_STRING: EMIT(load_const_str, arg, false); break;
             case PY_PARSE_NODE_BYTES: EMIT(load_const_str, arg, true); break;
-            case PY_PARSE_NODE_TOKEN: EMIT(load_const_tok, arg); break;
+            case PY_PARSE_NODE_TOKEN:
+                if (arg == PY_TOKEN_NEWLINE) {
+                    // this can occur when file_input lets through a NEWLINE (eg if file starts with a newline)
+                    // do nothing
+                } else {
+                    EMIT(load_const_tok, arg);
+                }
+                break;
             default: assert(0);
         }
     } else {
diff --git a/py/lexer.c b/py/lexer.c
index 9c2195ef5..656dc6d32 100644
--- a/py/lexer.c
+++ b/py/lexer.c
@@ -331,9 +331,7 @@ static void py_lexer_next_token_into(py_lexer_t *lex, py_token_t *tok) {
         tok->kind = PY_TOKEN_INDENT;
         lex->emit_dent -= 1;
 
-    } else if (had_physical_newline && lex->nested_bracket_level == 0
-                   && tok != &lex->tok_cur // so that we don't emit a newline if file starts with a comment
-               ) {
+    } else if (had_physical_newline && lex->nested_bracket_level == 0) {
         tok->kind = PY_TOKEN_NEWLINE;
 
         uint num_spaces = lex->column - 1;
@@ -348,12 +346,11 @@ static void py_lexer_next_token_into(py_lexer_t *lex, py_token_t *tok) {
                 lex->emit_dent -= 1;
             }
             if (num_spaces != indent_top(lex)) {
-                //SyntaxError
+                tok->kind = PY_TOKEN_DEDENT_MISMATCH;
             }
         }
 
     } else if (is_end(lex)) {
-        // TODO emit a newline if file does not end in one
         if (indent_top(lex) > 0) {
             tok->kind = PY_TOKEN_NEWLINE;
             lex->emit_dent = 0;
@@ -613,7 +610,15 @@ py_lexer_t *py_lexer_from_str_len(const char *src_name, const char *str, uint le
     }
 
     py_lexer_next_token_into(lex, &lex->tok_cur);
-    py_lexer_next_token_into(lex, &lex->tok_next);
+
+    // check that the first token is in the first column
+    // (done to get equivalence with CPython)
+    if (lex->tok_cur.src_line == 1 && lex->tok_cur.src_column != 1) {
+        lex->tok_next = lex->tok_cur;
+        lex->tok_cur.kind = PY_TOKEN_INDENT;
+    } else {
+        py_lexer_next_token_into(lex, &lex->tok_next);
+    }
 
     return lex;
 }
@@ -675,3 +680,8 @@ bool py_lexer_opt_str(py_lexer_t *lex, const char *str) {
 bool py_lexer_show_error(py_lexer_t *lex, const char *msg) {
     return py_token_show_error(&lex->tok_cur, msg);
 }
+
+bool py_lexer_show_error_pythonic(py_lexer_t *lex, const char *msg) {
+    printf("  File \"%s\", line %d, column %d\n%s\n", lex->tok_cur.src_name, lex->tok_cur.src_line, lex->tok_cur.src_column, msg);
+    return false;
+}
diff --git a/py/lexer.h b/py/lexer.h
index 32ab48a08..948901259 100644
--- a/py/lexer.h
+++ b/py/lexer.h
@@ -12,20 +12,21 @@ typedef enum _py_token_kind_t {
     PY_TOKEN_END,                   // 0
 
     PY_TOKEN_INVALID,
+    PY_TOKEN_DEDENT_MISMATCH,
     PY_TOKEN_LONELY_STRING_OPEN,
 
-    PY_TOKEN_NEWLINE,               // 3
-    PY_TOKEN_INDENT,                // 4
-    PY_TOKEN_DEDENT,                // 5
+    PY_TOKEN_NEWLINE,               // 4
+    PY_TOKEN_INDENT,                // 5
+    PY_TOKEN_DEDENT,                // 6
 
-    PY_TOKEN_NAME,                  // 6
+    PY_TOKEN_NAME,                  // 7
     PY_TOKEN_NUMBER,
     PY_TOKEN_STRING,
     PY_TOKEN_BYTES,
 
     PY_TOKEN_ELLIPSES,
 
-    PY_TOKEN_KW_FALSE,              // 11
+    PY_TOKEN_KW_FALSE,              // 12
     PY_TOKEN_KW_NONE,
     PY_TOKEN_KW_TRUE,
     PY_TOKEN_KW_AND,
@@ -34,7 +35,7 @@ typedef enum _py_token_kind_t {
     PY_TOKEN_KW_BREAK,
     PY_TOKEN_KW_CLASS,
     PY_TOKEN_KW_CONTINUE,
-    PY_TOKEN_KW_DEF,                // 20
+    PY_TOKEN_KW_DEF,                // 21
     PY_TOKEN_KW_DEL,
     PY_TOKEN_KW_ELIF,
     PY_TOKEN_KW_ELSE,
@@ -44,7 +45,7 @@ typedef enum _py_token_kind_t {
     PY_TOKEN_KW_FROM,
     PY_TOKEN_KW_GLOBAL,
     PY_TOKEN_KW_IF,
-    PY_TOKEN_KW_IMPORT,             // 30
+    PY_TOKEN_KW_IMPORT,             // 31
     PY_TOKEN_KW_IN,
     PY_TOKEN_KW_IS,
     PY_TOKEN_KW_LAMBDA,
@@ -54,12 +55,12 @@ typedef enum _py_token_kind_t {
     PY_TOKEN_KW_PASS,
     PY_TOKEN_KW_RAISE,
     PY_TOKEN_KW_RETURN,
-    PY_TOKEN_KW_TRY,                // 40
+    PY_TOKEN_KW_TRY,                // 41
     PY_TOKEN_KW_WHILE,
     PY_TOKEN_KW_WITH,
     PY_TOKEN_KW_YIELD,
 
-    PY_TOKEN_OP_PLUS,               // 44
+    PY_TOKEN_OP_PLUS,               // 45
     PY_TOKEN_OP_MINUS,
     PY_TOKEN_OP_STAR,
     PY_TOKEN_OP_DBL_STAR,
@@ -69,7 +70,7 @@ typedef enum _py_token_kind_t {
     PY_TOKEN_OP_LESS,
     PY_TOKEN_OP_DBL_LESS,
     PY_TOKEN_OP_MORE,
-    PY_TOKEN_OP_DBL_MORE,           // 54
+    PY_TOKEN_OP_DBL_MORE,           // 55
     PY_TOKEN_OP_AMPERSAND,
     PY_TOKEN_OP_PIPE,
     PY_TOKEN_OP_CARET,
@@ -79,7 +80,7 @@ typedef enum _py_token_kind_t {
     PY_TOKEN_OP_DBL_EQUAL,
     PY_TOKEN_OP_NOT_EQUAL,
 
-    PY_TOKEN_DEL_PAREN_OPEN,        // 63
+    PY_TOKEN_DEL_PAREN_OPEN,        // 64
     PY_TOKEN_DEL_PAREN_CLOSE,
     PY_TOKEN_DEL_BRACKET_OPEN,
     PY_TOKEN_DEL_BRACKET_CLOSE,
@@ -89,7 +90,7 @@ typedef enum _py_token_kind_t {
     PY_TOKEN_DEL_COLON,
     PY_TOKEN_DEL_PERIOD,
     PY_TOKEN_DEL_SEMICOLON,
-    PY_TOKEN_DEL_AT,                // 73
+    PY_TOKEN_DEL_AT,                // 74
     PY_TOKEN_DEL_EQUAL,
     PY_TOKEN_DEL_PLUS_EQUAL,
     PY_TOKEN_DEL_MINUS_EQUAL,
@@ -99,7 +100,7 @@ typedef enum _py_token_kind_t {
     PY_TOKEN_DEL_PERCENT_EQUAL,
     PY_TOKEN_DEL_AMPERSAND_EQUAL,
     PY_TOKEN_DEL_PIPE_EQUAL,
-    PY_TOKEN_DEL_CARET_EQUAL,       // 83
+    PY_TOKEN_DEL_CARET_EQUAL,       // 84
     PY_TOKEN_DEL_DBL_MORE_EQUAL,
     PY_TOKEN_DEL_DBL_LESS_EQUAL,
     PY_TOKEN_DEL_DBL_STAR_EQUAL,
@@ -137,5 +138,6 @@ bool py_lexer_opt_kind(py_lexer_t *lex, py_token_kind_t kind);
 bool py_lexer_opt_str(py_lexer_t *lex, const char *str);
 */
 bool py_lexer_show_error(py_lexer_t *lex, const char *msg);
+bool py_lexer_show_error_pythonic(py_lexer_t *lex, const char *msg);
 
 #endif /* INCLUDED_LEXER_H */
diff --git a/py/main.c b/py/main.c
index 7b17c38a8..cca52c4e7 100644
--- a/py/main.c
+++ b/py/main.c
@@ -31,11 +31,13 @@ int main(int argc, char **argv) {
         }
     } else {
         py_parse_node_t pn = py_parse(lex, 0);
-        //printf("----------------\n");
-        //parse_node_show(pn, 0);
-        //printf("----------------\n");
-        py_compile(pn);
-        //printf("----------------\n");
+        if (pn != PY_PARSE_NODE_NULL) {
+            //printf("----------------\n");
+            //parse_node_show(pn, 0);
+            //printf("----------------\n");
+            py_compile(pn);
+            //printf("----------------\n");
+        }
     }
 
     py_lexer_free(lex);
diff --git a/py/parse.c b/py/parse.c
index 5eb4b27db..124d00ffe 100644
--- a/py/parse.c
+++ b/py/parse.c
@@ -545,10 +545,12 @@ py_parse_node_t py_parse(py_lexer_t *lex, int wanted_rule) {
                 assert(0);
         }
     }
+
+    // check we are at the end of the token stream
     if (!py_lexer_is_kind(lex, PY_TOKEN_END)) {
-        py_lexer_show_error(lex, "unexpected token at end:");
-        py_token_show(py_lexer_cur(lex));
+        goto syntax_error;
     }
+
     //printf("--------------\n");
     //result_stack_show(parser);
     assert(parser->result_stack_top == 1);
@@ -557,10 +559,16 @@ py_parse_node_t py_parse(py_lexer_t *lex, int wanted_rule) {
     return parser->result_stack[0];
 
 syntax_error:
-    py_lexer_show_error(lex, "syntax error:");
+    if (py_lexer_is_kind(lex, PY_TOKEN_INDENT)) {
+        py_lexer_show_error_pythonic(lex, "IndentationError: unexpected indent");
+    } else if (py_lexer_is_kind(lex, PY_TOKEN_DEDENT_MISMATCH)) {
+        py_lexer_show_error_pythonic(lex, "IndentationError: unindent does not match any outer indentation level");
+    } else {
+        py_lexer_show_error_pythonic(lex, "syntax error:");
 #ifdef USE_RULE_NAME
-    py_lexer_show_error(lex, rule->rule_name);
+        py_lexer_show_error(lex, rule->rule_name);
 #endif
-    py_token_show(py_lexer_cur(lex));
+        py_token_show(py_lexer_cur(lex));
+    }
     return PY_PARSE_NODE_NULL;
 }
-- 
GitLab