diff --git a/py/compile.c b/py/compile.c
index c1d49102be38d0f830e465cef711e8d36b82ec2a..0967c855c4eb351cf8fd11afdd47a8f511d13109 100644
--- a/py/compile.c
+++ b/py/compile.c
@@ -16,6 +16,8 @@
 
 // TODO need to mangle __attr names
 
+#define MICROPY_EMIT_NATIVE (MICROPY_EMIT_X64 || MICROPY_EMIT_THUMB)
+
 typedef enum {
     PN_none = 0,
 #define DEF_RULE(rule, comp, kind, arg...) PN_##rule,
@@ -853,16 +855,19 @@ static bool compile_built_in_decorator(compiler_t *comp, int name_len, py_parse_
     }
 
     qstr attr = PY_PARSE_NODE_LEAF_ARG(name_nodes[1]);
-    if (attr == comp->qstr_native) {
+    if (0) {
+#if MICROPY_EMIT_NATIVE
+    } else if (attr == comp->qstr_native) {
         *emit_options = EMIT_OPT_NATIVE_PYTHON;
     } else if (attr == comp->qstr_viper) {
         *emit_options = EMIT_OPT_VIPER;
+#endif
 #if MICROPY_EMIT_INLINE_THUMB
     } else if (attr == comp->qstr_asm_thumb) {
         *emit_options = EMIT_OPT_ASM_THUMB;
 #endif
     } else {
-        printf("SyntaxError: invalid micropython decorator\n");
+        printf("SyntaxError: invalid micropython decorator '%s'\n", qstr_str(attr));
     }
 
     return true;
@@ -1302,15 +1307,16 @@ void compile_while_stmt(compiler_t *comp, py_parse_node_struct_t *pns) {
     int old_break_label = comp->break_label;
     int old_continue_label = comp->continue_label;
 
-    int done_label = comp_next_label(comp);
-    int end_label = comp_next_label(comp);
     int break_label = comp_next_label(comp);
     int continue_label = comp_next_label(comp);
 
     comp->break_label = break_label;
     comp->continue_label = continue_label;
 
-    EMIT(setup_loop, end_label);
+    // compared to CPython, we have an optimised version of while loops
+#if MICROPY_EMIT_CPYTHON
+    int done_label = comp_next_label(comp);
+    EMIT(setup_loop, break_label);
     EMIT(label_assign, continue_label);
     c_if_cond(comp, pns->nodes[0], false, done_label); // condition
     compile_node(comp, pns->nodes[1]); // body
@@ -1318,21 +1324,27 @@ void compile_while_stmt(compiler_t *comp, py_parse_node_struct_t *pns) {
         EMIT(jump, continue_label);
     }
     EMIT(label_assign, done_label);
-
-    // break/continue apply to outer loop (if any) in the else block
-    comp->break_label = old_break_label;
-    comp->continue_label = old_continue_label;
-
     // CPython does not emit POP_BLOCK if the condition was a constant; don't undertand why
     // this is a small hack to agree with CPython
     if (!node_is_const_true(pns->nodes[0])) {
         EMIT(pop_block);
     }
+#else
+    int top_label = comp_next_label(comp);
+    EMIT(jump, continue_label);
+    EMIT(label_assign, top_label);
+    compile_node(comp, pns->nodes[1]); // body
+    EMIT(label_assign, continue_label);
+    c_if_cond(comp, pns->nodes[0], true, top_label); // condition
+#endif
+
+    // break/continue apply to outer loop (if any) in the else block
+    comp->break_label = old_break_label;
+    comp->continue_label = old_continue_label;
 
     compile_node(comp, pns->nodes[2]); // else
 
     EMIT(label_assign, break_label);
-    EMIT(label_assign, end_label);
 }
 
 void compile_for_stmt(compiler_t *comp, py_parse_node_struct_t *pns) {
@@ -1348,7 +1360,11 @@ void compile_for_stmt(compiler_t *comp, py_parse_node_struct_t *pns) {
     comp->continue_label = for_label;
     comp->break_label = break_label;
 
+    // I don't think our implementation needs SETUP_LOOP/POP_BLOCK for for-statements
+#if MICROPY_EMIT_CPYTHON
     EMIT(setup_loop, end_label);
+#endif
+
     compile_node(comp, pns->nodes[1]); // iterator
     EMIT(get_iter);
     EMIT(label_assign, for_label);
@@ -1365,7 +1381,9 @@ void compile_for_stmt(compiler_t *comp, py_parse_node_struct_t *pns) {
     comp->break_label = old_break_label;
     comp->continue_label = old_continue_label;
 
+#if MICROPY_EMIT_CPYTHON
     EMIT(pop_block);
+#endif
 
     compile_node(comp, pns->nodes[3]); // else (not tested)
 
diff --git a/py/nlr.h b/py/nlr.h
new file mode 100644
index 0000000000000000000000000000000000000000..b824e4a63a3a58e3ac3caa3a15f02522f41dbb96
--- /dev/null
+++ b/py/nlr.h
@@ -0,0 +1,38 @@
+// non-local return
+// exception handling, basically a stack of setjmp/longjmp buffers
+//
+// nlr_push() links a buffer onto the stack and returns 0.  A later nlr_jump(val)
+// unlinks the top buffer, stores val in its ret_val and resumes execution as if
+// that nlr_push() call had just returned 1.
+
+#ifndef INCLUDED_NLR_H
+#define INCLUDED_NLR_H
+
+#include <limits.h>
+
+#ifndef __WORDSIZE
+#error __WORDSIZE needs to be defined
+#endif
+
+typedef struct _nlr_buf_t nlr_buf_t;
+struct _nlr_buf_t {
+    // the entries here must all be machine word size
+    nlr_buf_t *prev; // buffer that was on top of the stack before this one was pushed
+    void *ret_val; // value handed to nlr_jump
+#if __WORDSIZE == 32
+    void *regs[6]; // ip, bp, sp, bx, di, si (layout used by nlrx86.s)
+#elif __WORDSIZE == 64
+    void *regs[8]; // ip, bp, sp, bx, r12, r13, r14, r15 (layout used by nlrx64.s)
+#else
+#error Unsupported __WORDSIZE
+#endif
+};
+
+// push nlr onto the handler stack; returns 0 directly, 1 when resumed by nlr_jump
+unsigned int nlr_push(nlr_buf_t *);
+// unlink the top buffer without jumping (call on the normal exit path)
+void nlr_pop(void);
+// jump to the top buffer's nlr_push site, passing val in ret_val; never returns
+void nlr_jump(void *val) __attribute__((noreturn));
+
+#endif // INCLUDED_NLR_H
diff --git a/py/nlrx64.s b/py/nlrx64.s
new file mode 100644
index 0000000000000000000000000000000000000000..cf0eb3cd718231fa0f169359e8d0030e60aab02d
--- /dev/null
+++ b/py/nlrx64.s
@@ -0,0 +1,60 @@
+# x64 callee save: bx, bp, sp, r12, r13, r14, r15
+
+    .file   "nlr.s"
+    .text
+
+# uint nlr_push(rdi=nlr_buf_t *nlr)
+    .globl  nlr_push
+    .type   nlr_push, @function
+nlr_push:
+    movq    (%rsp), %rax            # load return %rip
+    movq    %rax, 16(%rdi)          # store %rip into nlr_buf+16
+    movq    %rbp, 24(%rdi)          # store %rbp into nlr_buf+24
+    movq    %rsp, 32(%rdi)          # store %rsp into nlr_buf+32
+    movq    %rbx, 40(%rdi)          # store %rbx into nlr_buf+40
+    movq    %r12, 48(%rdi)          # store %r12 into nlr_buf+48
+    movq    %r13, 56(%rdi)          # store %r13 into nlr_buf+56
+    movq    %r14, 64(%rdi)          # store %r14 into nlr_buf+64
+    movq    %r15, 72(%rdi)          # store %r15 into nlr_buf+72
+    movq    nlr_top(%rip), %rax     # get last nlr_buf
+    movq    %rax, (%rdi)            # store it as prev of the new nlr_buf
+    movq    %rdi, nlr_top(%rip)     # store new nlr_buf (to make linked list)
+    xorq    %rax, %rax              # return 0, normal return
+    ret                             # return
+    .size   nlr_push, .-nlr_push
+
+# void nlr_pop()
+    .globl  nlr_pop
+    .type   nlr_pop, @function
+nlr_pop:
+    movq    nlr_top(%rip), %rax     # get nlr_top into %rax
+    movq    (%rax), %rax            # load prev nlr_buf
+    movq    %rax, nlr_top(%rip)     # store prev nlr_buf (to unlink list)
+    ret                             # return
+    .size   nlr_pop, .-nlr_pop
+
+# void nlr_jump(rdi=void *val)
+    .globl  nlr_jump
+    .type   nlr_jump, @function
+nlr_jump:
+    movq    %rdi, %rax              # put return value in %rax
+    movq    nlr_top(%rip), %rdi     # get nlr_top into %rdi
+    movq    %rax, 8(%rdi)           # store return value into nlr_buf+8 (ret_val)
+    movq    (%rdi), %rax            # load prev nlr_buf
+    movq    %rax, nlr_top(%rip)     # store prev nlr_buf (to unlink list)
+    movq    72(%rdi), %r15          # load saved %r15
+    movq    64(%rdi), %r14          # load saved %r14
+    movq    56(%rdi), %r13          # load saved %r13
+    movq    48(%rdi), %r12          # load saved %r12
+    movq    40(%rdi), %rbx          # load saved %rbx
+    movq    32(%rdi), %rsp          # load saved %rsp
+    movq    24(%rdi), %rbp          # load saved %rbp
+    movq    16(%rdi), %rax          # load saved %rip
+    movq    %rax, (%rsp)            # rewrite return slot with saved %rip so ret resumes after nlr_push
+    xorq    %rax, %rax              # clear return register
+    inc     %al                     # increase to make 1, non-local return
+    ret                             # return
+    .size   nlr_jump, .-nlr_jump
+
+    .local  nlr_top
+    .comm   nlr_top,8,8
diff --git a/py/nlrx86.s b/py/nlrx86.s
new file mode 100644
index 0000000000000000000000000000000000000000..364766da706ae24f2a81bc22c5ff07ce51e53263
--- /dev/null
+++ b/py/nlrx86.s
@@ -0,0 +1,57 @@
+# x86 callee save: bx, di, si, bp, sp
+
+    .file   "nlr.s"
+    .text
+
+# uint nlr_push(4(%esp)=nlr_buf_t *nlr)
+    .globl  nlr_push
+    .type   nlr_push, @function
+nlr_push:
+    mov     4(%esp), %edx           # load nlr_buf
+    mov     (%esp), %eax            # load return %ip
+    mov     %eax, 8(%edx)           # store %ip into nlr_buf+8
+    mov     %ebp, 12(%edx)          # store %bp into nlr_buf+12
+    mov     %esp, 16(%edx)          # store %sp into nlr_buf+16
+    mov     %ebx, 20(%edx)          # store %bx into nlr_buf+20
+    mov     %edi, 24(%edx)          # store %di into nlr_buf+24
+    mov     %esi, 28(%edx)          # store %si into nlr_buf+28
+    mov     nlr_top, %eax           # load nlr_top
+    mov     %eax, (%edx)            # store it as prev of the new nlr_buf
+    mov     %edx, nlr_top           # store new nlr_buf (to make linked list)
+    xor     %eax, %eax              # return 0, normal return
+    ret                             # return
+    .size   nlr_push, .-nlr_push
+
+# void nlr_pop()
+    .globl  nlr_pop
+    .type   nlr_pop, @function
+nlr_pop:
+    mov     nlr_top, %eax           # load nlr_top
+    mov     (%eax), %eax            # load prev nlr_buf
+    mov     %eax, nlr_top           # store nlr_top (to unlink list)
+    ret                             # return
+    .size   nlr_pop, .-nlr_pop
+
+# void nlr_jump(4(%esp)=void *val)
+    .globl  nlr_jump
+    .type   nlr_jump, @function
+nlr_jump:
+    mov     nlr_top, %edx           # load nlr_top
+    mov     4(%esp), %eax           # load return value
+    mov     %eax, 4(%edx)           # store return value into nlr_buf+4 (ret_val)
+    mov     (%edx), %eax            # load prev nlr_top
+    mov     %eax, nlr_top           # store nlr_top (to unlink list)
+    mov     28(%edx), %esi          # load saved %si
+    mov     24(%edx), %edi          # load saved %di
+    mov     20(%edx), %ebx          # load saved %bx
+    mov     16(%edx), %esp          # load saved %sp
+    mov     12(%edx), %ebp          # load saved %bp
+    mov     8(%edx), %eax           # load saved %ip
+    mov     %eax, (%esp)            # rewrite return slot with saved %ip so ret resumes after nlr_push
+    xor     %eax, %eax              # clear return register
+    inc     %al                     # increase to make 1, non-local return
+    ret                             # return
+    .size   nlr_jump, .-nlr_jump
+
+    .local  nlr_top
+    .comm   nlr_top,4,4
diff --git a/py/runtime.c b/py/runtime.c
index f06c9203a25388b006b0f9da05f305ede3d1a1fc..e13adfaa4220b70fb07f64ae35b721c44d453b15 100644
--- a/py/runtime.c
+++ b/py/runtime.c
@@ -4,6 +4,7 @@
 #include <string.h>
 #include <assert.h>
 
+#include "nlr.h"
 #include "misc.h"
 #include "mpyconfig.h"
 #include "runtime.h"
@@ -36,6 +37,10 @@ typedef enum {
 #if MICROPY_ENABLE_FLOAT
     O_FLOAT,
 #endif
+    O_EXCEPTION_0,
+    O_EXCEPTION_2,
+    O_RANGE,
+    O_RANGE_IT,
     O_FUN_0,
     O_FUN_1,
     O_FUN_2,
@@ -77,6 +82,28 @@ struct _py_obj_base_t {
 #if MICROPY_ENABLE_FLOAT
         float_t u_flt;
 #endif
+        struct { // for O_EXCEPTION_0
+            qstr id;
+        } u_exc0;
+        struct { // for O_EXCEPTION_2
+            // TODO reduce size or make generic object or something
+            qstr id;
+            const char *fmt;
+            const char *s1;
+            const char *s2;
+        } u_exc2;
+        struct { // for O_RANGE
+            // TODO make generic object or something
+            machine_int_t start;
+            machine_int_t stop;
+            machine_int_t step;
+        } u_range;
+        struct { // for O_RANGE_IT
+            // TODO make generic object or something
+            machine_int_t cur;
+            machine_int_t stop;
+            machine_int_t step;
+        } u_range_it;
         struct { // for O_FUN_[012N]
             int n_args;
             void *fun;
@@ -118,6 +145,7 @@ struct _py_obj_base_t {
 py_obj_t py_const_none;
 py_obj_t py_const_false;
 py_obj_t py_const_true;
+py_obj_t py_const_stop_iteration;
 
 // locals and globals need to be pointers because they can be the same in outer module scope
 py_map_t *map_locals;
@@ -266,6 +294,42 @@ py_obj_t py_obj_new_float(float_t val) {
 }
 #endif
 
+py_obj_t py_obj_new_exception_0(qstr id) { // allocate an O_EXCEPTION_0 object holding only the qstr id
+    py_obj_base_t *exc = m_new(py_obj_base_t, 1);
+    exc->u_exc0.id = id;
+    exc->kind = O_EXCEPTION_0;
+    return (py_obj_t)exc;
+}
+
+py_obj_t py_obj_new_exception_2(qstr id, const char *fmt, const char *s1, const char *s2) { // id plus a printf-style fmt taking s1 and s2
+    py_obj_base_t *exc = m_new(py_obj_base_t, 1);
+    exc->u_exc2.s2 = s2; // the pointers are stored as given, the strings are not copied
+    exc->u_exc2.s1 = s1;
+    exc->u_exc2.fmt = fmt;
+    exc->u_exc2.id = id;
+    exc->kind = O_EXCEPTION_2;
+    return (py_obj_t)exc;
+}
+
+// range is a class and instances are immutable sequence objects; bounds are machine_int_t to match the u_range fields
+py_obj_t py_obj_new_range(machine_int_t start, machine_int_t stop, machine_int_t step) {
+    py_obj_base_t *o = m_new(py_obj_base_t, 1);
+    o->kind = O_RANGE;
+    o->u_range.start = start;
+    o->u_range.stop = stop;
+    o->u_range.step = step;
+    return (py_obj_t)o;
+}
+}
+
+py_obj_t py_obj_new_range_iterator(machine_int_t cur, machine_int_t stop, machine_int_t step) { // iterator state for a range; machine_int_t avoids narrowing the range's fields
+    py_obj_base_t *o = m_new(py_obj_base_t, 1);
+    o->kind = O_RANGE_IT;
+    o->u_range_it.cur = cur; // next value to hand out
+    o->u_range_it.stop = stop;
+    o->u_range_it.step = step;
+    return (py_obj_t)o;
+}
+
 py_obj_t list_append(py_obj_t self_in, py_obj_t arg) {
     assert(IS_O(self_in, O_LIST));
     py_obj_base_t *self = self_in;
@@ -281,6 +345,9 @@ static qstr q_append;
 static qstr q_print;
 static qstr q_len;
 static qstr q___build_class__;
+static qstr q_AttributeError;
+static qstr q_NameError;
+static qstr q_TypeError;
 
 typedef enum {
     PY_CODE_NONE,
@@ -356,6 +423,10 @@ py_obj_t py_builtin___build_class__(py_obj_t o_class_fun, py_obj_t o_class_name)
     return o;
 }
 
+py_obj_t py_builtin_range(py_obj_t o_arg) { // builtin range(stop): start is fixed at 0 and step at 1
+    return py_obj_new_range(0, rt_get_int(o_arg), 1);
+}
+
 #ifdef WRITE_NATIVE
 FILE *fp_native = NULL;
 #endif
@@ -365,10 +436,14 @@ void rt_init() {
     q_print = qstr_from_str_static("print");
     q_len = qstr_from_str_static("len");
     q___build_class__ = qstr_from_str_static("__build_class__");
+    q_AttributeError = qstr_from_str_static("AttributeError");
+    q_NameError = qstr_from_str_static("NameError");
+    q_TypeError = qstr_from_str_static("TypeError");
 
     py_const_none = py_obj_new_const("None");
     py_const_false = py_obj_new_const("False");
     py_const_true = py_obj_new_const("True");
+    py_const_stop_iteration = py_obj_new_const("StopIteration");
 
     // locals = globals for outer module (see Objects/frameobject.c/PyFrame_New())
     map_locals = map_globals = py_map_new(MAP_QSTR, 1);
@@ -378,6 +453,7 @@ void rt_init() {
     py_qstr_map_lookup(&map_builtins, q_print, true)->value = rt_make_function_1(py_builtin_print);
     py_qstr_map_lookup(&map_builtins, q_len, true)->value = rt_make_function_1(py_builtin_len);
     py_qstr_map_lookup(&map_builtins, q___build_class__, true)->value = rt_make_function_2(py_builtin___build_class__);
+    py_qstr_map_lookup(&map_builtins, qstr_from_str_static("range"), true)->value = rt_make_function_1(py_builtin_range);
 
     next_unique_code_id = 1;
     unique_codes = NULL;
@@ -559,6 +635,10 @@ void py_obj_print(py_obj_t o_in) {
                 printf("%f", o->u_flt);
                 break;
 #endif
+            case O_EXCEPTION_2:
+                printf("%s: ", qstr_str(o->u_exc2.id));
+                printf(o->u_exc2.fmt, o->u_exc2.s1, o->u_exc2.s2);
+                break;
             case O_LIST:
                 printf("[");
                 for (int i = 0; i < o->u_list.len; i++) {
@@ -653,8 +733,7 @@ py_obj_t rt_load_name(qstr qstr) {
         if (elem == NULL) {
             elem = py_qstr_map_lookup(&map_builtins, qstr, false);
             if (elem == NULL) {
-                printf("name doesn't exist: %s\n", qstr_str(qstr));
-                assert(0);
+                nlr_jump(py_obj_new_exception_2(q_NameError, "name '%s' is not defined", qstr_str(qstr), NULL));
             }
         }
     }
@@ -668,8 +747,7 @@ py_obj_t rt_load_global(qstr qstr) {
     if (elem == NULL) {
         elem = py_qstr_map_lookup(&map_builtins, qstr, false);
         if (elem == NULL) {
-            printf("name doesn't exist: %s\n", qstr_str(qstr));
-            assert(0);
+            nlr_jump(py_obj_new_exception_2(q_NameError, "name '%s' is not defined", qstr_str(qstr), NULL));
         }
     }
     return elem->value;
@@ -1123,9 +1201,7 @@ py_obj_t rt_load_attr(py_obj_t base, qstr attr) {
     }
 
 no_attr:
-    printf("AttributeError: '%s' object has no attribute '%s'\n", py_obj_get_type_str(base), qstr_str(attr));
-    assert(0);
-    return py_const_none;
+    nlr_jump(py_obj_new_exception_2(q_AttributeError, "'%s' object has no attribute '%s'", py_obj_get_type_str(base), qstr_str(attr)));
 }
 
 void rt_load_method(py_obj_t base, qstr attr, py_obj_t *dest) {
@@ -1204,6 +1280,30 @@ void rt_store_subscr(py_obj_t base, py_obj_t index, py_obj_t value) {
     }
 }
 
+py_obj_t rt_getiter(py_obj_t o_in) {
+    // only O_RANGE objects can be iterated; any other object raises TypeError via nlr_jump
+    if (!IS_O(o_in, O_RANGE)) {
+        nlr_jump(py_obj_new_exception_2(q_TypeError, "'%s' object is not iterable", py_obj_get_type_str(o_in), NULL));
+    }
+    py_obj_base_t *range = o_in;
+    return py_obj_new_range_iterator(range->u_range.start, range->u_range.stop, range->u_range.step);
+}
+
+// advance iterator o_in: returns the next value, or the py_const_stop_iteration sentinel when exhausted
+py_obj_t rt_iternext(py_obj_t o_in) {
+    if (!IS_O(o_in, O_RANGE_IT)) {
+        // only range iterators are handled here; anything else raises TypeError via nlr_jump
+        nlr_jump(py_obj_new_exception_2(q_TypeError, "'%s' object is not an iterator", py_obj_get_type_str(o_in), NULL));
+    }
+    py_obj_base_t *o = o_in;
+    if ((o->u_range_it.step > 0 && o->u_range_it.cur < o->u_range_it.stop) || (o->u_range_it.step < 0 && o->u_range_it.cur > o->u_range_it.stop)) {
+        py_obj_t o_out = TO_SMALL_INT(o->u_range_it.cur);
+        o->u_range_it.cur += o->u_range_it.step; // move on to the following value
+        return o_out;
+    }
+    return py_const_stop_iteration; // end of iteration (a sentinel object, not an exception)
+}
+
 void *rt_fun_table[RT_F_NUMBER_OF] = {
     rt_load_const_str,
     rt_load_name,
diff --git a/py/runtime.h b/py/runtime.h
index 2823ba187d54f31a59f27647c7f369098b786362..c36f8d8e7e852d7fb517ab52d49f2914e64bf48c 100644
--- a/py/runtime.h
+++ b/py/runtime.h
@@ -82,6 +82,7 @@ typedef py_obj_t (*py_fun_t)();
 extern py_obj_t py_const_none;
 extern py_obj_t py_const_false;
 extern py_obj_t py_const_true;
+extern py_obj_t py_const_stop_iteration; // special object indicating end of iteration (not StopIteration exception!)
 
 void rt_init();
 void rt_deinit();
@@ -123,3 +124,5 @@ py_obj_t rt_load_attr(py_obj_t base, qstr attr);
 void rt_load_method(py_obj_t base, qstr attr, py_obj_t *dest);
 void rt_store_attr(py_obj_t base, qstr attr, py_obj_t val);
 void rt_store_subscr(py_obj_t base, py_obj_t index, py_obj_t val);
+py_obj_t rt_getiter(py_obj_t o);
+py_obj_t rt_iternext(py_obj_t o);
diff --git a/py/vm.c b/py/vm.c
index d6740cf04d5302fbc838ae276d560dcc9548ea7e..e92229a07332734f880c4df3f950e7fa26c14f04 100644
--- a/py/vm.c
+++ b/py/vm.c
@@ -4,6 +4,7 @@
 #include <string.h>
 #include <assert.h>
 
+#include "nlr.h"
 #include "misc.h"
 #include "mpyconfig.h"
 #include "runtime.h"
@@ -24,6 +25,7 @@ py_obj_t py_execute_byte_code(const byte *code, uint len, const py_obj_t *args,
     qstr qstr;
     py_obj_t obj1, obj2;
     py_obj_t fast0 = NULL, fast1 = NULL, fast2 = NULL, fastn[4] = {NULL, NULL, NULL, NULL};
+    nlr_buf_t nlr;
 
     // init args
     for (int i = 0; i < n_args; i++) {
@@ -39,224 +41,290 @@ py_obj_t py_execute_byte_code(const byte *code, uint len, const py_obj_t *args,
         }
     }
 
-    // execute byte code
+    // outer exception handling loop
     for (;;) {
-        int op = *ip++;
-        switch (op) {
-            case PYBC_LOAD_CONST_FALSE:
-                PUSH(py_const_false);
-                break;
-
-            case PYBC_LOAD_CONST_NONE:
-                PUSH(py_const_none);
-                break;
-
-            case PYBC_LOAD_CONST_TRUE:
-                PUSH(py_const_true);
-                break;
-
-            case PYBC_LOAD_CONST_SMALL_INT:
-                snum = ip[0] | (ip[1] << 8);
-                if (snum & 0x8000) {
-                    snum |= ~0xffff;
+        if (nlr_push(&nlr) == 0) {
+            // loop to execute byte code
+            for (;;) {
+                int op = *ip++;
+                switch (op) {
+                    case PYBC_LOAD_CONST_FALSE:
+                        PUSH(py_const_false);
+                        break;
+
+                    case PYBC_LOAD_CONST_NONE:
+                        PUSH(py_const_none);
+                        break;
+
+                    case PYBC_LOAD_CONST_TRUE:
+                        PUSH(py_const_true);
+                        break;
+
+                    case PYBC_LOAD_CONST_SMALL_INT:
+                        snum = ip[0] | (ip[1] << 8);
+                        if (snum & 0x8000) {
+                            snum |= ~0xffff;
+                        }
+                        ip += 2;
+                        PUSH((py_obj_t)(snum << 1 | 1));
+                        break;
+
+                    case PYBC_LOAD_CONST_ID:
+                        DECODE_QSTR;
+                        PUSH(rt_load_const_str(qstr)); // TODO
+                        break;
+
+                    case PYBC_LOAD_CONST_STRING:
+                        DECODE_QSTR;
+                        PUSH(rt_load_const_str(qstr));
+                        break;
+
+                    case PYBC_LOAD_FAST_0:
+                        PUSH(fast0);
+                        break;
+
+                    case PYBC_LOAD_FAST_1:
+                        PUSH(fast1);
+                        break;
+
+                    case PYBC_LOAD_FAST_2:
+                        PUSH(fast2);
+                        break;
+
+                    case PYBC_LOAD_FAST_N:
+                        DECODE_UINT;
+                        PUSH(fastn[unum - 3]);
+                        break;
+
+                    case PYBC_LOAD_NAME:
+                        DECODE_QSTR;
+                        PUSH(rt_load_name(qstr));
+                        break;
+
+                    case PYBC_LOAD_GLOBAL:
+                        DECODE_QSTR;
+                        PUSH(rt_load_global(qstr));
+                        break;
+
+                    case PYBC_LOAD_ATTR:
+                        DECODE_QSTR;
+                        *sp = rt_load_attr(*sp, qstr);
+                        break;
+
+                    case PYBC_LOAD_METHOD:
+                        DECODE_QSTR;
+                        sp -= 1;
+                        rt_load_method(sp[1], qstr, sp);
+                        break;
+
+                    case PYBC_LOAD_BUILD_CLASS:
+                        PUSH(rt_load_build_class());
+                        break;
+
+                    case PYBC_STORE_FAST_0:
+                        fast0 = POP();
+                        break;
+
+                    case PYBC_STORE_FAST_1:
+                        fast1 = POP();
+                        break;
+
+                    case PYBC_STORE_FAST_2:
+                        fast2 = POP();
+                        break;
+
+                    case PYBC_STORE_FAST_N:
+                        DECODE_UINT;
+                        fastn[unum - 3] = POP();
+                        break;
+
+                    case PYBC_STORE_NAME:
+                        DECODE_QSTR;
+                        rt_store_name(qstr, POP());
+                        break;
+
+                    case PYBC_STORE_ATTR:
+                        DECODE_QSTR;
+                        rt_store_attr(sp[0], qstr, sp[1]);
+                        sp += 2;
+                        break;
+
+                    case PYBC_STORE_SUBSCR:
+                        rt_store_subscr(sp[1], sp[0], sp[2]);
+                        sp += 3;
+                        break;
+
+                    case PYBC_DUP_TOP:
+                        obj1 = *sp;
+                        PUSH(obj1);
+                        break;
+
+                    case PYBC_DUP_TOP_TWO:
+                        sp -= 2;
+                        sp[0] = sp[2];
+                        sp[1] = sp[3];
+                        break;
+
+                    case PYBC_POP_TOP:
+                        ++sp;
+                        break;
+
+                    case PYBC_ROT_THREE:
+                        obj1 = sp[0];
+                        sp[0] = sp[1];
+                        sp[1] = sp[2];
+                        sp[2] = obj1;
+                        break;
+
+                    case PYBC_JUMP:
+                        DECODE_UINT;
+                        ip = code + unum;
+                        break;
+
+                    case PYBC_POP_JUMP_IF_TRUE:
+                        DECODE_UINT;
+                        if (rt_is_true(POP())) {
+                            ip = code + unum;
+                        }
+                        break;
+
+                    case PYBC_POP_JUMP_IF_FALSE:
+                        DECODE_UINT;
+                        if (!rt_is_true(POP())) {
+                            ip = code + unum;
+                        }
+                        break;
+
+                        /* we are trying to get away without using this opcode
+                    case PYBC_SETUP_LOOP:
+                        DECODE_UINT;
+                        // push_block(PYBC_SETUP_LOOP, code + unum, sp)
+                        break;
+                        */
+
+                    case PYBC_SETUP_EXCEPT:
+                        // push_block(PYBC_SETUP_EXCEPT, code + unum, sp)
+                        assert(0);
+                        break;
+
+                    case PYBC_END_FINALLY:
+                        // not implemented
+                        // if TOS is an exception, reraises the exception (3 values on TOS)
+                        // if TOS is an integer, does something else
+                        // if TOS is None, just pops it and continues
+                        // else error
+                        assert(0);
+                        break;
+
+                    case PYBC_GET_ITER:
+                        *sp = rt_getiter(*sp);
+                        break;
+
+                    case PYBC_FOR_ITER:
+                        DECODE_UINT; // the jump offset if iteration finishes
+                        obj1 = rt_iternext(*sp);
+                        if (obj1 == py_const_stop_iteration) {
+                            ++sp; // pop the exhausted iterator
+                            ip = code + unum; // jump to after for-block
+                        } else {
+                            PUSH(obj1); // push the next iteration value
+                        }
+                        break;
+
+                    case PYBC_POP_BLOCK:
+                        // pops block and restores the stack
+                        assert(0);
+                        break;
+
+                    case PYBC_POP_EXCEPT:
+                        // pops block, checks it's an exception block, and restores the stack, saving the 3 exception values to local threadstate
+                        assert(0);
+                        break;
+
+                    case PYBC_BINARY_OP:
+                        unum = *ip++;
+                        obj2 = POP();
+                        obj1 = *sp;
+                        *sp = rt_binary_op(unum, obj1, obj2);
+                        break;
+
+                    case PYBC_COMPARE_OP:
+                        unum = *ip++;
+                        obj2 = POP();
+                        obj1 = *sp;
+                        *sp = rt_compare_op(unum, obj1, obj2);
+                        break;
+
+                    case PYBC_BUILD_LIST:
+                        DECODE_UINT;
+                        obj1 = rt_build_list(unum, sp);
+                        sp += unum - 1;
+                        *sp = obj1;
+                        break;
+
+                    case PYBC_BUILD_MAP:
+                        DECODE_UINT;
+                        PUSH(rt_build_map(unum));
+                        break;
+
+                    case PYBC_STORE_MAP:
+                        sp += 2;
+                        rt_store_map(sp[0], sp[-2], sp[-1]);
+                        break;
+
+                    case PYBC_BUILD_SET:
+                        DECODE_UINT;
+                        obj1 = rt_build_set(unum, sp);
+                        sp += unum - 1;
+                        *sp = obj1;
+                        break;
+
+                    case PYBC_MAKE_FUNCTION:
+                        DECODE_UINT;
+                        PUSH(rt_make_function_from_id(unum));
+                        break;
+
+                    case PYBC_CALL_FUNCTION:
+                        DECODE_UINT;
+                        assert((unum & 0xff00) == 0); // n_keyword
+                        unum &= 0xff; // n_positional
+                        sp += unum;
+                        *sp = rt_call_function_n(*sp, unum, sp - unum);
+                        break;
+
+                    case PYBC_CALL_METHOD:
+                        DECODE_UINT;
+                        assert((unum & 0xff00) == 0); // n_keyword
+                        unum &= 0xff;
+                        obj1 = rt_call_method_n(unum, sp);
+                        sp += unum + 1;
+                        *sp = obj1;
+                        break;
+
+                    case PYBC_RETURN_VALUE:
+                        nlr_pop();
+                        return *sp;
+
+                    default:
+                        printf("code %p, offset %u, byte code 0x%02x not implemented\n", code, (uint)(ip - code), op);
+                        assert(0);
+                        nlr_pop();
+                        return py_const_none;
                 }
-                ip += 2;
-                PUSH((py_obj_t)(snum << 1 | 1));
-                break;
-
-            case PYBC_LOAD_CONST_ID:
-                DECODE_QSTR;
-                PUSH(rt_load_const_str(qstr)); // TODO
-                break;
-
-            case PYBC_LOAD_CONST_STRING:
-                DECODE_QSTR;
-                PUSH(rt_load_const_str(qstr));
-                break;
-
-            case PYBC_LOAD_FAST_0:
-                PUSH(fast0);
-                break;
-
-            case PYBC_LOAD_FAST_1:
-                PUSH(fast1);
-                break;
-
-            case PYBC_LOAD_FAST_2:
-                PUSH(fast2);
-                break;
-
-            case PYBC_LOAD_FAST_N:
-                DECODE_UINT;
-                PUSH(fastn[unum - 3]);
-                break;
-
-            case PYBC_LOAD_NAME:
-                DECODE_QSTR;
-                PUSH(rt_load_name(qstr));
-                break;
-
-            case PYBC_LOAD_GLOBAL:
-                DECODE_QSTR;
-                PUSH(rt_load_global(qstr));
-                break;
-
-            case PYBC_LOAD_ATTR:
-                DECODE_QSTR;
-                *sp = rt_load_attr(*sp, qstr);
-                break;
-
-            case PYBC_LOAD_METHOD:
-                DECODE_QSTR;
-                sp -= 1;
-                rt_load_method(sp[1], qstr, sp);
-                break;
-
-            case PYBC_LOAD_BUILD_CLASS:
-                PUSH(rt_load_build_class());
-                break;
-
-            case PYBC_STORE_FAST_0:
-                fast0 = POP();
-                break;
-
-            case PYBC_STORE_FAST_1:
-                fast1 = POP();
-                break;
-
-            case PYBC_STORE_FAST_2:
-                fast2 = POP();
-                break;
-
-            case PYBC_STORE_FAST_N:
-                DECODE_UINT;
-                fastn[unum - 3] = POP();
-                break;
-
-            case PYBC_STORE_NAME:
-                DECODE_QSTR;
-                rt_store_name(qstr, POP());
-                break;
-
-            case PYBC_STORE_ATTR:
-                DECODE_QSTR;
-                rt_store_attr(sp[0], qstr, sp[1]);
-                sp += 2;
-                break;
-
-            case PYBC_STORE_SUBSCR:
-                rt_store_subscr(sp[1], sp[0], sp[2]);
-                sp += 3;
-                break;
-
-            case PYBC_DUP_TOP:
-                obj1 = *sp;
-                PUSH(obj1);
-                break;
-
-            case PYBC_DUP_TOP_TWO:
-                sp -= 2;
-                sp[0] = sp[2];
-                sp[1] = sp[3];
-                break;
-
-            case PYBC_POP_TOP:
-                ++sp;
-                break;
-
-            case PYBC_ROT_THREE:
-                obj1 = sp[0];
-                sp[0] = sp[1];
-                sp[1] = sp[2];
-                sp[2] = obj1;
-                break;
-
-            case PYBC_JUMP:
-                DECODE_UINT;
-                ip = code + unum;
-                break;
-
-            case PYBC_POP_JUMP_IF_FALSE:
-                DECODE_UINT;
-                if (!rt_is_true(POP())) {
-                    ip = code + unum;
-                }
-                break;
-
-            case PYBC_SETUP_LOOP:
-                DECODE_UINT;
-                break;
-
-            case PYBC_POP_BLOCK:
-                break;
-
-            case PYBC_BINARY_OP:
-                unum = *ip++;
-                obj2 = POP();
-                obj1 = *sp;
-                *sp = rt_binary_op(unum, obj1, obj2);
-                break;
-
-            case PYBC_COMPARE_OP:
-                unum = *ip++;
-                obj2 = POP();
-                obj1 = *sp;
-                *sp = rt_compare_op(unum, obj1, obj2);
-                break;
-
-            case PYBC_BUILD_LIST:
-                DECODE_UINT;
-                obj1 = rt_build_list(unum, sp);
-                sp += unum - 1;
-                *sp = obj1;
-                break;
-
-            case PYBC_BUILD_MAP:
-                DECODE_UINT;
-                PUSH(rt_build_map(unum));
-                break;
-
-            case PYBC_STORE_MAP:
-                sp += 2;
-                rt_store_map(sp[0], sp[-2], sp[-1]);
-                break;
-
-            case PYBC_BUILD_SET:
-                DECODE_UINT;
-                obj1 = rt_build_set(unum, sp);
-                sp += unum - 1;
-                *sp = obj1;
-                break;
-
-            case PYBC_MAKE_FUNCTION:
-                DECODE_UINT;
-                PUSH(rt_make_function_from_id(unum));
-                break;
-
-            case PYBC_CALL_FUNCTION:
-                DECODE_UINT;
-                assert((unum & 0xff00) == 0); // n_keyword
-                unum &= 0xff; // n_positional
-                sp += unum;
-                *sp = rt_call_function_n(*sp, unum, sp - unum);
-                break;
-
-            case PYBC_CALL_METHOD:
-                DECODE_UINT;
-                assert((unum & 0xff00) == 0); // n_keyword
-                unum &= 0xff;
-                obj1 = rt_call_method_n(unum, sp);
-                sp += unum + 1;
-                *sp = obj1;
-                break;
-
-            case PYBC_RETURN_VALUE:
-                return *sp;
-
-            default:
-                printf("code %p, offset %u, byte code 0x%02x not implemented\n", code, (uint)(ip - code), op);
-                assert(0);
-                return py_const_none;
+            }
+
+        } else {
+            // exception occurred
+
+            if (0) {
+                // catch exception and pass to byte code
+                //ip = pop
+                //sp = pop
+                //push(traceback, exc-val, exc-type)
+            } else {
+                // re-raise exception
+                nlr_jump(nlr.ret_val);
+            }
         }
     }
 }
diff --git a/unix/Makefile b/unix/Makefile
index 6a48cee221fd1a2245249c8233036d36da8f6d83..0c8c449e7751d2b923a4b6fdadfe579ad55143b8 100644
--- a/unix/Makefile
+++ b/unix/Makefile
@@ -3,13 +3,13 @@ BUILD=build
 
 CC = gcc
 CFLAGS = -I. -I$(PYSRC) -Wall -ansi -std=gnu99 -Os #-DNDEBUG
-CFLAGS_PY = -DEMIT_ENABLE_CPY -DEMIT_ENABLE_THUMB
 LDFLAGS =
 
 SRC_C = \
 	main.c \
 
 PY_O = \
+	nlrx64.o \
 	malloc.o \
 	qstr.o \
 	misc.o \
@@ -43,19 +43,23 @@ $(BUILD):
 $(BUILD)/%.o: %.c
 	$(CC) $(CFLAGS) -c -o $@ $<
 
+$(BUILD)/%.o: $(PYSRC)/%.s # assemble .s sources from $(PYSRC) with $(AS)
+	$(AS) -o $@ $<
+
 $(BUILD)/%.o: $(PYSRC)/%.c mpyconfig.h
-	$(CC) $(CFLAGS) $(CFLAGS_PY) -c -o $@ $<
+	$(CC) $(CFLAGS) -c -o $@ $<
 
 $(BUILD)/emitnx64.o: $(PYSRC)/emitnative.c $(PYSRC)/emit.h
-	$(CC) $(CFLAGS) $(CFLAGS_PY) -DN_X64 -c -o $@ $<
+	$(CC) $(CFLAGS) -DN_X64 -c -o $@ $<
 
 $(BUILD)/emitnthumb.o: $(PYSRC)/emitnative.c $(PYSRC)/emit.h
-	$(CC) $(CFLAGS) $(CFLAGS_PY) -DN_THUMB -c -o $@ $<
+	$(CC) $(CFLAGS) -DN_THUMB -c -o $@ $<
 
 # optimising vm for speed, adds only a small amount to code size but makes a huge difference to speed (20% faster)
 $(BUILD)/vm.o: $(PYSRC)/vm.c
-	$(CC) $(CFLAGS) $(CFLAGS_PY) -O3 -c -o $@ $<
+	$(CC) $(CFLAGS) -O3 -c -o $@ $<
 
+$(BUILD)/main.o: mpyconfig.h
 $(BUILD)/parse.o: $(PYSRC)/grammar.h
 $(BUILD)/compile.o: $(PYSRC)/grammar.h
 $(BUILD)/emitcpy.o: $(PYSRC)/emit.h
diff --git a/unix/main.c b/unix/main.c
index eb120da01937e4567d017730e9e6367cbbaf7510..8ceaf42646fef626a945283f5cdd815a1901180d 100644
--- a/unix/main.c
+++ b/unix/main.c
@@ -2,6 +2,7 @@
 #include <stdio.h>
 #include <string.h>
 
+#include "nlr.h"
 #include "misc.h"
 #include "mpyconfig.h"
 #include "lexer.h"
@@ -47,10 +48,19 @@ int main(int argc, char **argv) {
         // execute it
         py_obj_t module_fun = rt_make_function_from_id(1);
         if (module_fun != py_const_none) {
-            py_obj_t ret = rt_call_function_0(module_fun);
-            printf("done! got: ");
-            py_obj_print(ret);
-            printf("\n");
+            nlr_buf_t nlr; // passed to nlr_push; nlr.ret_val is read in the else branch
+            if (nlr_push(&nlr) == 0) { // zero result: call the module function and print what it returns
+                py_obj_t ret = rt_call_function_0(module_fun);
+                printf("done! got: ");
+                py_obj_print(ret);
+                printf("\n");
+                nlr_pop(); // NOTE(review): presumably undoes the nlr_push above -- confirm in nlr.h
+            } else {
+                // uncaught exception: nlr_push gave a non-zero result, so print nlr.ret_val
+                printf("exception: ");
+                py_obj_print((py_obj_t)nlr.ret_val);
+                printf("\n");
+            }
         }
     }
 #endif