diff --git a/py/compile.c b/py/compile.c
index df416b87f4bba3efaec5eec43ba179f501ec1679..8ef05d2388705bd89d29fd573572ecbb7bc58f5b 100644
--- a/py/compile.c
+++ b/py/compile.c
@@ -170,6 +170,16 @@ STATIC uint comp_next_label(compiler_t *comp) {
     return comp->next_label++;
 }
 
+#if MICROPY_EMIT_NATIVE
+// Skip over n label numbers that the native emitter uses internally; for
+// scopes whose emit option is bytecode nothing is reserved.
+STATIC void reserve_labels_for_native(compiler_t *comp, int n) {
+    comp->next_label += (comp->scope_cur->emit_options != MP_EMIT_OPT_BYTECODE) ? n : 0;
+}
+#else
+#define reserve_labels_for_native(comp, n)
+#endif
+
 STATIC void compile_increase_except_level(compiler_t *comp) {
     comp->cur_except_level += 1;
     if (comp->cur_except_level > comp->scope_cur->exc_stack_size) {
@@ -1656,11 +1666,6 @@ STATIC void compile_with_stmt_helper(compiler_t *comp, int n, mp_parse_node_t *n
         compile_node(comp, body);
     } else {
         uint l_end = comp_next_label(comp);
-        if (MICROPY_EMIT_NATIVE && comp->scope_cur->emit_options != MP_EMIT_OPT_BYTECODE) {
-            // we need to allocate an extra label for the native emitter
-            // it will use l_end+1 as an auxiliary label
-            comp_next_label(comp);
-        }
         if (MP_PARSE_NODE_IS_STRUCT_KIND(nodes[0], PN_with_item)) {
             // this pre-bit is of the form "a as b"
             mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)nodes[0];
@@ -1678,6 +1683,7 @@ STATIC void compile_with_stmt_helper(compiler_t *comp, int n, mp_parse_node_t *n
         compile_with_stmt_helper(comp, n - 1, nodes + 1, body);
         // finish this with block
         EMIT_ARG(with_cleanup, l_end);
+        reserve_labels_for_native(comp, 2); // used by native's with_cleanup
         compile_decrease_except_level(comp);
         EMIT(end_finally);
     }
@@ -2947,6 +2953,7 @@ STATIC void compile_scope(compiler_t *comp, scope_t *scope, pass_kind_t pass) {
     comp->scope_cur = scope;
     comp->next_label = 0;
     EMIT_ARG(start_pass, pass, scope);
+    reserve_labels_for_native(comp, 4); // used by native's start_pass
 
     if (comp->pass == MP_PASS_SCOPE) {
         // reset maximum stack sizes in scope
@@ -3443,7 +3450,7 @@ mp_raw_code_t *mp_compile_to_raw_code(mp_parse_tree_t *parse_tree, qstr source_f
                 case MP_EMIT_OPT_NATIVE_PYTHON:
                 case MP_EMIT_OPT_VIPER:
                     if (emit_native == NULL) {
-                        emit_native = NATIVE_EMITTER(new)(&comp->compile_error, max_num_labels);
+                        emit_native = NATIVE_EMITTER(new)(&comp->compile_error, &comp->next_label, max_num_labels);
                     }
                     comp->emit_method_table = &NATIVE_EMITTER(method_table);
                     comp->emit = emit_native;
diff --git a/py/emit.h b/py/emit.h
index aa98efa774398b70aeb30ee30cfc8bab2f8656dd..e9980b58520a4c5b3ee1c7972a7d5d071df54b08 100644
--- a/py/emit.h
+++ b/py/emit.h
@@ -178,11 +178,11 @@ extern const mp_emit_method_table_id_ops_t mp_emit_bc_method_table_store_id_ops;
 extern const mp_emit_method_table_id_ops_t mp_emit_bc_method_table_delete_id_ops;
 
 emit_t *emit_bc_new(void);
-emit_t *emit_native_x64_new(mp_obj_t *error_slot, mp_uint_t max_num_labels);
-emit_t *emit_native_x86_new(mp_obj_t *error_slot, mp_uint_t max_num_labels);
-emit_t *emit_native_thumb_new(mp_obj_t *error_slot, mp_uint_t max_num_labels);
-emit_t *emit_native_arm_new(mp_obj_t *error_slot, mp_uint_t max_num_labels);
-emit_t *emit_native_xtensa_new(mp_obj_t *error_slot, mp_uint_t max_num_labels);
+emit_t *emit_native_x64_new(mp_obj_t *error_slot, uint *label_slot, mp_uint_t max_num_labels);
+emit_t *emit_native_x86_new(mp_obj_t *error_slot, uint *label_slot, mp_uint_t max_num_labels);
+emit_t *emit_native_thumb_new(mp_obj_t *error_slot, uint *label_slot, mp_uint_t max_num_labels);
+emit_t *emit_native_arm_new(mp_obj_t *error_slot, uint *label_slot, mp_uint_t max_num_labels);
+emit_t *emit_native_xtensa_new(mp_obj_t *error_slot, uint *label_slot, mp_uint_t max_num_labels);
 
 void emit_bc_set_max_num_labels(emit_t* emit, mp_uint_t max_num_labels);
 
diff --git a/py/emitnarm.c b/py/emitnarm.c
index 1b585f821b4a61873cb6495bfc3915328d8c1a0e..89467052cbe9cdd8e36cf2b268beb3683d59c055 100644
--- a/py/emitnarm.c
+++ b/py/emitnarm.c
@@ -8,6 +8,9 @@
 #define GENERIC_ASM_API (1)
 #include "py/asmarm.h"
 
+// Word index of REG_LOCAL_1(=r4) in nlr_buf_t
+#define NLR_BUF_IDX_LOCAL_1 (3)
+
 #define N_ARM (1)
 #define EXPORT_FUN(name) emit_native_arm_##name
 #include "py/emitnative.c"
diff --git a/py/emitnative.c b/py/emitnative.c
index 1b1e79c9dbec307a0e359cceb3dd684f9655d814..6756e50efb42b398fd0506b12287770a723a4ae3 100644
--- a/py/emitnative.c
+++ b/py/emitnative.c
@@ -59,6 +59,34 @@
 // wrapper around everything in this file
 #if N_X64 || N_X86 || N_THUMB || N_ARM || N_XTENSA
 
+// C stack layout for native functions:
+//  0:                      mp_code_state_t
+//  emit->stack_start:      nlr_buf_t [optional]            |
+//                          Python object stack             | emit->n_state
+//                          locals (reversed, L0 at end)    |
+//
+// C stack layout for viper functions:
+//  0 = emit->stack_start:  nlr_buf_t [optional]            |
+//                          Python object stack             | emit->n_state
+//                          locals (reversed, L0 at end)    |
+//                          (L0-L2 may be in regs instead)
+
+// Word index of nlr_buf_t.ret_val
+#define NLR_BUF_IDX_RET_VAL (1)
+
+// Whether the native/viper function needs to be wrapped in an exception handler
+#define NEED_GLOBAL_EXC_HANDLER(emit) ((emit)->scope->exc_stack_size > 0)
+
+// Whether registers can be used to store locals (only true if there are no
+// exception handlers, because otherwise an nlr_jump will restore registers to
+// their state at the start of the function and updates to locals will be lost)
+#define CAN_USE_REGS_FOR_LOCALS(emit) ((emit)->scope->exc_stack_size == 0)
+
+// Indices (in machine words) within the local C stack for various variables
+#define LOCAL_IDX_EXC_VAL(emit) ((emit)->stack_start + NLR_BUF_IDX_RET_VAL) // ret_val word of the nlr_buf_t
+#define LOCAL_IDX_EXC_HANDLER_PC(emit) ((emit)->stack_start + NLR_BUF_IDX_LOCAL_1) // nlr_buf_t word holding REG_LOCAL_1
+#define LOCAL_IDX_LOCAL_VAR(emit, local_num) ((emit)->stack_start + (emit)->n_state - 1 - (local_num)) // locals are reversed: L0 is last
+
 // number of arguments to viper functions are limited to this value
 #define REG_ARG_NUM (4)
 
@@ -120,8 +148,15 @@ typedef struct _stack_info_t {
     } data;
 } stack_info_t;
 
+typedef struct _exc_stack_entry_t {
+    uint16_t label : 15; // label given to emit_native_push_exc_stack for this handler
+    uint16_t is_finally : 1; // set when the entry was pushed with is_finally true
+} exc_stack_entry_t;
+
 struct _emit_t {
     mp_obj_t *error_slot;
+    uint *label_slot;
+    uint exit_label;
     int pass;
 
     bool do_viper_types;
@@ -135,6 +170,10 @@ struct _emit_t {
     stack_info_t *stack_info;
     vtype_kind_t saved_stack_vtype;
 
+    size_t exc_stack_alloc;
+    size_t exc_stack_size;
+    exc_stack_entry_t *exc_stack;
+
     int prelude_offset;
     int const_table_offset;
     int n_state;
@@ -151,11 +190,17 @@ struct _emit_t {
 STATIC const uint8_t reg_arg_table[REG_ARG_NUM] = {REG_ARG_1, REG_ARG_2, REG_ARG_3, REG_ARG_4};
 STATIC const uint8_t reg_local_table[REG_LOCAL_NUM] = {REG_LOCAL_1, REG_LOCAL_2, REG_LOCAL_3};
 
-emit_t *EXPORT_FUN(new)(mp_obj_t *error_slot, mp_uint_t max_num_labels) {
+STATIC void emit_native_global_exc_entry(emit_t *emit);
+STATIC void emit_native_global_exc_exit(emit_t *emit);
+
+emit_t *EXPORT_FUN(new)(mp_obj_t *error_slot, uint *label_slot, mp_uint_t max_num_labels) {
     emit_t *emit = m_new0(emit_t, 1);
     emit->error_slot = error_slot;
+    emit->label_slot = label_slot;
     emit->stack_info_alloc = 8;
     emit->stack_info = m_new(stack_info_t, emit->stack_info_alloc);
+    emit->exc_stack_alloc = 8;
+    emit->exc_stack = m_new(exc_stack_entry_t, emit->exc_stack_alloc);
     emit->as = m_new0(ASM_T, 1);
     mp_asm_base_init(&emit->as->base, max_num_labels);
     return emit;
@@ -164,6 +209,7 @@ emit_t *EXPORT_FUN(new)(mp_obj_t *error_slot, mp_uint_t max_num_labels) {
 void EXPORT_FUN(free)(emit_t *emit) {
     mp_asm_base_deinit(&emit->as->base, false);
     m_del_obj(ASM_T, emit->as);
+    m_del(exc_stack_entry_t, emit->exc_stack, emit->exc_stack_alloc);
     m_del(vtype_kind_t, emit->local_vtype, emit->local_vtype_alloc);
     m_del(stack_info_t, emit->stack_info, emit->stack_info_alloc);
     m_del_obj(emit_t, emit);
@@ -204,8 +250,6 @@ STATIC void emit_post_push_reg(emit_t *emit, vtype_kind_t vtype, int reg);
 STATIC void emit_native_load_fast(emit_t *emit, qstr qst, mp_uint_t local_num);
 STATIC void emit_native_store_fast(emit_t *emit, qstr qst, mp_uint_t local_num);
 
-#define STATE_START (sizeof(mp_code_state_t) / sizeof(mp_uint_t))
-
 STATIC void emit_native_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scope) {
     DEBUG_printf("start_pass(pass=%u, scope=%p)\n", pass, scope);
 
@@ -259,17 +303,22 @@ STATIC void emit_native_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scop
             return;
         }
 
-        // entry to function
-        int num_locals = 0;
-        if (pass > MP_PASS_SCOPE) {
-            num_locals = scope->num_locals - REG_LOCAL_NUM;
-            if (num_locals < 0) {
-                num_locals = 0;
+        // Work out size of state (locals plus stack)
+        // n_state counts all stack and locals, even those in registers
+        emit->n_state = scope->num_locals + scope->stack_size;
+        int num_locals_in_regs = 0;
+        if (CAN_USE_REGS_FOR_LOCALS(emit)) {
+            num_locals_in_regs = scope->num_locals;
+            if (num_locals_in_regs > REG_LOCAL_NUM) {
+                num_locals_in_regs = REG_LOCAL_NUM;
             }
-            emit->stack_start = num_locals;
-            num_locals += scope->stack_size;
         }
-        ASM_ENTRY(emit->as, num_locals);
+
+        // The locals and stack start at the beginning of the C stack
+        emit->stack_start = 0;
+
+        // Entry to function
+        ASM_ENTRY(emit->as, emit->stack_start + emit->n_state - num_locals_in_regs);
 
         // TODO don't load r7 if we don't need it
         #if N_THUMB
@@ -278,35 +327,38 @@ STATIC void emit_native_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scop
         asm_arm_mov_reg_i32(emit->as, ASM_ARM_REG_R7, (mp_uint_t)mp_fun_table);
         #endif
 
+        // Store arguments into locals
         #if N_X86
         for (int i = 0; i < scope->num_pos_args; i++) {
-            if (i < REG_LOCAL_NUM) {
+            if (i < REG_LOCAL_NUM && CAN_USE_REGS_FOR_LOCALS(emit)) {
                 asm_x86_mov_arg_to_r32(emit->as, i, reg_local_table[i]);
             } else {
                 asm_x86_mov_arg_to_r32(emit->as, i, REG_TEMP0);
-                asm_x86_mov_r32_to_local(emit->as, REG_TEMP0, i - REG_LOCAL_NUM);
+                asm_x86_mov_r32_to_local(emit->as, REG_TEMP0, LOCAL_IDX_LOCAL_VAR(emit, i));
             }
         }
         #else
         for (int i = 0; i < scope->num_pos_args; i++) {
-            if (i < REG_LOCAL_NUM) {
+            if (i < REG_LOCAL_NUM && CAN_USE_REGS_FOR_LOCALS(emit)) {
                 ASM_MOV_REG_REG(emit->as, reg_local_table[i], reg_arg_table[i]);
             } else {
                 assert(i < REG_ARG_NUM); // should be true; max args is checked above
-                ASM_MOV_LOCAL_REG(emit->as, i - REG_LOCAL_NUM, reg_arg_table[i]);
+                ASM_MOV_LOCAL_REG(emit->as, LOCAL_IDX_LOCAL_VAR(emit, i), reg_arg_table[i]);
             }
         }
         #endif
 
+        emit_native_global_exc_entry(emit);
+
     } else {
         // work out size of state (locals plus stack)
         emit->n_state = scope->num_locals + scope->stack_size;
 
         // the locals and stack start after the code_state structure
-        emit->stack_start = STATE_START;
+        emit->stack_start = sizeof(mp_code_state_t) / sizeof(mp_uint_t);
 
         // allocate space on C-stack for code_state structure, which includes state
-        ASM_ENTRY(emit->as, STATE_START + emit->n_state);
+        ASM_ENTRY(emit->as, emit->stack_start + emit->n_state);
 
         // TODO don't load r7 if we don't need it
         #if N_THUMB
@@ -343,9 +395,13 @@ STATIC void emit_native_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scop
         ASM_CALL_IND(emit->as, mp_fun_table[MP_F_SETUP_CODE_STATE], MP_F_SETUP_CODE_STATE);
         #endif
 
-        // cache some locals in registers
-        for (int i = 0; i < REG_LOCAL_NUM && i < scope->num_locals; ++i) {
-            ASM_MOV_REG_LOCAL(emit->as, reg_local_table[i], STATE_START + emit->n_state - 1 - i);
+        emit_native_global_exc_entry(emit);
+
+        // cache some locals in registers, but only if no exception handlers
+        if (CAN_USE_REGS_FOR_LOCALS(emit)) {
+            for (int i = 0; i < REG_LOCAL_NUM && i < scope->num_locals; ++i) {
+                ASM_MOV_REG_LOCAL(emit->as, reg_local_table[i], LOCAL_IDX_LOCAL_VAR(emit, i));
+            }
         }
 
         // set the type of closed over variables
@@ -360,9 +416,7 @@ STATIC void emit_native_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scop
 }
 
 STATIC void emit_native_end_pass(emit_t *emit) {
-    if (!emit->last_emit_was_return_value) {
-        ASM_EXIT(emit->as);
-    }
+    emit_native_global_exc_exit(emit);
 
     if (!emit->do_viper_types) {
         emit->prelude_offset = mp_asm_base_get_code_pos(&emit->as->base);
@@ -418,6 +472,7 @@ STATIC void emit_native_end_pass(emit_t *emit) {
 
     // check stack is back to zero size
     assert(emit->stack_size == 0);
+    assert(emit->exc_stack_size == 0);
 
     if (emit->pass == MP_PASS_EMIT) {
         void *f = mp_asm_base_get_code(&emit->as->base);
@@ -778,13 +833,122 @@ STATIC void emit_get_stack_pointer_to_reg_for_push(emit_t *emit, mp_uint_t reg_d
     adjust_stack(emit, n_push);
 }
 
+// Push an entry for 'label' onto the exception stack and store its address in the handler-PC slot
+STATIC void emit_native_push_exc_stack(emit_t *emit, uint label, bool is_finally) {
+    if (emit->exc_stack_alloc < emit->exc_stack_size + 1) {
+        size_t grown = emit->exc_stack_alloc + 4; // full: extend the array by 4 entries
+        emit->exc_stack = m_renew(exc_stack_entry_t, emit->exc_stack, emit->exc_stack_alloc, grown);
+        emit->exc_stack_alloc = grown;
+    }
+    exc_stack_entry_t *entry = &emit->exc_stack[emit->exc_stack_size];
+    emit->exc_stack_size += 1;
+    entry->label = label;
+    entry->is_finally = is_finally;
+    ASM_MOV_REG_PCREL(emit->as, REG_RET, label); // REG_RET = PC of the handler label
+    ASM_MOV_LOCAL_REG(emit->as, LOCAL_IDX_EXC_HANDLER_PC(emit), REG_RET);
+}
+
+// Point the handler-PC slot at the enclosing handler (or zero); remove the top entry if do_pop
+STATIC void emit_native_pop_exc_stack(emit_t *emit, bool do_pop) {
+    assert(emit->exc_stack_size > 0);
+    if (emit->exc_stack_size != 1) {
+        // An enclosing entry exists: its label becomes the handler PC
+        uint outer_label = emit->exc_stack[emit->exc_stack_size - 2].label;
+        ASM_MOV_REG_PCREL(emit->as, REG_RET, outer_label);
+        ASM_MOV_LOCAL_REG(emit->as, LOCAL_IDX_EXC_HANDLER_PC(emit), REG_RET);
+    } else if (!do_pop) {
+        // Only entry and it stays on the stack: write zero to the handler-PC slot
+        ASM_XOR_REG_REG(emit->as, REG_RET, REG_RET);
+        ASM_MOV_LOCAL_REG(emit->as, LOCAL_IDX_EXC_HANDLER_PC(emit), REG_RET);
+    }
+    if (do_pop) { // note: popping the only entry emits no code at all
+        emit->exc_stack_size -= 1;
+    }
+}
+
 STATIC void emit_native_label_assign(emit_t *emit, mp_uint_t l) {
     DEBUG_printf("label_assign(" UINT_FMT ")\n", l);
+
+    bool is_finally = false; // set when l is the label of the top exc-stack entry and that entry is a finally
+    if (emit->exc_stack_size > 0) {
+        exc_stack_entry_t *e = &emit->exc_stack[emit->exc_stack_size - 1];
+        is_finally = e->is_finally && e->label == l;
+    }
+
+    if (is_finally) {
+        // Label is at start of finally handler: store TOS into exception slot
+        vtype_kind_t vtype;
+        emit_pre_pop_reg(emit, &vtype, REG_TEMP0);
+        ASM_MOV_LOCAL_REG(emit->as, LOCAL_IDX_EXC_VAL(emit), REG_TEMP0);
+    }
+
     emit_native_pre(emit);
     // need to commit stack because we can jump here from elsewhere
     need_stack_settled(emit);
     mp_asm_base_label_assign(&emit->as->base, l);
     emit_post(emit);
+
+    if (is_finally) {
+        // Label is at start of finally handler: pop exception stack
+        emit_native_pop_exc_stack(emit, true);
+    }
+}
+
+STATIC void emit_native_global_exc_entry(emit_t *emit) {
+    // Note: 4 labels are reserved for this function, starting at *emit->label_slot
+
+    emit->exit_label = *emit->label_slot; // 1st reserved label: end of function (assigned in global_exc_exit)
+
+    if (NEED_GLOBAL_EXC_HANDLER(emit)) {
+        mp_uint_t nlr_label = *emit->label_slot + 1; // 2nd: point where the nlr context is (re)pushed
+        mp_uint_t start_label = *emit->label_slot + 2; // 3rd: start of the function body
+        mp_uint_t global_except_label = *emit->label_slot + 3; // 4th: reached when nlr_push returns non-zero
+
+        // Put PC of start code block into REG_LOCAL_1
+        ASM_MOV_REG_PCREL(emit->as, REG_LOCAL_1, start_label);
+
+        // Wrap everything in an nlr context
+        emit_native_label_assign(emit, nlr_label);
+        emit_get_stack_pointer_to_reg_for_push(emit, REG_ARG_1, sizeof(nlr_buf_t) / sizeof(uintptr_t));
+        emit_call(emit, MP_F_NLR_PUSH);
+        ASM_JUMP_IF_REG_NONZERO(emit->as, REG_RET, global_except_label, true);
+
+        // Zero the handler-PC slot, then jump to the address held in REG_LOCAL_1 to resume execution
+        ASM_XOR_REG_REG(emit->as, REG_TEMP0, REG_TEMP0);
+        ASM_MOV_LOCAL_REG(emit->as, LOCAL_IDX_EXC_HANDLER_PC(emit), REG_TEMP0);
+        ASM_JUMP_REG(emit->as, REG_LOCAL_1);
+
+        // Global exception handler: if the handler-PC slot is non-zero, go back to nlr_label with it in REG_LOCAL_1
+        emit_native_label_assign(emit, global_except_label);
+        ASM_MOV_REG_LOCAL(emit->as, REG_LOCAL_1, LOCAL_IDX_EXC_HANDLER_PC(emit));
+        ASM_JUMP_IF_REG_NONZERO(emit->as, REG_LOCAL_1, nlr_label, false);
+
+        // Handler-PC slot was zero: re-raise exception out to caller
+        ASM_MOV_REG_LOCAL(emit->as, REG_ARG_1, LOCAL_IDX_EXC_VAL(emit));
+        emit_call(emit, MP_F_NATIVE_RAISE);
+
+        // Label for start of function
+        emit_native_label_assign(emit, start_label);
+    }
+}
+
+STATIC void emit_native_global_exc_exit(emit_t *emit) {
+    // Label for end of function (emit_native_return_value jumps to exit_label)
+    emit_native_label_assign(emit, emit->exit_label);
+
+    if (NEED_GLOBAL_EXC_HANDLER(emit)) {
+        // Save return value in REG_LOCAL_1 across the call below (presumably the call may overwrite REG_RET)
+        ASM_MOV_REG_REG(emit->as, REG_LOCAL_1, REG_RET);
+
+        // Pop the nlr context
+        emit_call(emit, MP_F_NLR_POP);
+        adjust_stack(emit, -(mp_int_t)(sizeof(nlr_buf_t) / sizeof(uintptr_t))); // drop the nlr_buf_t words pushed at entry
+
+        // Restore return value
+        ASM_MOV_REG_REG(emit->as, REG_RET, REG_LOCAL_1);
+    }
+
+    ASM_EXIT(emit->as);
 }
 
 STATIC void emit_native_import_name(emit_t *emit, qstr qst) {
@@ -923,15 +1087,11 @@ STATIC void emit_native_load_fast(emit_t *emit, qstr qst, mp_uint_t local_num) {
         EMIT_NATIVE_VIPER_TYPE_ERROR(emit, "local '%q' used before type known", qst);
     }
     emit_native_pre(emit);
-    if (local_num < REG_LOCAL_NUM) {
+    if (local_num < REG_LOCAL_NUM && CAN_USE_REGS_FOR_LOCALS(emit)) {
         emit_post_push_reg(emit, vtype, reg_local_table[local_num]);
     } else {
         need_reg_single(emit, REG_TEMP0, 0);
-        if (emit->do_viper_types) {
-            ASM_MOV_REG_LOCAL(emit->as, REG_TEMP0, local_num - REG_LOCAL_NUM);
-        } else {
-            ASM_MOV_REG_LOCAL(emit->as, REG_TEMP0, STATE_START + emit->n_state - 1 - local_num);
-        }
+        ASM_MOV_REG_LOCAL(emit->as, REG_TEMP0, LOCAL_IDX_LOCAL_VAR(emit, local_num));
         emit_post_push_reg(emit, vtype, REG_TEMP0);
     }
 }
@@ -1160,15 +1320,11 @@ STATIC void emit_native_load_subscr(emit_t *emit) {
 
 STATIC void emit_native_store_fast(emit_t *emit, qstr qst, mp_uint_t local_num) {
     vtype_kind_t vtype;
-    if (local_num < REG_LOCAL_NUM) {
+    if (local_num < REG_LOCAL_NUM && CAN_USE_REGS_FOR_LOCALS(emit)) {
         emit_pre_pop_reg(emit, &vtype, reg_local_table[local_num]);
     } else {
         emit_pre_pop_reg(emit, &vtype, REG_TEMP0);
-        if (emit->do_viper_types) {
-            ASM_MOV_LOCAL_REG(emit->as, local_num - REG_LOCAL_NUM, REG_TEMP0);
-        } else {
-            ASM_MOV_LOCAL_REG(emit->as, STATE_START + emit->n_state - 1 - local_num, REG_TEMP0);
-        }
+        ASM_MOV_LOCAL_REG(emit->as, LOCAL_IDX_LOCAL_VAR(emit, local_num), REG_TEMP0);
     }
     emit_post(emit);
 
@@ -1601,13 +1757,10 @@ STATIC void emit_native_setup_with(emit_t *emit, mp_uint_t label) {
 
     // need to commit stack because we may jump elsewhere
     need_stack_settled(emit);
-    emit_get_stack_pointer_to_reg_for_push(emit, REG_ARG_1, sizeof(nlr_buf_t) / sizeof(mp_uint_t)); // arg1 = pointer to nlr buf
-    emit_call(emit, MP_F_NLR_PUSH);
-    ASM_JUMP_IF_REG_NONZERO(emit->as, REG_RET, label, true);
+    emit_native_push_exc_stack(emit, label, false);
 
-    emit_access_stack(emit, sizeof(nlr_buf_t) / sizeof(mp_uint_t) + 1, &vtype, REG_RET); // access return value of __enter__
-    emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET); // push return value of __enter__
-    // stack: (..., __exit__, self, as_value, nlr_buf, as_value)
+    emit_native_dup_top(emit);
+    // stack: (..., __exit__, self, as_value, as_value)
 }
 
 STATIC void emit_native_setup_block(emit_t *emit, mp_uint_t label, int kind) {
@@ -1616,22 +1769,19 @@ STATIC void emit_native_setup_block(emit_t *emit, mp_uint_t label, int kind) {
     } else {
         // Set up except and finally
         emit_native_pre(emit);
-        // need to commit stack because we may jump elsewhere
         need_stack_settled(emit);
-        emit_get_stack_pointer_to_reg_for_push(emit, REG_ARG_1, sizeof(nlr_buf_t) / sizeof(mp_uint_t)); // arg1 = pointer to nlr buf
-        emit_call(emit, MP_F_NLR_PUSH);
-        ASM_JUMP_IF_REG_NONZERO(emit->as, REG_RET, label, true);
+        emit_native_push_exc_stack(emit, label, kind == MP_EMIT_SETUP_BLOCK_FINALLY);
         emit_post(emit);
     }
 }
 
 STATIC void emit_native_with_cleanup(emit_t *emit, mp_uint_t label) {
-    // note: label+1 is available as an auxiliary label
+    // Note: 2 labels are reserved for this function, starting at *emit->label_slot
 
-    // stack: (..., __exit__, self, as_value, nlr_buf)
+    // stack: (..., __exit__, self, as_value)
     emit_native_pre(emit);
-    emit_call(emit, MP_F_NLR_POP);
-    adjust_stack(emit, -(mp_int_t)(sizeof(nlr_buf_t) / sizeof(mp_uint_t)) - 1);
+    emit_native_pop_exc_stack(emit, false);
+    adjust_stack(emit, -1);
     // stack: (..., __exit__, self)
 
     // call __exit__
@@ -1641,57 +1791,47 @@ STATIC void emit_native_with_cleanup(emit_t *emit, mp_uint_t label) {
     emit_get_stack_pointer_to_reg_for_pop(emit, REG_ARG_3, 5);
     emit_call_with_2_imm_args(emit, MP_F_CALL_METHOD_N_KW, 3, REG_ARG_1, 0, REG_ARG_2);
 
-    // jump to after with cleanup nlr_catch block
-    adjust_stack(emit, 1); // dummy nlr_buf.prev
-    emit_native_load_const_tok(emit, MP_TOKEN_KW_NONE); // nlr_buf.ret_val = no exception
-    emit_native_jump(emit, label + 1);
+    // Replace exc with None and finish
+    emit_native_jump(emit, *emit->label_slot);
 
     // nlr_catch
     emit_native_label_assign(emit, label);
 
-    // adjust stack counter for: __exit__, self, as_value
-    adjust_stack(emit, 3);
-    // stack: (..., __exit__, self, as_value, nlr_buf.prev, nlr_buf.ret_val)
+    // Pop with's exception handler
+    emit_native_pop_exc_stack(emit, true);
 
-    vtype_kind_t vtype;
-    emit_pre_pop_reg(emit, &vtype, REG_ARG_1); // get the thrown value (exc)
-    adjust_stack(emit, -2); // discard nlr_buf.prev and as_value
+    // Adjust stack counter for: __exit__, self (implicitly discard as_value which is above self)
+    emit_native_adjust_stack_size(emit, 2);
     // stack: (..., __exit__, self)
-    // REG_ARG_1=exc
-
-    emit_pre_pop_reg(emit, &vtype, REG_ARG_2); // self
-    emit_pre_pop_reg(emit, &vtype, REG_ARG_3); // __exit__
-    adjust_stack(emit, 1); // dummy nlr_buf.prev
-    emit_post_push_reg(emit, vtype, REG_ARG_1); // push exc to save it for later
-    emit_post_push_reg(emit, vtype, REG_ARG_3); // __exit__
-    emit_post_push_reg(emit, vtype, REG_ARG_2); // self
-    // stack: (..., exc, __exit__, self)
-    // REG_ARG_1=exc
 
+    ASM_MOV_REG_LOCAL(emit->as, REG_ARG_1, LOCAL_IDX_EXC_VAL(emit)); // get exc
     ASM_LOAD_REG_REG_OFFSET(emit->as, REG_ARG_2, REG_ARG_1, 0); // get type(exc)
     emit_post_push_reg(emit, VTYPE_PYOBJ, REG_ARG_2); // push type(exc)
     emit_post_push_reg(emit, VTYPE_PYOBJ, REG_ARG_1); // push exc value
     emit_post_push_imm(emit, VTYPE_PYOBJ, (mp_uint_t)mp_const_none); // traceback info
-    // stack: (..., exc, __exit__, self, type(exc), exc, traceback)
+    // Stack: (..., __exit__, self, type(exc), exc, traceback)
 
     // call __exit__ method
     emit_get_stack_pointer_to_reg_for_pop(emit, REG_ARG_3, 5);
     emit_call_with_2_imm_args(emit, MP_F_CALL_METHOD_N_KW, 3, REG_ARG_1, 0, REG_ARG_2);
-    // stack: (..., exc)
+    // Stack: (...)
 
-    // if REG_RET is true then we need to replace top-of-stack with None (swallow exception)
+    // If REG_RET is true then we need to replace exception with None (swallow exception)
     if (REG_ARG_1 != REG_RET) {
         ASM_MOV_REG_REG(emit->as, REG_ARG_1, REG_RET);
     }
     emit_call(emit, MP_F_OBJ_IS_TRUE);
-    ASM_JUMP_IF_REG_ZERO(emit->as, REG_RET, label + 1, true);
+    ASM_JUMP_IF_REG_ZERO(emit->as, REG_RET, *emit->label_slot + 1, true);
 
-    // replace exc with None
-    emit_pre_pop_discard(emit);
-    emit_post_push_imm(emit, VTYPE_PYOBJ, (mp_uint_t)mp_const_none);
+    // Replace exception with None
+    emit_native_label_assign(emit, *emit->label_slot);
+    ASM_MOV_REG_IMM(emit->as, REG_TEMP0, (mp_uint_t)mp_const_none);
+    ASM_MOV_LOCAL_REG(emit->as, LOCAL_IDX_EXC_VAL(emit), REG_TEMP0);
 
     // end of with cleanup nlr_catch block
-    emit_native_label_assign(emit, label + 1);
+    emit_native_label_assign(emit, *emit->label_slot + 1);
+
+    // Exception is in nlr_buf.ret_val slot
 }
 
 STATIC void emit_native_end_finally(emit_t *emit) {
@@ -1700,9 +1840,8 @@ STATIC void emit_native_end_finally(emit_t *emit) {
     //   if exc == None: pass
     //   else: raise exc
     // the check if exc is None is done in the MP_F_NATIVE_RAISE stub
-    vtype_kind_t vtype;
-    emit_pre_pop_reg(emit, &vtype, REG_ARG_1); // get nlr_buf.ret_val
-    emit_pre_pop_discard(emit); // discard nlr_buf.prev
+    emit_native_pre(emit);
+    ASM_MOV_REG_LOCAL(emit->as, REG_ARG_1, LOCAL_IDX_EXC_VAL(emit));
     emit_call(emit, MP_F_NATIVE_RAISE);
     emit_post(emit);
 }
@@ -1749,8 +1888,9 @@ STATIC void emit_native_for_iter_end(emit_t *emit) {
 
 STATIC void emit_native_pop_block(emit_t *emit) {
     emit_native_pre(emit);
-    emit_call(emit, MP_F_NLR_POP);
-    adjust_stack(emit, -(mp_int_t)(sizeof(nlr_buf_t) / sizeof(mp_uint_t)) + 1);
+    if (!emit->exc_stack[emit->exc_stack_size - 1].is_finally) { // finally entries are handled by emit_native_label_assign
+        emit_native_pop_exc_stack(emit, false); // update the handler-PC slot but keep the entry on the stack
+    }
     emit_post(emit);
 }
 
@@ -2176,7 +2316,7 @@ STATIC void emit_native_return_value(emit_t *emit) {
         assert(vtype == VTYPE_PYOBJ);
     }
     emit->last_emit_was_return_value = true;
-    ASM_EXIT(emit->as);
+    ASM_JUMP(emit->as, emit->exit_label);
 }
 
 STATIC void emit_native_raise_varargs(emit_t *emit, mp_uint_t n_args) {
@@ -2198,14 +2338,16 @@ STATIC void emit_native_yield(emit_t *emit, int kind) {
 }
 
 STATIC void emit_native_start_except_handler(emit_t *emit) {
-    // This instruction follows a pop_block call, so the stack counter is up by one when really
-    // it should be up by a whole nlr_buf_t.  We then want to pop the nlr_buf_t here, but save
-    // the first 2 elements, so we can get the thrown value.
-    adjust_stack(emit, 1);
+    // Protected block has finished so pop the exception stack
+    emit_native_pop_exc_stack(emit, true);
+
+    // Load the exception from the nlr_buf.ret_val slot and push it as a Python object
+    ASM_MOV_REG_LOCAL(emit->as, REG_TEMP0, LOCAL_IDX_EXC_VAL(emit));
+    emit_post_push_reg(emit, VTYPE_PYOBJ, REG_TEMP0);
 }
 
 STATIC void emit_native_end_except_handler(emit_t *emit) {
-    (void)emit;
+    adjust_stack(emit, -1); // drop the exception pushed by start_except_handler (end_finally reads the exception slot, not the stack)
 }
 
 const emit_method_table_t EXPORT_FUN(method_table) = {
diff --git a/py/emitnthumb.c b/py/emitnthumb.c
index 2b68ca3a13f7157b412e41382599262bcfd13658..e1dc4976d7382ac2a4ba4daa5562e28f5083a3d9 100644
--- a/py/emitnthumb.c
+++ b/py/emitnthumb.c
@@ -8,6 +8,9 @@
 #define GENERIC_ASM_API (1)
 #include "py/asmthumb.h"
 
+// Word index of REG_LOCAL_1(=r4) in nlr_buf_t
+#define NLR_BUF_IDX_LOCAL_1 (3)
+
 #define N_THUMB (1)
 #define EXPORT_FUN(name) emit_native_thumb_##name
 #include "py/emitnative.c"
diff --git a/py/emitnx64.c b/py/emitnx64.c
index b9800f636e087785843841de9bf6b9ebb0a5084e..5b04a50f546dd74a117f36d3b269c40885e6ea8a 100644
--- a/py/emitnx64.c
+++ b/py/emitnx64.c
@@ -8,6 +8,9 @@
 #define GENERIC_ASM_API (1)
 #include "py/asmx64.h"
 
+// Word index of REG_LOCAL_1(=rbx) in nlr_buf_t
+#define NLR_BUF_IDX_LOCAL_1 (5)
+
 #define N_X64 (1)
 #define EXPORT_FUN(name) emit_native_x64_##name
 #include "py/emitnative.c"
diff --git a/py/emitnx86.c b/py/emitnx86.c
index 5d2bbb267a1661fbc899a8120c2f200a149902d5..4c192069d845e2e8ecdc684bb6154ea0c235fb10 100644
--- a/py/emitnx86.c
+++ b/py/emitnx86.c
@@ -9,6 +9,9 @@
 #define GENERIC_ASM_API (1)
 #include "py/asmx86.h"
 
+// Word index of REG_LOCAL_1(=ebx) in nlr_buf_t
+#define NLR_BUF_IDX_LOCAL_1 (5)
+
 // x86 needs a table to know how many args a given function has
 STATIC byte mp_f_n_args[MP_F_NUMBER_OF] = {
     [MP_F_CONVERT_OBJ_TO_NATIVE] = 2,
diff --git a/py/emitnxtensa.c b/py/emitnxtensa.c
index 1a423e21eba3e8eb87762df14d7ee4f1bd017c48..89ecb34de56954f4e10b4356ab077455fb4e9bb5 100644
--- a/py/emitnxtensa.c
+++ b/py/emitnxtensa.c
@@ -8,6 +8,9 @@
 #define GENERIC_ASM_API (1)
 #include "py/asmxtensa.h"
 
+// Word index of REG_LOCAL_1(=a12) in nlr_buf_t
+#define NLR_BUF_IDX_LOCAL_1 (8)
+
 #define N_XTENSA (1)
 #define EXPORT_FUN(name) emit_native_xtensa_##name
 #include "py/emitnative.c"