diff --git a/py/nlr.h b/py/nlr.h
index b824e4a63a3a58e3ac3caa3a15f02522f41dbb96..fa4e2f45f0ba616908258aa7001e5169c1aa20d0 100644
--- a/py/nlr.h
+++ b/py/nlr.h
@@ -3,9 +3,9 @@
 
 #include <limits.h>
 
-#ifndef __WORDSIZE
-#error __WORDSIZE needs to be defined
-#endif
+//#ifndef __WORDSIZE
+//#error __WORDSIZE needs to be defined
+//#endif
 
 typedef struct _nlr_buf_t nlr_buf_t;
 struct _nlr_buf_t {
@@ -17,7 +17,9 @@ struct _nlr_buf_t {
 #elif __WORDSIZE == 64
     void *regs[8];
 #else
-#error Unsupported __WORDSIZE
+    // hack for thumb: 10 words, matching what nlrthumb.s saves (lr, r4-r11, sp)
+    void *regs[10];
+//#error Unsupported __WORDSIZE
 #endif
 };
 
diff --git a/py/nlrthumb.s b/py/nlrthumb.s
new file mode 100644
index 0000000000000000000000000000000000000000..d4d1bff233502f5aaff97fe19146cc0f1e63832f
--- /dev/null
+++ b/py/nlrthumb.s
@@ -0,0 +1,91 @@
+@ thumb callee-saved state kept in nlr_buf: lr, r4-r11, r13 (sp)
+
+    .syntax unified
+    .cpu cortex-m4
+    .thumb
+    .text
+    .align  2
+
+@ uint nlr_push(r0=nlr_buf_t *nlr): save lr, r4-r11, sp into nlr, link nlr as new nlr_top, return 0
+    .global nlr_push
+    .thumb
+    .thumb_func
+    .type   nlr_push, %function
+nlr_push:
+    str     lr, [r0, #8]            @ store lr into nlr_buf (nlr_jump later returns through this lr)
+    str     r4, [r0, #12]           @ store r4 into nlr_buf
+    str     r5, [r0, #16]           @ store r5 into nlr_buf
+    str     r6, [r0, #20]           @ store r6 into nlr_buf
+    str     r7, [r0, #24]           @ store r7 into nlr_buf
+    str     r8, [r0, #28]           @ store r8 into nlr_buf
+    str     r9, [r0, #32]           @ store r9 into nlr_buf
+    str     r10, [r0, #36]          @ store r10 into nlr_buf
+    str     r11, [r0, #40]          @ store r11 into nlr_buf
+    str     r13, [r0, #44]          @ store r13=sp into nlr_buf
+    @ NOTE(review): VFP regs s16-s31 are not saved here; confirm that is OK with -mfloat-abi=hard
+    ldr     r3, .L2                 @ load addr of nlr_top
+    ldr     r2, [r3]                @ load nlr_top
+    str     r2, [r0]                @ store old nlr_top into nlr_buf offset 0 (prev link)
+    str     r0, [r3]                @ store nlr_buf into nlr_top (to link list)
+
+    movs    r0, #0                  @ return 0, normal return
+    bx      lr                      @ return
+    .align  2
+.L2:
+    .word   .LANCHOR0               @ address of nlr_top
+    .size   nlr_push, .-nlr_push
+
+@ void nlr_pop(): unlink the current top nlr_buf by setting nlr_top to its prev link
+    .global nlr_pop
+    .thumb
+    .thumb_func
+    .type   nlr_pop, %function
+nlr_pop:
+    ldr     r3, .L5                 @ load addr of nlr_top
+    ldr     r2, [r3]                @ load nlr_top (not checked for NULL)
+    ldr     r2, [r2]                @ load prev nlr_buf (word at offset 0)
+    str     r2, [r3]                @ store prev nlr_buf to nlr_top (to unlink list)
+    bx      lr                      @ return
+    .align    2
+.L5:
+    .word    .LANCHOR0              @ address of nlr_top
+    .size   nlr_pop, .-nlr_pop
+
+@ void nlr_jump(r0=uint val): store val in the top nlr_buf, unlink it, restore saved regs, return 1 from its nlr_push
+    .global nlr_jump
+    .thumb
+    .thumb_func
+    .type   nlr_jump, %function
+nlr_jump:
+    ldr     r3, .L6                 @ load addr of nlr_top (this function's own literal, not nlr_push's .L2)
+    ldr     r2, [r3]                @ load nlr_top (not checked for NULL)
+    str     r0, [r2, #4]            @ store return value
+    ldr     r0, [r2]                @ load prev nlr_buf
+    str     r0, [r3]                @ store prev nlr_buf into nlr_top (to unlink list)
+
+    ldr     lr, [r2, #8]            @ load lr from nlr_buf
+    ldr     r4, [r2, #12]           @ load r4 from nlr_buf
+    ldr     r5, [r2, #16]           @ load r5 from nlr_buf
+    ldr     r6, [r2, #20]           @ load r6 from nlr_buf
+    ldr     r7, [r2, #24]           @ load r7 from nlr_buf
+    ldr     r8, [r2, #28]           @ load r8 from nlr_buf
+    ldr     r9, [r2, #32]           @ load r9 from nlr_buf
+    ldr     r10, [r2, #36]          @ load r10 from nlr_buf
+    ldr     r11, [r2, #40]          @ load r11 from nlr_buf
+    ldr     r13, [r2, #44]          @ load r13=sp from nlr_buf
+
+    movs    r0, #1                  @ return 1, non-local return
+    bx      lr                      @ return through the lr saved by nlr_push
+    .align    2
+.L6:
+    .word    .LANCHOR0              @ address of nlr_top
+    .size   nlr_jump, .-nlr_jump
+
+@ local variable nlr_top: head of the nlr_buf linked list (4 bytes in .bss)
+    .bss
+    .align  2
+    .set    .LANCHOR0,. + 0
+    .type   nlr_top, %object
+    .size   nlr_top, 4
+nlr_top:
+    .space  4
diff --git a/stm/Makefile b/stm/Makefile
index 23bcffc134bbf7300337c6606304387091a6a102..ee1453da690211443a98e4f5b7cbd4b38bb0d074 100644
--- a/stm/Makefile
+++ b/stm/Makefile
@@ -8,7 +8,6 @@ CC = arm-none-eabi-gcc
 LD = arm-none-eabi-ld
 CFLAGS_CORTEX_M4 = -mthumb -mtune=cortex-m4 -mabi=aapcs-linux -mcpu=cortex-m4 -mfloat-abi=hard -DSTM32F40XX -DHSE_VALUE=8000000
 CFLAGS = -I. -I$(PYSRC) -I$(FATFSSRC) -I$(STMSRC) -Wall -ansi -std=gnu99 -Os -DNDEBUG $(CFLAGS_CORTEX_M4)
-CFLAGS_PY = -DEMIT_ENABLE_THUMB
 LDFLAGS = --nostdlib -T stm32f405.ld
 
 SRC_C = \
@@ -27,7 +26,8 @@ SRC_S = \
 	startup_stm32f40xx.s \
 
 PY_O = \
-#	malloc.o \
+	nlrthumb.o \
+	malloc.o \
 	qstr.o \
 	misc.o \
 	lexer.o \
@@ -109,15 +109,18 @@ $(BUILD)/%.o: $(FATFSSRC)/%.c
 $(BUILD)/%.o: $(STMSRC)/%.c
 	$(CC) $(CFLAGS) -c -o $@ $<
 
+$(BUILD)/%.o: $(PYSRC)/%.s
+	$(AS) -c -o $@ $<
+
 $(BUILD)/%.o: $(PYSRC)/%.c mpyconfig.h
-	$(CC) $(CFLAGS) $(CFLAGS_PY) -c -o $@ $<
+	$(CC) $(CFLAGS) -c -o $@ $<
 
 $(BUILD)/emitnthumb.o: $(PYSRC)/emitnative.c $(PYSRC)/emit.h
-	$(CC) $(CFLAGS) $(CFLAGS_PY) -DN_THUMB -c -o $@ $<
+	$(CC) $(CFLAGS) -DN_THUMB -c -o $@ $<
 
 # optimising vm for speed, adds only a small amount to code size but makes a huge difference to speed (20% faster)
 $(BUILD)/vm.o: $(PYSRC)/vm.c
-	$(CC) $(CFLAGS) $(CFLAGS_PY) -O3 -c -o $@ $<
+	$(CC) $(CFLAGS) -O3 -c -o $@ $<
 
 $(BUILD)/parse.o: $(PYSRC)/grammar.h
 $(BUILD)/compile.o: $(PYSRC)/grammar.h
diff --git a/stm/main.c b/stm/main.c
index 0d761b8cb8199a6198d628a7175d8ddbd8ab327e..9eb31db537a4d97b525489f808a31641820461d4 100644
--- a/stm/main.c
+++ b/stm/main.c
@@ -418,7 +418,6 @@ void __fatal_error(const char *msg) {
 #include "compile.h"
 #include "runtime.h"
 
-/*
 py_obj_t pyb_delay(py_obj_t count) {
     delay_ms(rt_get_int(count));
     return py_const_none;
@@ -436,19 +435,44 @@ py_obj_t pyb_sw() {
         return py_const_false;
     }
 }
-*/
-
-#include "asmthumb.h"
-typedef void (*fun_t)();
 
 #include "ff.h"
 FATFS fatfs0;
 
+#include "nlr.h"
+void g(uint i) { // nlr test helper: print i, then do a non-local return carrying 42+i when i is odd
+    printf("g:%u\n", i); // i is unsigned, so the matching conversion is %u
+    if (i & 1) {
+        nlr_jump((void*)(42 + i)); // does not come back here; control resumes at the pending nlr_push
+    }
+}
+void f() { // nlr test driver: for i = 0..3, run g(i) guarded by nlr_push and report which path was taken
+    nlr_buf_t nlr;
+    int i;
+    for (i = 0; i < 4; i++) {
+        printf("f:loop:%d:%p\n", i, &nlr);
+        if (nlr_push(&nlr) == 0) {
+            // normal path: nlr_push has just saved the register state and linked nlr
+            //printf("a:%p:%p %p %p %u\n", &nlr, nlr.ip, nlr.sp, nlr.prev, nlr.ret_val);
+            g(i); // calls nlr_jump when i is odd, in which case the two lines below are skipped
+            printf("f:lp:%d:nrm\n", i);
+            nlr_pop(); // no jump happened, so unlink nlr here
+        } else {
+            // non-local return path: nlr_jump already unlinked nlr and made nlr_push return non-zero
+            //printf("b:%p:%p %p %p %u\n", &nlr, nlr.ip, nlr.sp, nlr.prev, nlr.ret_val);
+            printf("f:lp:%d:nlr:%d\n", i, (int)nlr.ret_val); // presumably the value g passed to nlr_jump
+        }
+    }
+}
+void nlr_test() { // runs the nlr_push/nlr_pop/nlr_jump self-test implemented by f
+    f(); // f is defined with no parameters, so it must be called with no arguments
+}
+
 int main() {
     // should disable JTAG
 
-    //qstr_init();
-    //rt_init();
+    qstr_init();
+    rt_init();
 
     gpio_init();
     led_init();
@@ -503,9 +527,11 @@ int main() {
     //printf("init;al=%u\n", m_get_total_bytes_allocated()); // 1600, due to qstr_init
     //delay_ms(1000);
 
-    #if 0
+    nlr_test();
+
+    #if 1
     // Python!
-    if (0) {
+    if (1) {
         //const char *pysrc = "def f():\n  x=x+1\nprint(42)\n";
         const char *pysrc =
             // impl01.py
@@ -521,6 +547,7 @@ int main() {
             "    x = x + 1\n";
             */
             // impl02.py
+            /*
             "#@micropython.native\n"
             "def f():\n"
             "    x = 0\n"
@@ -533,6 +560,7 @@ int main() {
             "            y = y + 1\n"
             "        x = x + 1\n"
             "f()\n";
+            */
             /*
             "print('in python!')\n"
             "x = 0\n"
@@ -573,6 +601,23 @@ int main() {
             "        x = x + 1\n"
             "flash(20)\n";
             */
+            // impl18.py
+            /*
+            "# basic exceptions\n"
+            "x = 1\n"
+            "try:\n"
+            "    x.a()\n"
+            "except:\n"
+            "    print(x)\n";
+            */
+            // impl19.py
+            "# for loop\n"
+            "def f():\n"
+            "    for x in range(400):\n"
+            "        for y in range(400):\n"
+            "            for z in range(400):\n"
+            "                pass\n"
+            "f()\n";
 
         py_lexer_t *lex = py_lexer_from_str_len("<>", pysrc, strlen(pysrc), false);
 
@@ -605,17 +650,30 @@ int main() {
 
                 py_obj_t module_fun = rt_make_function_from_id(1);
 
+                // flash once
                 led_state(PYB_LEDG1_PORT_NUM, 1);
                 delay_ms(100);
                 led_state(PYB_LEDG1_PORT_NUM, 0);
-                py_obj_t ret = rt_call_function_0(module_fun);
+
+                nlr_buf_t nlr;
+                if (nlr_push(&nlr) == 0) {
+                    py_obj_t ret = rt_call_function_0(module_fun);
+                    printf("done! got: ");
+                    py_obj_print(ret);
+                    printf("\n");
+                    nlr_pop();
+                } else {
+                    // uncaught exception
+                    printf("exception: ");
+                    py_obj_print((py_obj_t)nlr.ret_val);
+                    printf("\n");
+                }
+
+                // flash once
                 led_state(PYB_LEDG1_PORT_NUM, 1);
                 delay_ms(100);
                 led_state(PYB_LEDG1_PORT_NUM, 0);
 
-                printf("done! got: ");
-                py_obj_print(ret);
-                printf("\n");
                 delay_ms(1000);
                 printf("nalloc=%u\n", m_get_total_bytes_allocated());
                 delay_ms(1000);
@@ -690,7 +748,7 @@ int main() {
     }
 
     // fatfs testing
-    if (1) {
+    if (0) {
         FRESULT res = f_mount(&fatfs0, "0:", 1);
         if (res == FR_OK) {
             printf("mount success\n");
@@ -730,7 +788,7 @@ int main() {
         DWORD nclst;
         FATFS *fatfs;
         f_getfree("0:", &nclst, &fatfs);
-        printf("free=%d\n", nclst * fatfs->csize * 512);
+        printf("free=%u\n", (uint)(nclst * fatfs->csize * 512));
 
     }
 
@@ -745,7 +803,7 @@ int main() {
     }
 
     // USB testing
-    if (1) {
+    if (0) {
         void usb_init();
         usb_init();
     }