diff --git a/py/Makefile b/py/Makefile
index cb1b8d9452a853a030a95e46b79ad5f564d454db..4d36f3c9760e2badbcefa335dfbb32cc7506666e 100644
--- a/py/Makefile
+++ b/py/Makefile
@@ -19,6 +19,7 @@ SRC = \
 	emitx64.c \
 	emitthumb.c \
 	asmthumb.c \
+	emitinlinethumb.c \
 	runtime.c \
 	vm.c \
 	main.c \
diff --git a/py/asmthumb.c b/py/asmthumb.c
index 9850bec30c4d62aceeee33f0e2fa69c4f98a932f..91fff3955c284d0dff4d23c2f635c499eed9d954 100644
--- a/py/asmthumb.c
+++ b/py/asmthumb.c
@@ -20,7 +20,6 @@ struct _asm_thumb_t {
     byte *code_base;
     byte dummy_data[8];
 
-    int next_label;
     int max_num_labels;
     int *label_offsets;
     int num_locals;
@@ -65,7 +64,6 @@ void asm_thumb_free(asm_thumb_t *as, bool free_code) {
 void asm_thumb_start_pass(asm_thumb_t *as, int pass) {
     as->pass = pass;
     as->code_offset = 0;
-    as->next_label = 1;
     if (pass == ASM_THUMB_PASS_2) {
         memset(as->label_offsets, -1, as->max_num_labels * sizeof(int));
     }
@@ -212,10 +210,6 @@ void asm_thumb_exit(asm_thumb_t *as) {
     asm_thumb_write_op16(as, OP_POP_RLIST_PC(as->push_reglist));
 }
 
-int asm_thumb_label_new(asm_thumb_t *as) {
-    return as->next_label++;
-}
-
 void asm_thumb_label_assign(asm_thumb_t *as, int label) {
     assert(label < as->max_num_labels);
     if (as->pass == ASM_THUMB_PASS_2) {
@@ -234,43 +228,33 @@ static int get_label_dest(asm_thumb_t *as, int label) {
     return as->label_offsets[label];
 }
 
-// the i8 value will be zero extended into the r32 register!
-void asm_thumb_mov_reg_i8(asm_thumb_t *as, uint rlo_dest, int i8) {
+#define OP_MOVS_RLO_I8(rlo_dest, i8_src) (0x2000 | ((rlo_dest) << 8) | (i8_src))
+
+// the i8_src value will be zero extended into the r32 register!
+void asm_thumb_movs_rlo_i8(asm_thumb_t *as, uint rlo_dest, int i8_src) {
     assert(rlo_dest < REG_R8);
-    // movs rlo_dest, #i8
-    asm_thumb_write_op16(as, 0x2000 | (rlo_dest << 8) | i8);
+    // movs rlo_dest, #i8_src
+    asm_thumb_write_op16(as, OP_MOVS_RLO_I8(rlo_dest, i8_src));
 }
 
-// if loading lo half, the i16 value will be zero extended into the r32 register!
-void asm_thumb_mov_i16_to_reg(asm_thumb_t *as, int i16, uint reg_dest, bool load_hi_half) {
+#define OP_MOVW (0xf240)
+#define OP_MOVT (0xf2c0)
+
+// if loading lo half with movw, the i16 value will be zero extended into the r32 register!
+static void asm_thumb_mov_reg_i16(asm_thumb_t *as, uint mov_op, uint reg_dest, int i16_src) {
     assert(reg_dest < REG_R15);
-    uint op;
-    if (load_hi_half) {
-        // movt reg_dest, #i16
-        op = 0xf2c0;
-    } else {
-        // movw reg_dest, #i16
-        op = 0xf240;
-    }
-    asm_thumb_write_op32(as, op | ((i16 >> 1) & 0x0400) | ((i16 >> 12) & 0xf), ((i16 << 4) & 0x7000) | (reg_dest << 8) | (i16 & 0xff));
+    // mov[wt] reg_dest, #i16_src
+    asm_thumb_write_op32(as, mov_op | ((i16_src >> 1) & 0x0400) | ((i16_src >> 12) & 0xf), ((i16_src << 4) & 0x7000) | (reg_dest << 8) | (i16_src & 0xff));
 }
 
-void asm_thumb_mov_reg_i32(asm_thumb_t *as, uint reg_dest, machine_uint_t i32) {
-    // movw, movt does it in 8 bytes
-    // ldr [pc, #], dw does it in 6 bytes, but we might not reach to end of code for dw
-
-    asm_thumb_mov_i16_to_reg(as, i32, reg_dest, false);
-    asm_thumb_mov_i16_to_reg(as, i32 >> 16, reg_dest, true);
+// the i16_src value will be zero extended into the r32 register!
+void asm_thumb_movw_reg_i16(asm_thumb_t *as, uint reg_dest, int i16_src) {
+    asm_thumb_mov_reg_i16(as, OP_MOVW, reg_dest, i16_src);
 }
 
-void asm_thumb_mov_reg_i32_optimised(asm_thumb_t *as, uint reg_dest, int i32) {
-    if (reg_dest < 8 && UNSIGNED_FIT8(i32)) {
-        asm_thumb_mov_reg_i8(as, reg_dest, i32);
-    } else if (UNSIGNED_FIT16(i32)) {
-        asm_thumb_mov_i16_to_reg(as, i32, reg_dest, false);
-    } else {
-        asm_thumb_mov_reg_i32(as, reg_dest, i32);
-    }
+// movt reg_dest, #i16_src: sets the top 16 bits of the register and keeps the low half (asm_thumb_mov_reg_i32 relies on this)
+void asm_thumb_movt_reg_i16(asm_thumb_t *as, uint reg_dest, int i16_src) {
+    asm_thumb_mov_reg_i16(as, OP_MOVT, reg_dest, i16_src);
+}
 
 void asm_thumb_mov_reg_reg(asm_thumb_t *as, uint reg_dest, uint reg_src) {
@@ -285,9 +269,69 @@ void asm_thumb_mov_reg_reg(asm_thumb_t *as, uint reg_dest, uint reg_src) {
     } else {
         op_lo |= 0x80 | (reg_dest - 8);
     }
+    // mov reg_dest, reg_src
     asm_thumb_write_op16(as, 0x4600 | op_lo);
 }
 
+#define OP_SUBS_RLO_RLO_I3(rlo_dest, rlo_src, i3_src) (0x1e00 | ((i3_src) << 6) | ((rlo_src) << 3) | (rlo_dest))
+// emits: subs rlo_dest, rlo_src, #i3_src (16-bit encoding; both registers must be below REG_R8)
+void asm_thumb_subs_rlo_rlo_i3(asm_thumb_t *as, uint rlo_dest, uint rlo_src, int i3_src) {
+    assert(rlo_dest < REG_R8);
+    assert(rlo_src < REG_R8);
+    asm_thumb_write_op16(as, OP_SUBS_RLO_RLO_I3(rlo_dest, rlo_src, i3_src)); // i3_src is neither range checked nor masked here
+}
+
+#define OP_CMP_RLO_I8(rlo, i8) (0x2800 | ((rlo) << 8) | (i8))
+// emits: cmp rlo, #i8 (16-bit encoding; rlo must be below REG_R8)
+void asm_thumb_cmp_rlo_i8(asm_thumb_t *as, uint rlo, int i8) {
+    assert(rlo < REG_R8);
+    asm_thumb_write_op16(as, OP_CMP_RLO_I8(rlo, i8)); // i8 is neither range checked nor masked here
+}
+
+#define OP_BEQ_N(byte_offset) (0xd000 | (((byte_offset) >> 1) & 0x00ff))
+#define OP_BNE_N(byte_offset) (0xd100 | (((byte_offset) >> 1) & 0x00ff))
+#define OP_BCS_N(byte_offset) (0xd200 | (((byte_offset) >> 1) & 0x00ff))
+#define OP_BCC_N(byte_offset) (0xd300 | (((byte_offset) >> 1) & 0x00ff))
+#define OP_BMI_N(byte_offset) (0xd400 | (((byte_offset) >> 1) & 0x00ff))
+#define OP_BPL_N(byte_offset) (0xd500 | (((byte_offset) >> 1) & 0x00ff))
+#define OP_BVS_N(byte_offset) (0xd600 | (((byte_offset) >> 1) & 0x00ff))
+#define OP_BVC_N(byte_offset) (0xd700 | (((byte_offset) >> 1) & 0x00ff))
+#define OP_BHI_N(byte_offset) (0xd800 | (((byte_offset) >> 1) & 0x00ff))
+#define OP_BLS_N(byte_offset) (0xd900 | (((byte_offset) >> 1) & 0x00ff))
+#define OP_BGE_N(byte_offset) (0xda00 | (((byte_offset) >> 1) & 0x00ff))
+#define OP_BLT_N(byte_offset) (0xdb00 | (((byte_offset) >> 1) & 0x00ff))
+#define OP_BGT_N(byte_offset) (0xdc00 | (((byte_offset) >> 1) & 0x00ff))
+#define OP_BLE_N(byte_offset) (0xdd00 | (((byte_offset) >> 1) & 0x00ff))
+// bgt label, 16-bit encoding: reaches a signed 9-bit byte offset; always emits 2 bytes so the code size is the same on every pass
+void asm_thumb_bgt_n(asm_thumb_t *as, int label) {
+    int dest = get_label_dest(as, label);
+    int rel = dest - as->code_offset;
+    rel -= 4; // account for instruction prefetch, PC is 4 bytes ahead of this instruction
+    if (dest >= 0 && !SIGNED_FIT9(rel)) {
+        // dest < 0 means the label is not assigned yet (forward reference), so only a resolved label can be out of range
+        printf("asm_thumb_bgt_n: branch does not fit in 9 bits\n");
+    }
+    asm_thumb_write_op16(as, OP_BGT_N(rel)); // OP_BGT_N masks rel, so an unresolved or oversized rel cannot spill into the opcode bits
+}
+
+void asm_thumb_mov_reg_i32(asm_thumb_t *as, uint reg_dest, machine_uint_t i32) {
+    // movw, movt does it in 8 bytes
+    // ldr [pc, #], dw does it in 6 bytes, but we might not reach to end of code for dw
+
+    asm_thumb_mov_reg_i16(as, OP_MOVW, reg_dest, i32);
+    asm_thumb_mov_reg_i16(as, OP_MOVT, reg_dest, i32 >> 16);
+}
+
+void asm_thumb_mov_reg_i32_optimised(asm_thumb_t *as, uint reg_dest, int i32) { // load constant i32 into reg_dest using the shortest encoding below
+    if (reg_dest < 8 && UNSIGNED_FIT8(i32)) {
+        asm_thumb_movs_rlo_i8(as, reg_dest, i32); // 16-bit movs; only valid for registers below REG_R8
+    } else if (UNSIGNED_FIT16(i32)) {
+        asm_thumb_mov_reg_i16(as, OP_MOVW, reg_dest, i32); // single 32-bit movw, value is zero extended
+    } else {
+        asm_thumb_mov_reg_i32(as, reg_dest, i32); // general case: movw + movt pair
+    }
+}
+
 #define OP_STR_TO_SP_OFFSET(rlo_dest, word_offset) (0x9000 | ((rlo_dest) << 8) | ((word_offset) & 0x00ff))
 #define OP_LDR_FROM_SP_OFFSET(rlo_dest, word_offset) (0x9800 | ((rlo_dest) << 8) | ((word_offset) & 0x00ff))
 
@@ -351,7 +395,6 @@ void asm_thumb_b_label(asm_thumb_t *as, int label) {
     }
 }
 
-#define OP_CMP_REG_IMM(rlo, i8) (0x2800 | ((rlo) << 8) | (i8))
 // all these bit arithmetics need coverage testing!
 #define OP_BEQ(byte_offset) (0xd000 | (((byte_offset) >> 1) & 0x00ff))
 #define OP_BEQW_HI(byte_offset) (0xf000 | (((byte_offset) >> 10) & 0x0400) | (((byte_offset) >> 14) & 0x003f))
@@ -361,7 +404,7 @@ void asm_thumb_cmp_reg_bz_label(asm_thumb_t *as, uint rlo, int label) {
     assert(rlo < REG_R8);
 
     // compare reg with 0
-    asm_thumb_write_op16(as, OP_CMP_REG_IMM(rlo, 0));
+    asm_thumb_write_op16(as, OP_CMP_RLO_I8(rlo, 0));
 
     // branch if equal
     int dest = get_label_dest(as, label);
@@ -369,7 +412,7 @@ void asm_thumb_cmp_reg_bz_label(asm_thumb_t *as, uint rlo, int label) {
     rel -= 4; // account for instruction prefetch, PC is 4 bytes ahead of this instruction
     if (dest >= 0 && rel <= -4) {
         // is a backwards jump, so we know the size of the jump on the first pass
-        // calculate rel assuming 12 bit relative jump
+        // calculate rel assuming 9 bit relative jump
         if (SIGNED_FIT9(rel)) {
             asm_thumb_write_op16(as, OP_BEQ(rel));
         } else {
diff --git a/py/asmthumb.h b/py/asmthumb.h
index bbf4ef20b39a0577e19e520fe495ed372ef85152..d712063871b18fe36ceaf81b726adf72ef5f9e85 100644
--- a/py/asmthumb.h
+++ b/py/asmthumb.h
@@ -38,23 +38,30 @@ void *asm_thumb_get_code(asm_thumb_t *as);
 void asm_thumb_entry(asm_thumb_t *as, int num_locals);
 void asm_thumb_exit(asm_thumb_t *as);
 
-int asm_thumb_label_new(asm_thumb_t *as);
 void asm_thumb_label_assign(asm_thumb_t *as, int label);
 
 // argument order follows ARM, in general dest is first
+// note there is a difference between movw and mov.w, and many others!
 
-void asm_thumb_mov_reg_i8(asm_thumb_t *as, uint rlo_dest, int i8_src);
-void asm_thumb_mov_reg_i32(asm_thumb_t *as, uint reg_dest, machine_uint_t i32_src);
-void asm_thumb_mov_reg_i32_optimised(asm_thumb_t *as, uint reg_dest, int i32_src);
+void asm_thumb_movs_rlo_i8(asm_thumb_t *as, uint rlo_dest, int i8_src);
+void asm_thumb_movw_reg_i16(asm_thumb_t *as, uint reg_dest, int i16_src);
+void asm_thumb_movt_reg_i16(asm_thumb_t *as, uint reg_dest, int i16_src);
 void asm_thumb_mov_reg_reg(asm_thumb_t *as, uint reg_dest, uint reg_src);
-void asm_thumb_mov_local_reg(asm_thumb_t *as, int local_num_dest, uint rlo_src);
-void asm_thumb_mov_reg_local(asm_thumb_t *as, uint rlo_dest, int local_num);
-void asm_thumb_mov_reg_local_addr(asm_thumb_t *as, uint reg_dest, int local_num);
+void asm_thumb_subs_rlo_rlo_i3(asm_thumb_t *as, uint rlo_dest, uint rlo_src, int i3_src);
+void asm_thumb_cmp_rlo_i8(asm_thumb_t *as, uint rlo, int i8);
+void asm_thumb_bgt_n(asm_thumb_t *as, int label);
 
-void asm_thumb_add_reg_reg_reg(asm_thumb_t *as, uint rlo_dest, uint rlo_src_a, uint rlo_src_b);
-void asm_thumb_cmp_reg_reg(asm_thumb_t *as, uint rlo_a, uint rlo_b);
-void asm_thumb_ite_ge(asm_thumb_t *as);
+void asm_thumb_mov_reg_i32(asm_thumb_t *as, uint reg_dest, machine_uint_t i32_src); // convenience
+void asm_thumb_mov_reg_i32_optimised(asm_thumb_t *as, uint reg_dest, int i32_src); // convenience
+void asm_thumb_mov_local_reg(asm_thumb_t *as, int local_num_dest, uint rlo_src); // convenience
+void asm_thumb_mov_reg_local(asm_thumb_t *as, uint rlo_dest, int local_num); // convenience
+void asm_thumb_mov_reg_local_addr(asm_thumb_t *as, uint reg_dest, int local_num); // convenience
+
+void asm_thumb_add_reg_reg_reg(asm_thumb_t *as, uint rlo_dest, uint rlo_src_a, uint rlo_src_b); // convenience ?
+void asm_thumb_cmp_reg_reg(asm_thumb_t *as, uint rlo_a, uint rlo_b); // convenience ?
+void asm_thumb_ite_ge(asm_thumb_t *as); // convenience ?
+
+void asm_thumb_b_label(asm_thumb_t *as, int label); // convenience ?
+void asm_thumb_cmp_reg_bz_label(asm_thumb_t *as, uint rlo, int label); // convenience ?
+void asm_thumb_bl_ind(asm_thumb_t *as, void *fun_ptr, uint fun_id, uint reg_temp); // convenience ?
 
-void asm_thumb_b_label(asm_thumb_t *as, int label);
-void asm_thumb_cmp_reg_bz_label(asm_thumb_t *as, uint rlo, int label);
-void asm_thumb_bl_ind(asm_thumb_t *as, void *fun_ptr, uint fun_id, uint reg_temp);
diff --git a/py/compile.c b/py/compile.c
index 89a8d2bec53bd4c5c55876c412adcda0367003c5..6a3a6694644b40836c8fae206e34d37072ed2bba 100644
--- a/py/compile.c
+++ b/py/compile.c
@@ -26,6 +26,7 @@ typedef enum {
 } pn_kind_t;
 
 #define EMIT(fun, arg...) (comp->emit_method_table->fun(comp->emit, ##arg))
+#define EMIT_INLINE_ASM(fun, arg...) (comp->emit_inline_asm_method_table->fun(comp->emit_inline_asm, ##arg))
 
 #define EMIT_OPT_NONE           (0)
 #define EMIT_OPT_BYTE_CODE      (1)
@@ -47,7 +48,6 @@ typedef struct _compiler_t {
     pass_kind_t pass;
 
     int next_label;
-    int max_num_labels;
 
     int break_label;
     int continue_label;
@@ -66,6 +66,9 @@ typedef struct _compiler_t {
 
     emit_t *emit;                                   // current emitter
     const emit_method_table_t *emit_method_table;   // current emit method table
+
+    emit_inline_asm_t *emit_inline_asm;                                   // current emitter for inline asm
+    const emit_inline_asm_method_table_t *emit_inline_asm_method_table;   // current emit method table for inline asm
 } compiler_t;
 
 py_parse_node_t fold_constants(py_parse_node_t pn) {
@@ -2389,7 +2392,7 @@ void compile_scope(compiler_t *comp, scope_t *scope, pass_kind_t pass) {
             apply_to_single_or_list(comp, pns->nodes[1], PN_typedargslist, compile_scope_func_param);
         }
 
-        assert(pns->nodes[2] == 0); // 2 is something...
+        assert(PY_PARSE_NODE_IS_NULL(pns->nodes[2])); // 2 is something...
 
         compile_node(comp, pns->nodes[3]); // 3 is function body
         // emit return if it wasn't the last opcode
@@ -2492,9 +2495,77 @@ void compile_scope(compiler_t *comp, scope_t *scope, pass_kind_t pass) {
 
     EMIT(end_pass);
 
-    // update maximim number of labels needed
-    if (comp->next_label > comp->max_num_labels) {
-        comp->max_num_labels = comp->next_label;
+}
+
+void compile_scope_inline_asm(compiler_t *comp, scope_t *scope, pass_kind_t pass) { // run one compiler pass over an inline-assembler scope
+    comp->pass = pass;
+    comp->scope_cur = scope;
+    comp->next_label = 1; // label numbers restart at 1 for every scope and pass
+    // only a function scope can hold inline assembler; anything else is reported and skipped
+    if (scope->kind != SCOPE_FUNCTION) {
+        printf("Error: inline assembler must be a function\n");
+        return;
+    }
+
+    // get the function definition parse node
+    assert(PY_PARSE_NODE_IS_STRUCT(scope->pn));
+    py_parse_node_struct_t *pns = (py_parse_node_struct_t*)scope->pn;
+    assert(PY_PARSE_NODE_STRUCT_KIND(pns) == PN_funcdef);
+
+    //qstr f_id = PY_PARSE_NODE_LEAF_ARG(pns->nodes[0]); // name
+    // nodes[1] (arguments) and nodes[2] (type) must both be empty, so the function has zero parameters
+    scope->num_params = 0;
+    assert(PY_PARSE_NODE_IS_NULL(pns->nodes[1])); // arguments
+    assert(PY_PARSE_NODE_IS_NULL(pns->nodes[2])); // type
+    // list_get presumably returns the number of statements in the body and points nodes at them (that is how num and nodes are used below)
+    py_parse_node_t pn_body = pns->nodes[3]; // body
+    py_parse_node_t *nodes;
+    int num = list_get(&pn_body, PN_suite_block_stmts, &nodes);
+    // the inline asm emitter is only driven after pass 1 (py_compile leaves it NULL during PASS_1)
+    if (comp->pass > PASS_1) {
+        EMIT_INLINE_ASM(start_pass, comp->pass, comp->scope_cur);
+    }
+
+    if (comp->pass == PASS_3) {
+        //printf("----\n");
+        scope_print_info(scope);
+    }
+    // every statement must have the shape name(arg, ...); anything else trips one of the asserts
+    for (int i = 0; i < num; i++) {
+        assert(PY_PARSE_NODE_IS_STRUCT(nodes[i]));
+        py_parse_node_struct_t *pns2 = (py_parse_node_struct_t*)nodes[i];
+        assert(PY_PARSE_NODE_STRUCT_KIND(pns2) == PN_expr_stmt);
+        assert(PY_PARSE_NODE_IS_STRUCT(pns2->nodes[0]));
+        assert(PY_PARSE_NODE_IS_NULL(pns2->nodes[1]));
+        pns2 = (py_parse_node_struct_t*)pns2->nodes[0];
+        assert(PY_PARSE_NODE_STRUCT_KIND(pns2) == PN_power);
+        assert(PY_PARSE_NODE_IS_ID(pns2->nodes[0]));
+        assert(PY_PARSE_NODE_IS_STRUCT_KIND(pns2->nodes[1], PN_trailer_paren));
+        assert(PY_PARSE_NODE_IS_NULL(pns2->nodes[2]));
+        qstr op = PY_PARSE_NODE_LEAF_ARG(pns2->nodes[0]); // the called name is the instruction mnemonic (or the pseudo-op label)
+        pns2 = (py_parse_node_struct_t*)pns2->nodes[1]; // PN_trailer_paren
+        py_parse_node_t *pn_arg;
+        int n_args = list_get(&pns2->nodes[0], PN_arglist, &pn_arg);
+
+        // emit instructions; label is handled here, every other name goes to the emitter's op handler
+        if (strcmp(qstr_str(op), "label") == 0) {
+            if (!(n_args == 1 && PY_PARSE_NODE_IS_ID(pn_arg[0]))) {
+                printf("SyntaxError: inline assembler 'label' requires 1 argument\n");
+                return; // NOTE(review): returns without calling end_pass even when start_pass was called above
+            }
+            int lab = comp_next_label(comp); // called in every pass (also PASS_1); py_compile reads comp->next_label after PASS_1 to size the label tables
+            if (pass > PASS_1) {
+                EMIT_INLINE_ASM(label, lab, PY_PARSE_NODE_LEAF_ARG(pn_arg[0]));
+            }
+        } else {
+            if (pass > PASS_1) {
+                EMIT_INLINE_ASM(op, op, n_args, pn_arg);
+            }
+        }
+    }
+
+    if (comp->pass > PASS_1) {
+        EMIT_INLINE_ASM(end_pass);
     }
 }
 
@@ -2557,55 +2628,81 @@ void py_compile(py_parse_node_t pn) {
     comp->qstr_native = qstr_from_str_static("native");
     comp->qstr_asm_thumb = qstr_from_str_static("asm_thumb");
 
-    comp->max_num_labels = 0;
     comp->break_label = 0;
     comp->continue_label = 0;
     comp->except_nest_level = 0;
     comp->scope_head = NULL;
     comp->scope_cur = NULL;
 
-    comp->emit = emit_pass1_new(comp->qstr___class__);
-    comp->emit_method_table = &emit_pass1_method_table;
-
+    // optimise constants
     pn = fold_constants(pn);
+
+    // set the outer scope
     scope_new_and_link(comp, SCOPE_MODULE, pn, EMIT_OPT_NONE);
 
+    // compile pass 1
+    comp->emit = emit_pass1_new(comp->qstr___class__);
+    comp->emit_method_table = &emit_pass1_method_table;
+    comp->emit_inline_asm = NULL;
+    comp->emit_inline_asm_method_table = NULL;
+    uint max_num_labels = 0;
     for (scope_t *s = comp->scope_head; s != NULL; s = s->next) {
-        compile_scope(comp, s, PASS_1);
+        if (s->emit_options == EMIT_OPT_ASM_THUMB) {
+            compile_scope_inline_asm(comp, s, PASS_1);
+        } else {
+            compile_scope(comp, s, PASS_1);
+        }
+
+        // update maximum number of labels needed
+        if (comp->next_label > max_num_labels) {
+            max_num_labels = comp->next_label;
+        }
     }
 
+    // compute some things related to scope and identifiers
     for (scope_t *s = comp->scope_head; s != NULL; s = s->next) {
         compile_scope_compute_things(comp, s);
     }
 
+    // finish with pass 1
     emit_pass1_free(comp->emit);
 
+    // compile pass 2 and 3
     emit_t *emit_bc = NULL;
     emit_t *emit_x64 = NULL;
-
+    emit_inline_asm_t *emit_inline_thumb = NULL;
     for (scope_t *s = comp->scope_head; s != NULL; s = s->next) {
-        switch (s->emit_options) {
-            case EMIT_OPT_NATIVE_PYTHON:
-                if (emit_x64 == NULL) {
-                    emit_x64 = emit_x64_new(comp->max_num_labels);
-                }
-                comp->emit = emit_x64;
-                comp->emit_method_table = &emit_x64_method_table;
-                break;
-
-            //case EMIT_OPT_ASM_THUMB:
-                //if (em
+        if (s->emit_options == EMIT_OPT_ASM_THUMB) {
+            if (emit_inline_thumb == NULL) {
+                emit_inline_thumb = emit_inline_thumb_new(max_num_labels);
+            }
+            comp->emit = NULL;
+            comp->emit_method_table = NULL;
+            comp->emit_inline_asm = emit_inline_thumb;
+            comp->emit_inline_asm_method_table = &emit_inline_thumb_method_table;
+            compile_scope_inline_asm(comp, s, PASS_2);
+            compile_scope_inline_asm(comp, s, PASS_3);
+        } else {
+            switch (s->emit_options) {
+                case EMIT_OPT_NATIVE_PYTHON:
+                    if (emit_x64 == NULL) {
+                        emit_x64 = emit_x64_new(max_num_labels);
+                    }
+                    comp->emit = emit_x64;
+                    comp->emit_method_table = &emit_x64_method_table;
+                    break;
 
-            default:
-                if (emit_bc == NULL) {
-                    emit_bc = emit_bc_new(comp->max_num_labels);
-                }
-                comp->emit = emit_bc;
-                comp->emit_method_table = &emit_bc_method_table;
-                break;
+                default:
+                    if (emit_bc == NULL) {
+                        emit_bc = emit_bc_new(max_num_labels);
+                    }
+                    comp->emit = emit_bc;
+                    comp->emit_method_table = &emit_bc_method_table;
+                    break;
+            }
+            compile_scope(comp, s, PASS_2);
+            compile_scope(comp, s, PASS_3);
         }
-        compile_scope(comp, s, PASS_2);
-        compile_scope(comp, s, PASS_3);
     }
 
     m_free(comp);
diff --git a/py/emit.h b/py/emit.h
index 288141303bcbc4336e505b43b0509474d5d0d0b0..106a0a242a6bfdcea760ee3d8f159d652bad15e3 100644
--- a/py/emit.h
+++ b/py/emit.h
@@ -128,3 +128,16 @@ emit_t *emit_cpython_new(uint max_num_labels);
 emit_t *emit_bc_new(uint max_num_labels);
 emit_t *emit_x64_new(uint max_num_labels);
 emit_t *emit_thumb_new(uint max_num_labels);
+
+typedef struct _emit_inline_asm_t emit_inline_asm_t;
+
+typedef struct _emit_inline_asm_method_table_t {
+    void (*start_pass)(emit_inline_asm_t *emit, pass_kind_t pass, scope_t *scope); // begin a pass over one scope
+    void (*end_pass)(emit_inline_asm_t *emit); // finish the pass begun by start_pass
+    void (*label)(emit_inline_asm_t *emit, int label_num, qstr label_id); // define the label named label_id, numbered label_num
+    void (*op)(emit_inline_asm_t *emit, qstr op, int n_args, py_parse_node_t *args); // emit instruction op given its n_args argument parse nodes
+} emit_inline_asm_method_table_t;
+
+extern const emit_inline_asm_method_table_t emit_inline_thumb_method_table;
+
+emit_inline_asm_t *emit_inline_thumb_new(uint max_num_labels);
diff --git a/py/emitbc.c b/py/emitbc.c
index 5c566ccb8c7df783dbd2b77a49818fc47e668355..81b059d1cb0f753f95e116e8d66ed36dc9cf45bd 100644
--- a/py/emitbc.c
+++ b/py/emitbc.c
@@ -17,7 +17,6 @@
 
 struct _emit_t {
     pass_kind_t pass;
-    int next_label;
     int stack_size;
     bool last_emit_was_return_value;
 
@@ -55,7 +54,6 @@ static void emit_bc_set_native_types(emit_t *emit, bool do_native_types) {
 
 static void emit_bc_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scope) {
     emit->pass = pass;
-    emit->next_label = 1;
     emit->stack_size = 0;
     emit->last_emit_was_return_value = false;
     emit->scope = scope;
diff --git a/py/emitcpy.c b/py/emitcpy.c
index db5c1186736bff3a86134e82972cdf3e8d9bf475..203fce70a15070f306cce9ecb9887bad5182fffc 100644
--- a/py/emitcpy.c
+++ b/py/emitcpy.c
@@ -18,7 +18,6 @@
 
 struct _emit_t {
     int pass;
-    int next_label;
     int byte_code_offset;
     int stack_size;
     bool last_emit_was_return_value;
@@ -44,7 +43,6 @@ static void emit_cpy_set_native_types(emit_t *emit, bool do_native_types) {
 
 static void emit_cpy_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scope) {
     emit->pass = pass;
-    emit->next_label = 1;
     emit->byte_code_offset = 0;
     emit->stack_size = 0;
     emit->last_emit_was_return_value = false;
diff --git a/py/emitinlinethumb.c b/py/emitinlinethumb.c
new file mode 100644
index 0000000000000000000000000000000000000000..9950f4a330b9e97d8525e279f9a07c9e8c98440b
--- /dev/null
+++ b/py/emitinlinethumb.c
@@ -0,0 +1,173 @@
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+#include "misc.h"
+#include "lexer.h"
+#include "machine.h"
+#include "parse.h"
+#include "scope.h"
+#include "runtime.h"
+#include "emit.h"
+#include "asmthumb.h"
+
+#ifdef EMIT_ENABLE_THUMB
+
+struct _emit_inline_asm_t {
+    int pass; // pass currently being emitted, assigned in start_pass
+    scope_t *scope; // scope being compiled, assigned in start_pass
+    int max_num_labels; // number of entries in label_lookup
+    qstr *label_lookup; // label name recorded for each label number (0 where nothing has been recorded)
+    asm_thumb_t *as; // underlying Thumb assembler
+};
+
+emit_inline_asm_t *emit_inline_thumb_new(uint max_num_labels) { // create an emitter that can track max_num_labels labels
+    emit_inline_asm_t *emit = m_new(emit_inline_asm_t, 1); // pass and scope are first assigned in start_pass
+    emit->max_num_labels = max_num_labels;
+    emit->label_lookup = m_new(qstr, max_num_labels);
+    memset(emit->label_lookup, 0, emit->max_num_labels * sizeof(qstr)); // start with no label names recorded
+    emit->as = asm_thumb_new(max_num_labels);
+    return emit;
+}
+
+static void emit_inline_thumb_start_pass(emit_inline_asm_t *emit, pass_kind_t pass, scope_t *scope) { // begin a pass: remember pass/scope, restart the assembler, emit entry code
+    emit->pass = pass;
+    emit->scope = scope;
+    asm_thumb_start_pass(emit->as, pass); // pass_kind_t is handed over unchanged; NOTE(review): confirm its values line up with ASM_THUMB_PASS_*
+    asm_thumb_entry(emit->as, 0); // function entry code with 0 locals
+}
+
+static void emit_inline_thumb_end_pass(emit_inline_asm_t *emit) { // finish a pass: emit exit code and, on the last pass, publish the result
+    asm_thumb_exit(emit->as); // function exit code
+    asm_thumb_end_pass(emit->as);
+    // only PASS_3 registers the generated machine code with the runtime, under the scope's unique code id
+    if (emit->pass == PASS_3) {
+        py_fun_t f = asm_thumb_get_code(emit->as);
+        rt_assign_inline_asm_code(emit->scope->unique_code_id, f, asm_thumb_get_code_size(emit->as), emit->scope->num_params);
+    }
+}
+
+static void emit_inline_thumb_label(emit_inline_asm_t *emit, int label_num, qstr label_id) { // define label label_id as label number label_num
+    assert(label_num < emit->max_num_labels);
+    emit->label_lookup[label_num] = label_id; // remember the name so get_arg_label can map it back to label_num
+    asm_thumb_label_assign(emit->as, label_num);
+}
+
+// Check an instruction's argument count: true if n_args == wanted_n_args, else print a SyntaxError naming op and return false.
+static bool check_n_arg(qstr op, int n_args, int wanted_n_args) {
+    if (wanted_n_args != n_args) {
+        printf("SyntaxError: '%s' expects %d arguments\n", qstr_str(op), wanted_n_args);
+        return false;
+    }
+    return true;
+}
+
+// Read argument wanted_arg_num of op as a low register name r0..r7 and return its number; prints an error and returns 0 otherwise.
+static uint get_arg_rlo(qstr op, py_parse_node_t *pn_arg, int wanted_arg_num) {
+    py_parse_node_t pn = pn_arg[wanted_arg_num];
+    if (PY_PARSE_NODE_IS_ID(pn)) {
+        const char *name = qstr_str(PY_PARSE_NODE_LEAF_ARG(pn));
+        if (strlen(name) == 2 && name[0] == 'r' && '0' <= name[1] && name[1] <= '7') {
+            return name[1] - '0';
+        }
+    }
+    // not an identifier, or an identifier that is not one of r0..r7
+    printf("SyntaxError: '%s' expects a register in position %d\n", qstr_str(op), wanted_arg_num);
+    return 0;
+}
+
+static int get_arg_i(qstr op, py_parse_node_t *pn_arg, int wanted_arg_num, int fit_mask) { // read an integer argument; prints an error and returns 0 on failure
+    if (!PY_PARSE_NODE_IS_SMALL_INT(pn_arg[wanted_arg_num])) {
+        printf("SyntaxError: '%s' expects an integer in position %d\n", qstr_str(op), wanted_arg_num);
+        return 0;
+    }
+    int i = PY_PARSE_NODE_LEAF_ARG(pn_arg[wanted_arg_num]);
+    if ((i & (~fit_mask)) != 0) { // any bit outside fit_mask is an error; with masks such as 0xff this also rejects negative values
+        printf("SyntaxError: '%s' integer 0x%x does not fit in mask 0x%x\n", qstr_str(op), i, fit_mask);
+        return 0;
+    }
+    return i;
+}
+
+static int get_arg_label(emit_inline_asm_t *emit, qstr op, py_parse_node_t *pn_arg, int wanted_arg_num) { // map a label-name argument to its label number; prints an error and returns 0 on failure
+    if (!PY_PARSE_NODE_IS_ID(pn_arg[wanted_arg_num])) {
+        printf("SyntaxError: '%s' expects a label in position %d\n", qstr_str(op), wanted_arg_num);
+        return 0;
+    }
+    qstr label_qstr = PY_PARSE_NODE_LEAF_ARG(pn_arg[wanted_arg_num]);
+    for (int i = 0; i < emit->max_num_labels; i++) { // linear search of the names recorded by emit_inline_thumb_label
+        if (emit->label_lookup[i] == label_qstr) { // NOTE(review): label_lookup is only cleared in emit_inline_thumb_new, so a name recorded for an earlier function can still match
+            return i;
+        }
+    }
+    printf("SyntaxError: label '%s' not defined\n", qstr_str(label_qstr)); // NOTE(review): appears to be reached for a forward reference too, on the first emitter pass before the label statement is seen
+    return 0;
+}
+
+static void emit_inline_thumb_op(emit_inline_asm_t *emit, qstr op, int n_args, py_parse_node_t *pn_arg) { // assemble one instruction named op from its argument parse nodes
+    // TODO perhaps make two tables:
+    // two_args =
+    // "movs", RLO, I8, asm_thumb_movs_rlo_i8
+    // "movw", REG, I16, asm_thumb_movw_reg_i16
+    // three_args =
+    // "subs", RLO, RLO, I3, asm_thumb_subs_rlo_rlo_i3
+    // NOTE(review): the get_arg_* helpers return 0 after printing an error, and the instruction is then still emitted with that 0
+    // 1 arg
+    if (strcmp(qstr_str(op), "bgt") == 0) {
+        if (!check_n_arg(op, n_args, 1)) {
+            return;
+        }
+        int label_num = get_arg_label(emit, op, pn_arg, 0);
+        asm_thumb_bgt_n(emit->as, label_num);
+
+    // 2 args
+    } else if (strcmp(qstr_str(op), "movs") == 0) {
+        if (!check_n_arg(op, n_args, 2)) {
+            return;
+        }
+        uint rlo_dest = get_arg_rlo(op, pn_arg, 0);
+        int i_src = get_arg_i(op, pn_arg, 1, 0xff); // 8-bit immediate
+        asm_thumb_movs_rlo_i8(emit->as, rlo_dest, i_src);
+    } else if (strcmp(qstr_str(op), "movw") == 0) {
+        if (!check_n_arg(op, n_args, 2)) {
+            return;
+        }
+        uint rlo_dest = get_arg_rlo(op, pn_arg, 0); // TODO can be reg lo or hi
+        int i_src = get_arg_i(op, pn_arg, 1, 0xffff); // 16-bit immediate
+        asm_thumb_movw_reg_i16(emit->as, rlo_dest, i_src);
+    } else if (strcmp(qstr_str(op), "cmp") == 0) {
+        if (!check_n_arg(op, n_args, 2)) {
+            return;
+        }
+        uint rlo = get_arg_rlo(op, pn_arg, 0);
+        int i8 = get_arg_i(op, pn_arg, 1, 0xff); // only the register-versus-immediate form of cmp is handled
+        asm_thumb_cmp_rlo_i8(emit->as, rlo, i8);
+
+    // 3 args
+    } else if (strcmp(qstr_str(op), "subs") == 0) {
+        if (!check_n_arg(op, n_args, 3)) {
+            return;
+        }
+        uint rlo_dest = get_arg_rlo(op, pn_arg, 0);
+        uint rlo_src = get_arg_rlo(op, pn_arg, 1);
+        int i3_src = get_arg_i(op, pn_arg, 2, 0x7); // 3-bit immediate
+        asm_thumb_subs_rlo_rlo_i3(emit->as, rlo_dest, rlo_src, i3_src);
+
+    // unknown op
+    } else {
+        printf("SyntaxError: unsupported ARM Thumb instruction '%s'\n", qstr_str(op));
+        return;
+    }
+}
+
+const emit_inline_asm_method_table_t emit_inline_thumb_method_table = { // Thumb handlers for the inline assembler method table
+    .start_pass = emit_inline_thumb_start_pass,
+    .end_pass = emit_inline_thumb_end_pass,
+    .label = emit_inline_thumb_label,
+    .op = emit_inline_thumb_op,
+};
+
+#endif // EMIT_ENABLE_THUMB
diff --git a/py/emitthumb.c b/py/emitthumb.c
index 55336bca10e550b155f5f57a142ce110dc53bf9f..7bcdf9e43de899232a8b2045e8a342b6d3ab58e5 100644
--- a/py/emitthumb.c
+++ b/py/emitthumb.c
@@ -652,8 +652,8 @@ static void emit_thumb_compare_op(emit_t *emit, rt_compare_op_t op) {
         emit_pre_pop_reg_reg(emit, REG_ARG_2, REG_ARG_1);
         asm_thumb_cmp_reg_reg(emit->as, REG_ARG_1, REG_ARG_2);
         asm_thumb_ite_ge(emit->as);
-        asm_thumb_mov_reg_i8(emit->as, REG_RET, 0); // if r0 >= r1
-        asm_thumb_mov_reg_i8(emit->as, REG_RET, 1); // if r0 < r1
+        asm_thumb_movs_rlo_i8(emit->as, REG_RET, 0); // if r0 >= r1
+        asm_thumb_movs_rlo_i8(emit->as, REG_RET, 1); // if r0 < r1
         emit_post_push_reg(emit, REG_RET);
     } else {
         emit_pre_pop_reg_reg(emit, REG_ARG_3, REG_ARG_2);
diff --git a/py/parse.c b/py/parse.c
index 94a5a5d9cab6849668ea0f14a7f1f83756e36b68..74bfdf48d6cab40a4b49e937e25fc5fc7129b9e6 100644
--- a/py/parse.c
+++ b/py/parse.c
@@ -228,7 +228,7 @@ static void push_result_token(parser_t *parser, const py_lexer_t *lex) {
         }
         if (dec) {
             pn = py_parse_node_new_leaf(PY_PARSE_NODE_DECIMAL, qstr_from_strn_copy(str, len));
-        } else if (small_int && -0x10000 <= int_val && int_val <= 0xffff) {
+        } else if (small_int && -0x10000000 <= int_val && int_val <= 0xfffffff) { // XXX check this range formula!
             pn = py_parse_node_new_leaf(PY_PARSE_NODE_SMALL_INT, int_val);
         } else {
             pn = py_parse_node_new_leaf(PY_PARSE_NODE_INTEGER, qstr_from_strn_copy(str, len));
diff --git a/py/runtime.c b/py/runtime.c
index 26ecdc14fe63d96a59949a2a78f1f533d6322916..4ac680562cc01d1b773da8924757659ba4413862 100644
--- a/py/runtime.c
+++ b/py/runtime.c
@@ -40,6 +40,7 @@ typedef enum {
     O_FUN_2,
     O_FUN_N,
     O_FUN_BC,
+    O_FUN_ASM,
     O_BOUND_METH,
     O_LIST,
     O_SET,
@@ -73,14 +74,18 @@ typedef struct _py_obj_base_t {
         float_t flt;
 #endif
         struct { // for O_FUN_[012N]
-            void *fun;
             int n_args;
+            void *fun;
         } u_fun;
         struct { // for O_FUN_BC
+            int n_args;
             byte *code;
             uint len;
-            int n_args;
         } u_fun_bc;
+        struct { // for O_FUN_ASM
+            int n_args;
+            void *fun;
+        } u_fun_asm;
         struct { // for O_BOUND_METH
             py_obj_t meth;
             py_obj_t self;
@@ -275,21 +280,26 @@ static qstr q_len;
 static qstr q___build_class__;
 
 typedef enum {
-    PY_CODE_NATIVE,
+    PY_CODE_NONE,
     PY_CODE_BYTE,
+    PY_CODE_NATIVE,
+    PY_CODE_INLINE_ASM,
 } py_code_kind_t;
 
 typedef struct _py_code_t {
     py_code_kind_t kind;
     int n_args;
     union {
-        struct {
-            py_fun_t fun;
-        } u_native;
         struct {
             byte *code;
             uint len;
         } u_byte;
+        struct {
+            py_fun_t fun;
+        } u_native;
+        struct {
+            py_fun_t fun;
+        } u_inline_asm;
     };
 } py_code_t;
 
@@ -368,10 +378,30 @@ int rt_get_new_unique_code_id() {
     return next_unique_code_id++;
 }
 
-void rt_assign_native_code(int unique_code_id, py_fun_t fun, uint len, int n_args) {
+static void alloc_unique_codes() { // allocate the code table on first call only, sized by the current next_unique_code_id; it is not resized here
     if (unique_codes == NULL) {
         unique_codes = m_new(py_code_t, next_unique_code_id);
+        for (int i = 0; i < next_unique_code_id; i++) { // mark every slot as empty so an unassigned id can be told apart
+            unique_codes[i].kind = PY_CODE_NONE;
+        }
     }
+}
+
+void rt_assign_byte_code(int unique_code_id, byte *code, uint len, int n_args) { // store a bytecode pointer, its length and arg count in the slot for unique_code_id
+    alloc_unique_codes();
+
+    assert(unique_code_id < next_unique_code_id); // only the upper bound is checked; the native and inline-asm variants also require id >= 1
+    unique_codes[unique_code_id].kind = PY_CODE_BYTE;
+    unique_codes[unique_code_id].n_args = n_args;
+    unique_codes[unique_code_id].u_byte.code = code;
+    unique_codes[unique_code_id].u_byte.len = len;
+
+    DEBUG_printf("assign byte code: id=%d code=%p len=%u n_args=%d\n", unique_code_id, code, len, n_args);
+}
+
+void rt_assign_native_code(int unique_code_id, py_fun_t fun, uint len, int n_args) { // mark the slot for unique_code_id as PY_CODE_NATIVE
+    alloc_unique_codes();
+
     assert(1 <= unique_code_id && unique_code_id < next_unique_code_id);
     unique_codes[unique_code_id].kind = PY_CODE_NATIVE;
     unique_codes[unique_code_id].n_args = n_args;
@@ -392,17 +422,27 @@ void rt_assign_native_code(int unique_code_id, py_fun_t fun, uint len, int n_arg
     }
 }
 
-void rt_assign_byte_code(int unique_code_id, byte *code, uint len, int n_args) {
-    if (unique_codes == NULL) {
-        unique_codes = m_new(py_code_t, next_unique_code_id);
-    }
-    assert(unique_code_id < next_unique_code_id);
-    unique_codes[unique_code_id].kind = PY_CODE_BYTE;
+void rt_assign_inline_asm_code(int unique_code_id, py_fun_t fun, uint len, int n_args) { // store an inline-assembler function pointer and arg count in the slot for unique_code_id, then dump its bytes
+    alloc_unique_codes();
+
+    assert(1 <= unique_code_id && unique_code_id < next_unique_code_id); // id 0 is rejected here, same check as rt_assign_native_code
+    unique_codes[unique_code_id].kind = PY_CODE_INLINE_ASM;
     unique_codes[unique_code_id].n_args = n_args;
-    unique_codes[unique_code_id].u_byte.code = code;
-    unique_codes[unique_code_id].u_byte.len = len;
+    unique_codes[unique_code_id].u_inline_asm.fun = fun; // the pointer is stored as passed in; only the local fun_data below has bit 0 cleared
 
-    DEBUG_printf("assign byte code: id=%d code=%p len=%u n_args=%d\n", unique_code_id, code, len, n_args);
+    DEBUG_printf("assign inline asm code: id=%d fun=%p len=%u n_args=%d\n", unique_code_id, fun, len, n_args);
+    byte *fun_data = (byte*)(((machine_uint_t)fun) & (~1)); // need to clear lower bit in case it's thumb code
+    for (int i = 0; i < 128 && i < len; i++) { // hex dump of at most the first 128 bytes
+        if (i > 0 && i % 16 == 0) { // start a new output row after every 16 bytes
+            DEBUG_printf("\n");
+        }
+        DEBUG_printf(" %02x", fun_data[i]);
+    }
+    DEBUG_printf("\n");
+
+    if (fp_native != NULL) { // unlike the dump above, this writes all len bytes
+        fwrite(fun_data, len, 1, fp_native); // return value is not checked
+    }
 }
 
 const char *py_obj_get_type_str(py_obj_t o_in) {
@@ -649,6 +689,12 @@ py_obj_t rt_make_function_from_id(int unique_code_id) {
     py_code_t *c = &unique_codes[unique_code_id];
     py_obj_base_t *o = m_new(py_obj_base_t, 1);
     switch (c->kind) {
+        case PY_CODE_BYTE: // bytecode entry: copy arg count, code pointer and length into an O_FUN_BC object
+            o->kind = O_FUN_BC;
+            o->u_fun_bc.n_args = c->n_args;
+            o->u_fun_bc.code = c->u_byte.code;
+            o->u_fun_bc.len = c->u_byte.len;
+            break;
         case PY_CODE_NATIVE:
             switch (c->n_args) {
                 case 0: o->kind = O_FUN_0; break;
@@ -658,11 +704,10 @@ py_obj_t rt_make_function_from_id(int unique_code_id) {
             }
             o->u_fun.fun = c->u_native.fun;
             break;
-        case PY_CODE_BYTE:
-            o->kind = O_FUN_BC;
-            o->u_fun_bc.code = c->u_byte.code;
-            o->u_fun_bc.len = c->u_byte.len;
-            o->u_fun_bc.n_args = c->n_args;
+        case PY_CODE_INLINE_ASM: // inline-assembler entry: copy arg count and function pointer into an O_FUN_ASM object
+            o->kind = O_FUN_ASM;
+            o->u_fun_asm.n_args = c->n_args;
+            o->u_fun_asm.fun = c->u_inline_asm.fun;
             break;
         default:
             assert(0);
@@ -695,8 +740,8 @@ py_obj_t rt_make_function(int n_args, py_fun_t code) {
     // assumes code is a pointer to a py_fun_t (i think this is safe...)
     py_obj_base_t *o = m_new(py_obj_base_t, 1);
     o->kind = O_FUN_N;
-    o->u_fun.fun = code;
     o->u_fun.n_args = n_args;
+    o->u_fun.fun = code; // assignment order now follows the member order of u_fun (n_args, then fun)
     return o;
 }
 
diff --git a/py/runtime.h b/py/runtime.h
index 4c842b235ebe766ab589dc126e5a692213b904f3..72d589231ce84bcc3ceca49e3281b80225a757d2 100644
--- a/py/runtime.h
+++ b/py/runtime.h
@@ -87,8 +87,9 @@ extern py_obj_t py_const_true;
 void rt_init();
 void rt_deinit();
 int rt_get_new_unique_code_id();
-void rt_assign_native_code(int unique_code_id, py_fun_t f, uint len, int n_args);
 void rt_assign_byte_code(int unique_code_id, byte *code, uint len, int n_args);
+void rt_assign_native_code(int unique_code_id, py_fun_t f, uint len, int n_args); // register native code for a code id obtained from rt_get_new_unique_code_id
+void rt_assign_inline_asm_code(int unique_code_id, py_fun_t f, uint len, int n_args); // register inline-assembler code for a code id; len is the code size in bytes
 py_fun_t rt_get_code(qstr id);
 void py_obj_print(py_obj_t o);
 int rt_is_true(py_obj_t arg);