diff --git a/examples/asmled.py b/examples/asmled.py
index e0d6c73ce48784aaf8aeba61ae95c0b110597678..917d9ba03cd1ddcda64722888afa7003e8be0f92 100644
--- a/examples/asmled.py
+++ b/examples/asmled.py
@@ -1,4 +1,5 @@
 # flash LED #1 using inline assembler
+# this version is overly verbose and uses word stores
 @micropython.asm_thumb
 def flash_led(r0):
     movw(r1, (stm.GPIOA + stm.GPIO_BSRRL) & 0xffff)
@@ -13,69 +14,72 @@ def flash_led(r0):
     label(loop1)
 
     # turn LED on
-    str(r2, r1, 0)
+    str(r2, [r1, 0])
 
     # delay for a bit
     movw(r4, 5599900 & 0xffff)
     movt(r4, (5599900 >> 16) & 0xffff)
     label(delay_on)
-    subs(r4, r4, 1)
+    sub(r4, r4, 1)
     cmp(r4, 0)
     bgt(delay_on)
 
     # turn LED off
-    str(r3, r1, 0)
+    str(r3, [r1, 0])
 
     # delay for a bit
     movw(r4, 5599900 & 0xffff)
     movt(r4, (5599900 >> 16) & 0xffff)
     label(delay_off)
-    subs(r4, r4, 1)
+    sub(r4, r4, 1)
     cmp(r4, 0)
     bgt(delay_off)
 
     # loop r0 times
-    subs(r0, r0, 1)
+    sub(r0, r0, 1)
     label(loop_entry)
     cmp(r0, 0)
     bgt(loop1)
 
-# flash LED #1 using inline assembler
-# this version uses the convenience assembler operation 'movwt'
+# flash LED #2 using inline assembler
+# this version uses half-word stores, and the convenience assembler operation 'movwt'
 @micropython.asm_thumb
 def flash_led_v2(r0):
-    movwt(r1, stm.GPIOA + stm.GPIO_BSRRL)
-    movwt(r2, 1 << 13)
-    movwt(r3, 1 << (16 + 13))
+    # get the GPIOA address in r1
+    movwt(r1, stm.GPIOA)
+
+    # get the bit mask for PA14 (the pin LED #2 is on)
+    movw(r2, 1 << 14)
 
     b(loop_entry)
 
     label(loop1)
 
     # turn LED on
-    str(r2, r1, 0)
+    strh(r2, [r1, stm.GPIO_BSRRL])
 
     # delay for a bit
     movwt(r4, 5599900)
     label(delay_on)
-    subs(r4, r4, 1)
+    sub(r4, r4, 1)
     cmp(r4, 0)
     bgt(delay_on)
 
     # turn LED off
-    str(r3, r1, 0)
+    strh(r2, [r1, stm.GPIO_BSRRH])
 
     # delay for a bit
     movwt(r4, 5599900)
     label(delay_off)
-    subs(r4, r4, 1)
+    sub(r4, r4, 1)
     cmp(r4, 0)
     bgt(delay_off)
 
     # loop r0 times
-    subs(r0, r0, 1)
+    sub(r0, r0, 1)
     label(loop_entry)
     cmp(r0, 0)
     bgt(loop1)
 
+flash_led(5)
 flash_led_v2(5)
diff --git a/examples/asmsum.py b/examples/asmsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..07e71c738492717c1b450295214f4828ef90df3b
--- /dev/null
+++ b/examples/asmsum.py
@@ -0,0 +1,57 @@
+@micropython.asm_thumb
+def asm_sum_words(r0, r1):
+
+    # r0 = len
+    # r1 = ptr
+    # r2 = sum
+    # r3 = dummy
+    mov(r2, 0)
+
+    b(loop_entry)
+
+    label(loop1)
+    ldr(r3, [r1, 0])
+    add(r2, r2, r3)
+
+    add(r1, r1, 4)
+    sub(r0, r0, 1)
+
+    label(loop_entry)
+    cmp(r0, 0)
+    bgt(loop1)
+
+    mov(r0, r2)
+
+@micropython.asm_thumb
+def asm_sum_bytes(r0, r1):
+
+    # r0 = len
+    # r1 = ptr
+    # r2 = sum
+    # r3 = dummy
+    mov(r2, 0)
+
+    b(loop_entry)
+
+    label(loop1)
+    ldrb(r3, [r1, 0])
+    add(r2, r2, r3)
+
+    add(r1, r1, 1)
+    sub(r0, r0, 1)
+
+    label(loop_entry)
+    cmp(r0, 0)
+    bgt(loop1)
+
+    mov(r0, r2)
+
+import array
+
+b = array.array('l', (100, 200, 300, 400))
+n = asm_sum_words(len(b), b)
+print(b, n)
+
+b = array.array('b', (10, 20, 30, 40, 50, 60, 70, 80))
+n = asm_sum_bytes(len(b), b)
+print(b, n)
diff --git a/py/asmthumb.c b/py/asmthumb.c
index 1cd971c76bbaa84f84b82c19b67afcb1575fe8ec..6bf6d665843e26efbb2a71a965ac2c492c84fb91 100644
--- a/py/asmthumb.c
+++ b/py/asmthumb.c
@@ -230,33 +230,33 @@ STATIC int get_label_dest(asm_thumb_t *as, uint label) {
     return as->label_offsets[label];
 }
 
-#define OP_MOVS_RLO_I8(rlo_dest, i8_src) (0x2000 | ((rlo_dest) << 8) | (i8_src))
+#define OP_FORMAT_2(op, rlo_dest, rlo_src, src_b) ((op) | ((src_b) << 6) | ((rlo_src) << 3) | (rlo_dest))
 
-// the i8_src value will be zero extended into the r32 register!
-void asm_thumb_movs_rlo_i8(asm_thumb_t *as, uint rlo_dest, int i8_src) {
+void asm_thumb_format_2(asm_thumb_t *as, uint op, uint rlo_dest, uint rlo_src, int src_b) {
     assert(rlo_dest < REG_R8);
-    // movs rlo_dest, #i8_src
-    asm_thumb_write_op16(as, OP_MOVS_RLO_I8(rlo_dest, i8_src));
+    assert(rlo_src < REG_R8);
+    asm_thumb_write_op16(as, OP_FORMAT_2(op, rlo_dest, rlo_src, src_b));
 }
 
-#define OP_MOVW (0xf240)
-#define OP_MOVT (0xf2c0)
+#define OP_FORMAT_3(op, rlo, i8) ((op) | ((rlo) << 8) | (i8))
 
-// if loading lo half with movw, the i16 value will be zero extended into the r32 register!
-STATIC void asm_thumb_mov_reg_i16(asm_thumb_t *as, uint mov_op, uint reg_dest, int i16_src) {
-    assert(reg_dest < REG_R15);
-    // mov[wt] reg_dest, #i16_src
-    asm_thumb_write_op32(as, mov_op | ((i16_src >> 1) & 0x0400) | ((i16_src >> 12) & 0xf), ((i16_src << 4) & 0x7000) | (reg_dest << 8) | (i16_src & 0xff));
+void asm_thumb_format_3(asm_thumb_t *as, uint op, uint rlo, int i8) {
+    assert(rlo < REG_R8);
+    asm_thumb_write_op16(as, OP_FORMAT_3(op, rlo, i8));
 }
 
-// the i16_src value will be zero extended into the r32 register!
-void asm_thumb_movw_reg_i16(asm_thumb_t *as, uint reg_dest, int i16_src) {
-    asm_thumb_mov_reg_i16(as, OP_MOVW, reg_dest, i16_src);
+#define OP_FORMAT_4(op, rlo_dest, rlo_src) ((op) | ((rlo_src) << 3) | (rlo_dest))
+
+void asm_thumb_format_4(asm_thumb_t *as, uint op, uint rlo_dest, uint rlo_src) {
+    assert(rlo_dest < REG_R8);
+    assert(rlo_src < REG_R8);
+    asm_thumb_write_op16(as, OP_FORMAT_4(op, rlo_dest, rlo_src));
 }
 
-// the i16_src value will be zero extended into the r32 register!
-void asm_thumb_movt_reg_i16(asm_thumb_t *as, uint reg_dest, int i16_src) {
-    asm_thumb_mov_reg_i16(as, OP_MOVT, reg_dest, i16_src);
+#define OP_FORMAT_9_10(op, rlo_dest, rlo_base, offset) ((op) | (((offset) << 6) & 0x07c0) | ((rlo_base) << 3) | (rlo_dest))
+
+void asm_thumb_format_9_10(asm_thumb_t *as, uint op, uint rlo_dest, uint rlo_base, uint offset) {
+    asm_thumb_write_op16(as, OP_FORMAT_9_10(op, rlo_dest, rlo_base, offset));
 }
 
 void asm_thumb_mov_reg_reg(asm_thumb_t *as, uint reg_dest, uint reg_src) {
@@ -275,42 +275,24 @@ void asm_thumb_mov_reg_reg(asm_thumb_t *as, uint reg_dest, uint reg_src) {
     asm_thumb_write_op16(as, 0x4600 | op_lo);
 }
 
-#define OP_ADD_RLO_RLO_RLO(rlo_dest, rlo_src_a, rlo_src_b) (0x1800 | ((rlo_src_b) << 6) | ((rlo_src_a) << 3) | (rlo_dest))
-
-void asm_thumb_add_rlo_rlo_rlo(asm_thumb_t *as, uint rlo_dest, uint rlo_src_a, uint rlo_src_b) {
-    asm_thumb_write_op16(as, OP_ADD_RLO_RLO_RLO(rlo_dest, rlo_src_a, rlo_src_b));
-}
-
-#define OP_SUBS_RLO_RLO_I3(rlo_dest, rlo_src, i3_src) (0x1e00 | ((i3_src) << 6) | ((rlo_src) << 3) | (rlo_dest))
-
-void asm_thumb_subs_rlo_rlo_i3(asm_thumb_t *as, uint rlo_dest, uint rlo_src, int i3_src) {
-    assert(rlo_dest < REG_R8);
-    assert(rlo_src < REG_R8);
-    asm_thumb_write_op16(as, OP_SUBS_RLO_RLO_I3(rlo_dest, rlo_src, i3_src));
-}
-
-#define OP_CMP_REG_REG(rlo_a, rlo_b) (0x4280 | ((rlo_b) << 3) | (rlo_a))
-
-void asm_thumb_cmp_reg_reg(asm_thumb_t *as, uint rlo_a, uint rlo_b) {
-    asm_thumb_write_op16(as, OP_CMP_REG_REG(rlo_a, rlo_b));
-}
-
-#define OP_CMP_RLO_I8(rlo, i8) (0x2800 | ((rlo) << 8) | (i8))
+#define OP_MOVW (0xf240)
+#define OP_MOVT (0xf2c0)
 
-void asm_thumb_cmp_rlo_i8(asm_thumb_t *as, uint rlo, int i8) {
-    assert(rlo < REG_R8);
-    asm_thumb_write_op16(as, OP_CMP_RLO_I8(rlo, i8));
+// if loading lo half with movw, the i16 value will be zero extended into the r32 register!
+STATIC void asm_thumb_mov_reg_i16(asm_thumb_t *as, uint mov_op, uint reg_dest, int i16_src) {
+    assert(reg_dest < REG_R15);
+    // mov[wt] reg_dest, #i16_src
+    asm_thumb_write_op32(as, mov_op | ((i16_src >> 1) & 0x0400) | ((i16_src >> 12) & 0xf), ((i16_src << 4) & 0x7000) | (reg_dest << 8) | (i16_src & 0xff));
 }
 
-#define OP_LDR_RLO_RLO_I5(rlo_dest, rlo_base, word_offset) (0x6800 | (((word_offset) << 6) & 0x07c0) | ((rlo_base) << 3) | (rlo_dest))
-#define OP_STR_RLO_RLO_I5(rlo_dest, rlo_base, word_offset) (0x6000 | (((word_offset) << 6) & 0x07c0) | ((rlo_base) << 3) | (rlo_dest))
-
-void asm_thumb_ldr_rlo_rlo_i5(asm_thumb_t *as, uint rlo_dest, uint rlo_base, uint word_offset) {
-    asm_thumb_write_op16(as, OP_LDR_RLO_RLO_I5(rlo_dest, rlo_base, word_offset));
+// the i16_src value will be zero extended into the r32 register!
+void asm_thumb_movw_reg_i16(asm_thumb_t *as, uint reg_dest, int i16_src) {
+    asm_thumb_mov_reg_i16(as, OP_MOVW, reg_dest, i16_src);
 }
 
-void asm_thumb_str_rlo_rlo_i5(asm_thumb_t *as, uint rlo_src, uint rlo_base, uint word_offset) {
-    asm_thumb_write_op16(as, OP_STR_RLO_RLO_I5(rlo_src, rlo_base, word_offset));
+// the i16_src value will be zero extended into the r32 register!
+void asm_thumb_movt_reg_i16(asm_thumb_t *as, uint reg_dest, int i16_src) {
+    asm_thumb_mov_reg_i16(as, OP_MOVT, reg_dest, i16_src);
 }
 
 void asm_thumb_ite_ge(asm_thumb_t *as) {
@@ -353,7 +335,7 @@ void asm_thumb_mov_reg_i32(asm_thumb_t *as, uint reg_dest, machine_uint_t i32) {
 
 void asm_thumb_mov_reg_i32_optimised(asm_thumb_t *as, uint reg_dest, int i32) {
     if (reg_dest < 8 && UNSIGNED_FIT8(i32)) {
-        asm_thumb_movs_rlo_i8(as, reg_dest, i32);
+        asm_thumb_mov_rlo_i8(as, reg_dest, i32);
     } else if (UNSIGNED_FIT16(i32)) {
         asm_thumb_mov_reg_i16(as, OP_MOVW, reg_dest, i32);
     } else {
@@ -452,7 +434,7 @@ void asm_thumb_bl_ind(asm_thumb_t *as, void *fun_ptr, uint fun_id, uint reg_temp
         asm_thumb_mov_reg_i32(as, reg_temp, (machine_uint_t)fun_ptr);
         asm_thumb_write_op16(as, OP_BLX(reg_temp));
     } else if (1) {
-        asm_thumb_write_op16(as, OP_LDR_RLO_RLO_I5(reg_temp, REG_R7, fun_id));
+        asm_thumb_write_op16(as, OP_FORMAT_9_10(ASM_THUMB_FORMAT_9_LDR | ASM_THUMB_FORMAT_9_WORD_TRANSFER, reg_temp, REG_R7, fun_id));
         asm_thumb_write_op16(as, OP_BLX(reg_temp));
     } else {
         // use SVC
diff --git a/py/asmthumb.h b/py/asmthumb.h
index de376fd2ce71caaad6866d6317f3107aafe507f2..6b4f5506b69cec0a86605010f1257a0063919f13 100644
--- a/py/asmthumb.h
+++ b/py/asmthumb.h
@@ -58,16 +58,93 @@ void asm_thumb_label_assign(asm_thumb_t *as, uint label);
 // argument order follows ARM, in general dest is first
 // note there is a difference between movw and mov.w, and many others!
 
-void asm_thumb_movs_rlo_i8(asm_thumb_t *as, uint rlo_dest, int i8_src);
+// FORMAT 2: add/subtract
+
+#define ASM_THUMB_FORMAT_2_ADD (0x1800)
+#define ASM_THUMB_FORMAT_2_SUB (0x1a00)
+#define ASM_THUMB_FORMAT_2_REG_OPERAND (0x0000)
+#define ASM_THUMB_FORMAT_2_IMM_OPERAND (0x0400)
+
+void asm_thumb_format_2(asm_thumb_t *as, uint op, uint rlo_dest, uint rlo_src, int src_b);
+
+static inline void asm_thumb_add_rlo_rlo_rlo(asm_thumb_t *as, uint rlo_dest, uint rlo_src_a, uint rlo_src_b)
+    { asm_thumb_format_2(as, ASM_THUMB_FORMAT_2_ADD | ASM_THUMB_FORMAT_2_REG_OPERAND, rlo_dest, rlo_src_a, rlo_src_b); }
+static inline void asm_thumb_add_rlo_rlo_i3(asm_thumb_t *as, uint rlo_dest, uint rlo_src_a, int i3_src)
+    { asm_thumb_format_2(as, ASM_THUMB_FORMAT_2_ADD | ASM_THUMB_FORMAT_2_IMM_OPERAND, rlo_dest, rlo_src_a, i3_src); }
+static inline void asm_thumb_sub_rlo_rlo_rlo(asm_thumb_t *as, uint rlo_dest, uint rlo_src_a, uint rlo_src_b)
+    { asm_thumb_format_2(as, ASM_THUMB_FORMAT_2_SUB | ASM_THUMB_FORMAT_2_REG_OPERAND, rlo_dest, rlo_src_a, rlo_src_b); }
+static inline void asm_thumb_sub_rlo_rlo_i3(asm_thumb_t *as, uint rlo_dest, uint rlo_src_a, int i3_src)
+    { asm_thumb_format_2(as, ASM_THUMB_FORMAT_2_SUB | ASM_THUMB_FORMAT_2_IMM_OPERAND, rlo_dest, rlo_src_a, i3_src); }
+
+// FORMAT 3: move/compare/add/subtract immediate
+// These instructions all do zero extension of the i8 value
+
+#define ASM_THUMB_FORMAT_3_MOV (0x2000)
+#define ASM_THUMB_FORMAT_3_CMP (0x2800)
+#define ASM_THUMB_FORMAT_3_ADD (0x3000)
+#define ASM_THUMB_FORMAT_3_SUB (0x3800)
+
+void asm_thumb_format_3(asm_thumb_t *as, uint op, uint rlo, int i8);
+
+static inline void asm_thumb_mov_rlo_i8(asm_thumb_t *as, uint rlo, int i8) { asm_thumb_format_3(as, ASM_THUMB_FORMAT_3_MOV, rlo, i8); }
+static inline void asm_thumb_cmp_rlo_i8(asm_thumb_t *as, uint rlo, int i8) { asm_thumb_format_3(as, ASM_THUMB_FORMAT_3_CMP, rlo, i8); }
+static inline void asm_thumb_add_rlo_i8(asm_thumb_t *as, uint rlo, int i8) { asm_thumb_format_3(as, ASM_THUMB_FORMAT_3_ADD, rlo, i8); }
+static inline void asm_thumb_sub_rlo_i8(asm_thumb_t *as, uint rlo, int i8) { asm_thumb_format_3(as, ASM_THUMB_FORMAT_3_SUB, rlo, i8); }
+
+// FORMAT 4: ALU operations
+
+#define ASM_THUMB_FORMAT_4_AND (0x4000)
+#define ASM_THUMB_FORMAT_4_EOR (0x4040)
+#define ASM_THUMB_FORMAT_4_LSL (0x4080)
+#define ASM_THUMB_FORMAT_4_LSR (0x40c0)
+#define ASM_THUMB_FORMAT_4_ASR (0x4100)
+#define ASM_THUMB_FORMAT_4_ADC (0x4140)
+#define ASM_THUMB_FORMAT_4_SBC (0x4180)
+#define ASM_THUMB_FORMAT_4_ROR (0x41c0)
+#define ASM_THUMB_FORMAT_4_TST (0x4200)
+#define ASM_THUMB_FORMAT_4_NEG (0x4240)
+#define ASM_THUMB_FORMAT_4_CMP (0x4280)
+#define ASM_THUMB_FORMAT_4_CMN (0x42c0)
+#define ASM_THUMB_FORMAT_4_ORR (0x4300)
+#define ASM_THUMB_FORMAT_4_MUL (0x4340)
+#define ASM_THUMB_FORMAT_4_BIC (0x4380)
+#define ASM_THUMB_FORMAT_4_MVN (0x43c0)
+
+void asm_thumb_format_4(asm_thumb_t *as, uint op, uint rlo_dest, uint rlo_src);
+
+static inline void asm_thumb_cmp_rlo_rlo(asm_thumb_t *as, uint rlo_dest, uint rlo_src) { asm_thumb_format_4(as, ASM_THUMB_FORMAT_4_CMP, rlo_dest, rlo_src); }
+
+// FORMAT 9: load/store with immediate offset
+// For word transfers the offset must be aligned, and >>2
+
+// FORMAT 10: load/store halfword
+// The offset must be aligned, and >>1
+// The load is zero extended into the register
+
+#define ASM_THUMB_FORMAT_9_STR (0x6000)
+#define ASM_THUMB_FORMAT_9_LDR (0x6800)
+#define ASM_THUMB_FORMAT_9_WORD_TRANSFER (0x0000)
+#define ASM_THUMB_FORMAT_9_BYTE_TRANSFER (0x1000)
+
+#define ASM_THUMB_FORMAT_10_STRH (0x8000)
+#define ASM_THUMB_FORMAT_10_LDRH (0x8800)
+
+void asm_thumb_format_9_10(asm_thumb_t *as, uint op, uint rlo_dest, uint rlo_base, uint offset);
+
+static inline void asm_thumb_str_rlo_rlo_i5(asm_thumb_t *as, uint rlo_src, uint rlo_base, uint word_offset)
+    { asm_thumb_format_9_10(as, ASM_THUMB_FORMAT_9_STR | ASM_THUMB_FORMAT_9_WORD_TRANSFER, rlo_src, rlo_base, word_offset); }
+static inline void asm_thumb_strb_rlo_rlo_i5(asm_thumb_t *as, uint rlo_src, uint rlo_base, uint byte_offset)
+    { asm_thumb_format_9_10(as, ASM_THUMB_FORMAT_9_STR | ASM_THUMB_FORMAT_9_BYTE_TRANSFER, rlo_src, rlo_base, byte_offset); }
+static inline void asm_thumb_ldr_rlo_rlo_i5(asm_thumb_t *as, uint rlo_dest, uint rlo_base, uint word_offset)
+    { asm_thumb_format_9_10(as, ASM_THUMB_FORMAT_9_LDR | ASM_THUMB_FORMAT_9_WORD_TRANSFER, rlo_dest, rlo_base, word_offset); }
+static inline void asm_thumb_ldrb_rlo_rlo_i5(asm_thumb_t *as, uint rlo_dest, uint rlo_base, uint byte_offset)
+    { asm_thumb_format_9_10(as, ASM_THUMB_FORMAT_9_LDR | ASM_THUMB_FORMAT_9_BYTE_TRANSFER , rlo_dest, rlo_base, byte_offset); }
+
+// TODO convert these to above format style
+
+void asm_thumb_mov_reg_reg(asm_thumb_t *as, uint reg_dest, uint reg_src);
 void asm_thumb_movw_reg_i16(asm_thumb_t *as, uint reg_dest, int i16_src);
 void asm_thumb_movt_reg_i16(asm_thumb_t *as, uint reg_dest, int i16_src);
-void asm_thumb_mov_reg_reg(asm_thumb_t *as, uint reg_dest, uint reg_src);
-void asm_thumb_add_rlo_rlo_rlo(asm_thumb_t *as, uint rlo_dest, uint rlo_src_a, uint rlo_src_b);
-void asm_thumb_subs_rlo_rlo_i3(asm_thumb_t *as, uint rlo_dest, uint rlo_src, int i3_src);
-void asm_thumb_cmp_reg_reg(asm_thumb_t *as, uint rlo_a, uint rlo_b);
-void asm_thumb_cmp_rlo_i8(asm_thumb_t *as, uint rlo, int i8);
-void asm_thumb_ldr_rlo_rlo_i5(asm_thumb_t *as, uint rlo_dest, uint rlo_base, uint word_offset);
-void asm_thumb_str_rlo_rlo_i5(asm_thumb_t *as, uint rlo_src, uint rlo_base, uint word_offset);
 void asm_thumb_ite_ge(asm_thumb_t *as);
 void asm_thumb_b_n(asm_thumb_t *as, uint label);
 void asm_thumb_bcc_n(asm_thumb_t *as, int cond, uint label);
diff --git a/py/emitinlinethumb.c b/py/emitinlinethumb.c
index 58aeed1f640fa194d8560859edf1d1d2847a8bc1..fc881f3d189d9221349e3ea7bc485ed9153fd922 100644
--- a/py/emitinlinethumb.c
+++ b/py/emitinlinethumb.c
@@ -17,6 +17,14 @@
 
 #if MICROPY_EMIT_INLINE_THUMB
 
+typedef enum {
+    PN_none = 0,
+#define DEF_RULE(rule, comp, kind, ...) PN_##rule,
+#include "grammar.h"
+#undef DEF_RULE
+    PN_maximum_number_of,
+} pn_kind_t;
+
 struct _emit_inline_asm_t {
     uint16_t pass;
     uint16_t success;
@@ -120,15 +128,15 @@ STATIC const reg_name_t reg_name_table[] = {
     {15, "pc\0"},
 };
 
-STATIC uint get_arg_reg(emit_inline_asm_t *emit, const char *op, mp_parse_node_t *pn_args, uint wanted_arg_num, uint max_reg) {
-    if (MP_PARSE_NODE_IS_ID(pn_args[wanted_arg_num])) {
-        qstr reg_qstr = MP_PARSE_NODE_LEAF_ARG(pn_args[wanted_arg_num]);
+STATIC uint get_arg_reg(emit_inline_asm_t *emit, const char *op, mp_parse_node_t pn, uint max_reg) {
+    if (MP_PARSE_NODE_IS_ID(pn)) {
+        qstr reg_qstr = MP_PARSE_NODE_LEAF_ARG(pn);
         const char *reg_str = qstr_str(reg_qstr);
         for (uint i = 0; i < sizeof(reg_name_table) / sizeof(reg_name_table[0]); i++) {
             const reg_name_t *r = &reg_name_table[i];
             if (reg_str[0] == r->name[0] && reg_str[1] == r->name[1] && reg_str[2] == r->name[2] && (reg_str[2] == '\0' || reg_str[3] == '\0')) {
                 if (r->reg > max_reg) {
-                    emit_inline_thumb_error(emit, "'%s' expects at most r%d in position %d\n", op, max_reg, wanted_arg_num);
+                    emit_inline_thumb_error(emit, "'%s' expects at most r%d\n", op, max_reg);
                     return 0;
                 } else {
                     return r->reg;
@@ -136,16 +144,16 @@ STATIC uint get_arg_reg(emit_inline_asm_t *emit, const char *op, mp_parse_node_t
             }
         }
     }
-    emit_inline_thumb_error(emit, "'%s' expects a register in position %d\n", op, wanted_arg_num);
+    emit_inline_thumb_error(emit, "'%s' expects a register\n", op);
     return 0;
 }
 
-STATIC int get_arg_i(emit_inline_asm_t *emit, const char *op, mp_parse_node_t *pn_args, int wanted_arg_num, int fit_mask) {
-    if (!MP_PARSE_NODE_IS_SMALL_INT(pn_args[wanted_arg_num])) {
-        emit_inline_thumb_error(emit, "'%s' expects an integer in position %d\n", op, wanted_arg_num);
+STATIC int get_arg_i(emit_inline_asm_t *emit, const char *op, mp_parse_node_t pn, int fit_mask) {
+    if (!MP_PARSE_NODE_IS_SMALL_INT(pn)) {
+        emit_inline_thumb_error(emit, "'%s' expects an integer\n", op);
         return 0;
     }
-    int i = MP_PARSE_NODE_LEAF_SMALL_INT(pn_args[wanted_arg_num]);
+    int i = MP_PARSE_NODE_LEAF_SMALL_INT(pn);
     if ((i & (~fit_mask)) != 0) {
         emit_inline_thumb_error(emit, "'%s' integer 0x%x does not fit in mask 0x%x\n", op, i, fit_mask);
         return 0;
@@ -153,12 +161,34 @@ STATIC int get_arg_i(emit_inline_asm_t *emit, const char *op, mp_parse_node_t *p
     return i;
 }
 
-STATIC int get_arg_label(emit_inline_asm_t *emit, const char *op, mp_parse_node_t *pn_args, int wanted_arg_num) {
-    if (!MP_PARSE_NODE_IS_ID(pn_args[wanted_arg_num])) {
-        emit_inline_thumb_error(emit, "'%s' expects a label in position %d\n", op, wanted_arg_num);
+STATIC bool get_arg_addr(emit_inline_asm_t *emit, const char *op, mp_parse_node_t pn, mp_parse_node_t *pn_base, mp_parse_node_t *pn_offset) {
+    if (!MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_atom_bracket)) {
+        goto bad_arg;
+    }
+    mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn;
+    if (!MP_PARSE_NODE_IS_STRUCT_KIND(pns->nodes[0], PN_testlist_comp)) {
+        goto bad_arg;
+    }
+    pns = (mp_parse_node_struct_t*)pns->nodes[0];
+    if (MP_PARSE_NODE_STRUCT_NUM_NODES(pns) != 2) {
+        goto bad_arg;
+    }
+
+    *pn_base = pns->nodes[0];
+    *pn_offset = pns->nodes[1];
+    return true;
+
+bad_arg:
+    emit_inline_thumb_error(emit, "'%s' expects an address of the form [a, b]\n", op);
+    return false;
+}
+
+STATIC int get_arg_label(emit_inline_asm_t *emit, const char *op, mp_parse_node_t pn) {
+    if (!MP_PARSE_NODE_IS_ID(pn)) {
+        emit_inline_thumb_error(emit, "'%s' expects a label\n", op);
         return 0;
     }
-    qstr label_qstr = MP_PARSE_NODE_LEAF_ARG(pn_args[wanted_arg_num]);
+    qstr label_qstr = MP_PARSE_NODE_LEAF_ARG(pn);
     for (int i = 0; i < emit->max_num_labels; i++) {
         if (emit->label_lookup[i] == label_qstr) {
             return i;
@@ -212,7 +242,7 @@ STATIC void emit_inline_thumb_op(emit_inline_asm_t *emit, qstr op, int n_args, m
 
     } else if (n_args == 1) {
         if (strcmp(op_str, "b") == 0) {
-            int label_num = get_arg_label(emit, op_str, pn_args, 0);
+            int label_num = get_arg_label(emit, op_str, pn_args[0]);
             // TODO check that this succeeded, ie branch was within range
             asm_thumb_b_n(emit->as, label_num);
         } else if (op_str[0] == 'b' && op_len == 3) {
@@ -225,7 +255,7 @@ STATIC void emit_inline_thumb_op(emit_inline_asm_t *emit, qstr op, int n_args, m
             if (cc == -1) {
                 goto unknown_op;
             }
-            int label_num = get_arg_label(emit, op_str, pn_args, 0);
+            int label_num = get_arg_label(emit, op_str, pn_args[0]);
             // TODO check that this succeeded, ie branch was within range
             asm_thumb_bcc_n(emit->as, cc, label_num);
         } else {
@@ -233,59 +263,131 @@ STATIC void emit_inline_thumb_op(emit_inline_asm_t *emit, qstr op, int n_args, m
         }
 
     } else if (n_args == 2) {
-        if (strcmp(op_str, "mov") == 0) {
-            uint rlo_dest = get_arg_reg(emit, op_str, pn_args, 0, 7);
-            uint rlo_src = get_arg_reg(emit, op_str, pn_args, 1, 7);
-            asm_thumb_mov_reg_reg(emit->as, rlo_dest, rlo_src);
-        } else if (strcmp(op_str, "movs") == 0) {
-            uint rlo_dest = get_arg_reg(emit, op_str, pn_args, 0, 7);
-            int i_src = get_arg_i(emit, op_str, pn_args, 1, 0xff);
-            asm_thumb_movs_rlo_i8(emit->as, rlo_dest, i_src);
-        } else if (strcmp(op_str, "movw") == 0) {
-            uint reg_dest = get_arg_reg(emit, op_str, pn_args, 0, 15);
-            int i_src = get_arg_i(emit, op_str, pn_args, 1, 0xffff);
-            asm_thumb_movw_reg_i16(emit->as, reg_dest, i_src);
-        } else if (strcmp(op_str, "movt") == 0) {
-            uint reg_dest = get_arg_reg(emit, op_str, pn_args, 0, 15);
-            int i_src = get_arg_i(emit, op_str, pn_args, 1, 0xffff);
-            asm_thumb_movt_reg_i16(emit->as, reg_dest, i_src);
-        } else if (strcmp(op_str, "movwt") == 0) {
-            // this is a convenience instruction
-            // we clear the MSB since it might be set from extracting the small int value
-            uint reg_dest = get_arg_reg(emit, op_str, pn_args, 0, 15);
-            int i_src = get_arg_i(emit, op_str, pn_args, 1, 0xffffffff);
-            asm_thumb_movw_reg_i16(emit->as, reg_dest, i_src & 0xffff);
-            asm_thumb_movt_reg_i16(emit->as, reg_dest, (i_src >> 16) & 0x7fff);
-        } else if (strcmp(op_str, "cmp") == 0) {
-            uint rlo = get_arg_reg(emit, op_str, pn_args, 0, 7);
-            int i8 = get_arg_i(emit, op_str, pn_args, 1, 0xff);
-            asm_thumb_cmp_rlo_i8(emit->as, rlo, i8);
+        if (MP_PARSE_NODE_IS_ID(pn_args[1])) {
+            // second arg is a register (or should be)
+            uint op_code;
+            if (strcmp(op_str, "mov") == 0) {
+                uint reg_dest = get_arg_reg(emit, op_str, pn_args[0], 15);
+                uint reg_src = get_arg_reg(emit, op_str, pn_args[1], 15);
+                asm_thumb_mov_reg_reg(emit->as, reg_dest, reg_src);
+            } else if (strcmp(op_str, "and") == 0) {
+                op_code = ASM_THUMB_FORMAT_4_AND;
+                uint reg_dest, reg_src;
+                op_format_4:
+                reg_dest = get_arg_reg(emit, op_str, pn_args[0], 7);
+                reg_src = get_arg_reg(emit, op_str, pn_args[1], 7);
+                asm_thumb_format_4(emit->as, op_code, reg_dest, reg_src);
+            // TODO probably uses less ROM if these ops are in a lookup table
+            } else if (strcmp(op_str, "and") == 0) { op_code = ASM_THUMB_FORMAT_4_AND; goto op_format_4;
+            } else if (strcmp(op_str, "eor") == 0) { op_code = ASM_THUMB_FORMAT_4_EOR; goto op_format_4;
+            } else if (strcmp(op_str, "lsl") == 0) { op_code = ASM_THUMB_FORMAT_4_LSL; goto op_format_4;
+            } else if (strcmp(op_str, "lsr") == 0) { op_code = ASM_THUMB_FORMAT_4_LSR; goto op_format_4;
+            } else if (strcmp(op_str, "asr") == 0) { op_code = ASM_THUMB_FORMAT_4_ASR; goto op_format_4;
+            } else if (strcmp(op_str, "adc") == 0) { op_code = ASM_THUMB_FORMAT_4_ADC; goto op_format_4;
+            } else if (strcmp(op_str, "sbc") == 0) { op_code = ASM_THUMB_FORMAT_4_SBC; goto op_format_4;
+            } else if (strcmp(op_str, "ror") == 0) { op_code = ASM_THUMB_FORMAT_4_ROR; goto op_format_4;
+            } else if (strcmp(op_str, "tst") == 0) { op_code = ASM_THUMB_FORMAT_4_TST; goto op_format_4;
+            } else if (strcmp(op_str, "neg") == 0) { op_code = ASM_THUMB_FORMAT_4_NEG; goto op_format_4;
+            } else if (strcmp(op_str, "cmp") == 0) { op_code = ASM_THUMB_FORMAT_4_CMP; goto op_format_4;
+            } else if (strcmp(op_str, "cmn") == 0) { op_code = ASM_THUMB_FORMAT_4_CMN; goto op_format_4;
+            } else if (strcmp(op_str, "orr") == 0) { op_code = ASM_THUMB_FORMAT_4_ORR; goto op_format_4;
+            } else if (strcmp(op_str, "mul") == 0) { op_code = ASM_THUMB_FORMAT_4_MUL; goto op_format_4;
+            } else if (strcmp(op_str, "bic") == 0) { op_code = ASM_THUMB_FORMAT_4_BIC; goto op_format_4;
+            } else if (strcmp(op_str, "mvn") == 0) { op_code = ASM_THUMB_FORMAT_4_MVN; goto op_format_4;
+            } else {
+                goto unknown_op;
+            }
         } else {
-            goto unknown_op;
+            // second arg is not a register
+            uint op_code;
+            if (strcmp(op_str, "mov") == 0) {
+                op_code = ASM_THUMB_FORMAT_3_MOV;
+                uint rlo_dest, i8_src;
+                op_format_3:
+                rlo_dest = get_arg_reg(emit, op_str, pn_args[0], 7);
+                i8_src = get_arg_i(emit, op_str, pn_args[1], 0xff);
+                asm_thumb_format_3(emit->as, op_code, rlo_dest, i8_src);
+            } else if (strcmp(op_str, "cmp") == 0) {
+                op_code = ASM_THUMB_FORMAT_3_CMP;
+                goto op_format_3;
+            } else if (strcmp(op_str, "add") == 0) {
+                op_code = ASM_THUMB_FORMAT_3_ADD;
+                goto op_format_3;
+            } else if (strcmp(op_str, "sub") == 0) {
+                op_code = ASM_THUMB_FORMAT_3_SUB;
+                goto op_format_3;
+            } else if (strcmp(op_str, "movw") == 0) {
+                uint reg_dest = get_arg_reg(emit, op_str, pn_args[0], 15);
+                int i_src = get_arg_i(emit, op_str, pn_args[1], 0xffff);
+                asm_thumb_movw_reg_i16(emit->as, reg_dest, i_src);
+            } else if (strcmp(op_str, "movt") == 0) {
+                uint reg_dest = get_arg_reg(emit, op_str, pn_args[0], 15);
+                int i_src = get_arg_i(emit, op_str, pn_args[1], 0xffff);
+                asm_thumb_movt_reg_i16(emit->as, reg_dest, i_src);
+            } else if (strcmp(op_str, "movwt") == 0) {
+                // this is a convenience instruction
+                // we clear the MSB since it might be set from extracting the small int value
+                uint reg_dest = get_arg_reg(emit, op_str, pn_args[0], 15);
+                int i_src = get_arg_i(emit, op_str, pn_args[1], 0xffffffff);
+                asm_thumb_movw_reg_i16(emit->as, reg_dest, i_src & 0xffff);
+                asm_thumb_movt_reg_i16(emit->as, reg_dest, (i_src >> 16) & 0x7fff);
+            } else if (strcmp(op_str, "ldr") == 0) {
+                op_code = ASM_THUMB_FORMAT_9_LDR | ASM_THUMB_FORMAT_9_WORD_TRANSFER;
+                uint rlo_dest, rlo_base, i5;
+                mp_parse_node_t pn_base, pn_offset;
+                op_format_9_10:
+                rlo_dest = get_arg_reg(emit, op_str, pn_args[0], 7);
+                if (get_arg_addr(emit, op_str, pn_args[1], &pn_base, &pn_offset)) {
+                    rlo_base = get_arg_reg(emit, op_str, pn_base, 7);
+                    if (op_code & ASM_THUMB_FORMAT_9_BYTE_TRANSFER) {
+                        i5 = get_arg_i(emit, op_str, pn_offset, 0x1f);
+                    } else if (op_code & ASM_THUMB_FORMAT_10_STRH) { // also catches LDRH
+                        i5 = get_arg_i(emit, op_str, pn_offset, 0x3e) >> 1;
+                    } else {
+                        i5 = get_arg_i(emit, op_str, pn_offset, 0x7c) >> 2;
+                    }
+                    asm_thumb_format_9_10(emit->as, op_code, rlo_dest, rlo_base, i5);
+                }
+            } else if (strcmp(op_str, "ldrb") == 0) {
+                op_code = ASM_THUMB_FORMAT_9_LDR | ASM_THUMB_FORMAT_9_BYTE_TRANSFER;
+                goto op_format_9_10;
+            } else if (strcmp(op_str, "ldrh") == 0) {
+                op_code = ASM_THUMB_FORMAT_10_LDRH;
+                goto op_format_9_10;
+            } else if (strcmp(op_str, "str") == 0) {
+                op_code = ASM_THUMB_FORMAT_9_STR | ASM_THUMB_FORMAT_9_WORD_TRANSFER;
+                goto op_format_9_10;
+            } else if (strcmp(op_str, "strb") == 0) {
+                op_code = ASM_THUMB_FORMAT_9_STR | ASM_THUMB_FORMAT_9_BYTE_TRANSFER;
+                goto op_format_9_10;
+            } else if (strcmp(op_str, "strh") == 0) {
+                op_code = ASM_THUMB_FORMAT_10_STRH;
+                goto op_format_9_10;
+            } else {
+                goto unknown_op;
+            }
         }
 
     } else if (n_args == 3) {
+        uint op_code;
         if (strcmp(op_str, "add") == 0) {
-            uint rlo_dest = get_arg_reg(emit, op_str, pn_args, 0, 7);
-            uint rlo_src_a = get_arg_reg(emit, op_str, pn_args, 1, 7);
-            uint rlo_src_b = get_arg_reg(emit, op_str, pn_args, 2, 7);
-            asm_thumb_add_rlo_rlo_rlo(emit->as, rlo_dest, rlo_src_a, rlo_src_b);
-        } else if (strcmp(op_str, "subs") == 0) {
-            uint rlo_dest = get_arg_reg(emit, op_str, pn_args, 0, 7);
-            uint rlo_src = get_arg_reg(emit, op_str, pn_args, 1, 7);
-            int i3_src = get_arg_i(emit, op_str, pn_args, 2, 0x7);
-            asm_thumb_subs_rlo_rlo_i3(emit->as, rlo_dest, rlo_src, i3_src);
-        } else if (strcmp(op_str, "ldr") == 0) {
-            // TODO maybe use ldr(rd, [rb, 4]) syntax?
-            uint rlo_dest = get_arg_reg(emit, op_str, pn_args, 0, 7);
-            uint rlo_base = get_arg_reg(emit, op_str, pn_args, 1, 7);
-            int i5 = get_arg_i(emit, op_str, pn_args, 2, 0x7c);
-            asm_thumb_ldr_rlo_rlo_i5(emit->as, rlo_dest, rlo_base, i5 >> 2);
-        } else if (strcmp(op_str, "str") == 0) {
-            uint rlo_src = get_arg_reg(emit, op_str, pn_args, 0, 7);
-            uint rlo_base = get_arg_reg(emit, op_str, pn_args, 1, 7);
-            int i5 = get_arg_i(emit, op_str, pn_args, 2, 0x7c);
-            asm_thumb_str_rlo_rlo_i5(emit->as, rlo_src, rlo_base, i5 >> 2);
+            op_code = ASM_THUMB_FORMAT_2_ADD;
+            uint rlo_dest, rlo_src;
+            op_format_2:
+            rlo_dest = get_arg_reg(emit, op_str, pn_args[0], 7);
+            rlo_src = get_arg_reg(emit, op_str, pn_args[1], 7);
+            int src_b;
+            if (MP_PARSE_NODE_IS_ID(pn_args[2])) {
+                op_code |= ASM_THUMB_FORMAT_2_REG_OPERAND;
+                src_b = get_arg_reg(emit, op_str, pn_args[2], 7);
+            } else {
+                op_code |= ASM_THUMB_FORMAT_2_IMM_OPERAND;
+                src_b = get_arg_i(emit, op_str, pn_args[2], 0x7);
+            }
+            asm_thumb_format_2(emit->as, op_code, rlo_dest, rlo_src, src_b);
+        } else if (strcmp(op_str, "sub") == 0) {
+            op_code = ASM_THUMB_FORMAT_2_SUB;
+            goto op_format_2;
         } else {
             goto unknown_op;
         }
diff --git a/py/emitnative.c b/py/emitnative.c
index 29da5b8c996288b6d10322183bf1d03b38a4b975..b5a3acc23114b841902f7678e8f5d06042b14a35 100644
--- a/py/emitnative.c
+++ b/py/emitnative.c
@@ -1020,7 +1020,7 @@ STATIC void emit_native_for_iter(emit_t *emit, uint label) {
     asm_x64_cmp_r64_with_r64(emit->as, REG_RET, REG_TEMP1);
     asm_x64_jcc_label(emit->as, JCC_JE, label);
 #elif N_THUMB
-    asm_thumb_cmp_reg_reg(emit->as, REG_RET, REG_TEMP1);
+    asm_thumb_cmp_rlo_rlo(emit->as, REG_RET, REG_TEMP1);
     asm_thumb_bcc_label(emit->as, THUMB_CC_EQ, label);
 #endif
     emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET);
@@ -1067,10 +1067,10 @@ STATIC void emit_native_binary_op(emit_t *emit, mp_binary_op_t op) {
             asm_x64_cmp_r64_with_r64(emit->as, REG_ARG_3, REG_ARG_2);
             asm_x64_setcc_r8(emit->as, JCC_JL, REG_RET);
 #elif N_THUMB
-            asm_thumb_cmp_reg_reg(emit->as, REG_ARG_2, REG_ARG_3);
+            asm_thumb_cmp_rlo_rlo(emit->as, REG_ARG_2, REG_ARG_3);
             asm_thumb_ite_ge(emit->as);
-            asm_thumb_movs_rlo_i8(emit->as, REG_RET, 0); // if r0 >= r1
-            asm_thumb_movs_rlo_i8(emit->as, REG_RET, 1); // if r0 < r1
+            asm_thumb_mov_rlo_i8(emit->as, REG_RET, 0); // if r0 >= r1
+            asm_thumb_mov_rlo_i8(emit->as, REG_RET, 1); // if r0 < r1
 #endif
             emit_post_push_reg(emit, VTYPE_BOOL, REG_RET);
         } else {
diff --git a/py/objfun.c b/py/objfun.c
index dd4b7347ca9a7f1bfae31910b0cc0501dd010fd4..4ef92c0256d033f1e0a90dd2bc5960872dcb32f8 100644
--- a/py/objfun.c
+++ b/py/objfun.c
@@ -422,26 +422,35 @@ STATIC machine_uint_t convert_obj_for_inline_asm(mp_obj_t obj) {
         // pointer to the string (it's probably constant though!)
         uint l;
         return (machine_uint_t)mp_obj_str_get_data(obj, &l);
+    } else {
+        mp_obj_type_t *type = mp_obj_get_type(obj); // fetch the type once; the branches below compare against it
+        if (0) { // never taken; lets the #if-guarded float case below begin with "} else if"
 #if MICROPY_ENABLE_FLOAT
-    } else if (MP_OBJ_IS_TYPE(obj, &mp_type_float)) {
-        // convert float to int (could also pass in float registers)
-        return (machine_int_t)mp_obj_float_get(obj);
+        } else if (type == &mp_type_float) {
+            // convert float to int (could also pass in float registers)
+            return (machine_int_t)mp_obj_float_get(obj);
 #endif
-    } else if (MP_OBJ_IS_TYPE(obj, &mp_type_tuple)) {
-        // pointer to start of tuple (could pass length, but then could use len(x) for that)
-        uint len;
-        mp_obj_t *items;
-        mp_obj_tuple_get(obj, &len, &items);
-        return (machine_uint_t)items;
-    } else if (MP_OBJ_IS_TYPE(obj, &mp_type_list)) {
-        // pointer to start of list (could pass length, but then could use len(x) for that)
-        uint len;
-        mp_obj_t *items;
-        mp_obj_list_get(obj, &len, &items);
-        return (machine_uint_t)items;
-    } else {
-        // just pass along a pointer to the object
-        return (machine_uint_t)obj;
+        } else if (type == &mp_type_tuple) {
+            // pointer to start of tuple (could pass length, but then could use len(x) for that)
+            uint len;
+            mp_obj_t *items;
+            mp_obj_tuple_get(obj, &len, &items);
+            return (machine_uint_t)items;
+        } else if (type == &mp_type_list) {
+            // pointer to start of list (could pass length, but then could use len(x) for that)
+            uint len;
+            mp_obj_t *items;
+            mp_obj_list_get(obj, &len, &items);
+            return (machine_uint_t)items;
+        } else if (type->buffer_p.get_buffer != NULL) {
+            // supports the buffer protocol: request read access and pass a pointer to the data
+            buffer_info_t bufinfo;
+            type->buffer_p.get_buffer(obj, &bufinfo, BUFFER_READ); // NOTE(review): any status returned by get_buffer is discarded -- confirm it cannot fail
+            return (machine_uint_t)bufinfo.buf;
+        } else {
+            // just pass along a pointer to the object
+            return (machine_uint_t)obj;
+        }
     }
 }