diff --git a/py/asmthumb.c b/py/asmthumb.c
index a21a3da3096ba93b509f980876901fbba8d04b8c..9e3a9abe25bdf0134d4874fda07300fa6d4053a7 100644
--- a/py/asmthumb.c
+++ b/py/asmthumb.c
@@ -166,15 +166,29 @@ STATIC void asm_thumb_write_word32(asm_thumb_t *as, int w32) {
 #define OP_ADD_SP(num_words) (0xb000 | (num_words))
 #define OP_SUB_SP(num_words) (0xb080 | (num_words))
 
+// locals:
+//  - stored on the stack in ascending address order (local 0 at the lowest address)
+//  - numbered 0 through as->num_locals-1
+//  - SP points to first local
+//
+//  | SP
+//  v
+//  l0  l1  l2  ...  l(n-1)
+//  ^                ^
+//  | low address    | high address in RAM
+
 void asm_thumb_entry(asm_thumb_t *as, int num_locals) {
-    // work out what to push and how many extra space to reserve on stack
+    // work out what to push and how many extra spaces to reserve on stack
     // so that we have enough for all locals and it's aligned an 8-byte boundary
+    // we push extra regs (r1, r2, r3) to help do the stack adjustment
+    // TODO: we should probably just always subtract from sp, since this would be more efficient
+    // note: for push rlist, the lowest numbered register is stored at the lowest address
     uint reglist;
     uint stack_adjust;
     if (num_locals < 0) {
         num_locals = 0;
     }
-    // don't ppop r0 because it's used for return value
+    // don't pop r0 because it's used for return value
     switch (num_locals) {
         case 0:
             reglist = 0xf2;
@@ -398,14 +412,14 @@ void asm_thumb_mov_reg_i32_aligned(asm_thumb_t *as, uint reg_dest, int i32) {
 
 void asm_thumb_mov_local_reg(asm_thumb_t *as, int local_num, uint rlo_src) {
     assert(rlo_src < REG_R8);
-    int word_offset = as->num_locals - local_num - 1;
+    int word_offset = local_num;
     assert(as->pass < ASM_THUMB_PASS_EMIT || word_offset >= 0);
     asm_thumb_op16(as, OP_STR_TO_SP_OFFSET(rlo_src, word_offset));
 }
 
 void asm_thumb_mov_reg_local(asm_thumb_t *as, uint rlo_dest, int local_num) {
     assert(rlo_dest < REG_R8);
-    int word_offset = as->num_locals - local_num - 1;
+    int word_offset = local_num;
     assert(as->pass < ASM_THUMB_PASS_EMIT || word_offset >= 0);
     asm_thumb_op16(as, OP_LDR_FROM_SP_OFFSET(rlo_dest, word_offset));
 }
@@ -414,7 +428,7 @@ void asm_thumb_mov_reg_local(asm_thumb_t *as, uint rlo_dest, int local_num) {
 
 void asm_thumb_mov_reg_local_addr(asm_thumb_t *as, uint rlo_dest, int local_num) {
     assert(rlo_dest < REG_R8);
-    int word_offset = as->num_locals - local_num - 1;
+    int word_offset = local_num;
     assert(as->pass < ASM_THUMB_PASS_EMIT || word_offset >= 0);
     asm_thumb_op16(as, OP_ADD_REG_SP_OFFSET(rlo_dest, word_offset));
 }
diff --git a/py/emitnative.c b/py/emitnative.c
index acf27352e754c01f4b2b8f731579483593762b3e..4ce21e9c0e8fe5c18f033ee013409b20b1e7d6a7 100644
--- a/py/emitnative.c
+++ b/py/emitnative.c
@@ -260,9 +260,9 @@ STATIC void emit_native_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scop
         if (i == 0) {
             asm_x64_mov_r64_to_r64(emit->as, REG_ARG_1, REG_LOCAL_1);
         } else if (i == 1) {
-            asm_x64_mov_r64_to_local(emit->as, REG_ARG_2, i - 1);
+            asm_x64_mov_r64_to_local(emit->as, REG_ARG_2, i - REG_LOCAL_NUM);
         } else if (i == 2) {
-            asm_x64_mov_r64_to_local(emit->as, REG_ARG_3, i - 1);
+            asm_x64_mov_r64_to_local(emit->as, REG_ARG_3, i - REG_LOCAL_NUM);
         } else {
             // TODO not implemented
             assert(0);
@@ -739,7 +739,7 @@ STATIC void emit_native_load_fast(emit_t *emit, qstr qstr, uint id_flags, int lo
         emit_post_push_reg(emit, vtype, REG_LOCAL_1);
     } else {
         need_reg_single(emit, REG_RAX, 0);
-        asm_x64_mov_local_to_r64(emit->as, local_num - 1, REG_RAX);
+        asm_x64_mov_local_to_r64(emit->as, local_num - REG_LOCAL_NUM, REG_RAX);
         emit_post_push_reg(emit, vtype, REG_RAX);
     }
 #elif N_THUMB
@@ -751,7 +751,7 @@ STATIC void emit_native_load_fast(emit_t *emit, qstr qstr, uint id_flags, int lo
         emit_post_push_reg(emit, vtype, REG_LOCAL_3);
     } else {
         need_reg_single(emit, REG_R0, 0);
-        asm_thumb_mov_reg_local(emit->as, REG_R0, local_num - 1);
+        asm_thumb_mov_reg_local(emit->as, REG_R0, local_num - REG_LOCAL_NUM);
         emit_post_push_reg(emit, vtype, REG_R0);
     }
 #endif
@@ -820,7 +820,7 @@ STATIC void emit_native_store_fast(emit_t *emit, qstr qstr, int local_num) {
         emit_pre_pop_reg(emit, &vtype, REG_LOCAL_1);
     } else {
         emit_pre_pop_reg(emit, &vtype, REG_RAX);
-        asm_x64_mov_r64_to_local(emit->as, REG_RAX, local_num - 1);
+        asm_x64_mov_r64_to_local(emit->as, REG_RAX, local_num - REG_LOCAL_NUM);
     }
 #elif N_THUMB
     if (local_num == 0) {
@@ -831,7 +831,7 @@ STATIC void emit_native_store_fast(emit_t *emit, qstr qstr, int local_num) {
         emit_pre_pop_reg(emit, &vtype, REG_LOCAL_3);
     } else {
         emit_pre_pop_reg(emit, &vtype, REG_R0);
-        asm_thumb_mov_local_reg(emit->as, local_num - 1, REG_R0);
+        asm_thumb_mov_local_reg(emit->as, local_num - REG_LOCAL_NUM, REG_R0);
     }
 #endif
 
diff --git a/tests/pybnative/for.py b/tests/pybnative/for.py
new file mode 100644
index 0000000000000000000000000000000000000000..309c6c14fd075e401c6818131f8d5df7f64cef33
--- /dev/null
+++ b/tests/pybnative/for.py
@@ -0,0 +1,15 @@
+import pyb
+
+@micropython.native
+def f1(n):
+    for i in range(n):
+        print(i)
+
+f1(4)
+
+@micropython.native
+def f2(r):
+    for i in r:
+        print(i)
+
+f2(range(4))
diff --git a/tests/pybnative/for.py.exp b/tests/pybnative/for.py.exp
new file mode 100644
index 0000000000000000000000000000000000000000..d4dc73eff36bb5ad093444fdc04a3bea2b616372
--- /dev/null
+++ b/tests/pybnative/for.py.exp
@@ -0,0 +1,8 @@
+0
+1
+2
+3
+0
+1
+2
+3
diff --git a/tests/pybnative/while.py b/tests/pybnative/while.py
index a3f64a800c44f945be8d00a52cb20c935e954f88..3ea7221ea706a3189dde3cfc4677763682c92bab 100644
--- a/tests/pybnative/while.py
+++ b/tests/pybnative/while.py
@@ -1,14 +1,15 @@
+import pyb
+
 @micropython.native
-def f(led, n):
+def f(led, n, d):
     led.off()
     i = 0
     while i < n:
+        print(i)
         led.toggle()
-        d = pyb.delay
-        d(50) # pyb.delay(50) doesn't work!
+        pyb.delay(d)
         i += 1
-        print(i)
     led.off()
 
-f(pyb.LED(1), 2)
-f(pyb.LED(2), 4)
+f(pyb.LED(1), 2, 150)
+f(pyb.LED(2), 4, 50)
diff --git a/tests/pybnative/while.py.exp b/tests/pybnative/while.py.exp
index 5cd83db0ff1e1516307419013506e599dfc1fcb6..d95e7f145215d2fbab5a7fed6b19af129f6afe8a 100644
--- a/tests/pybnative/while.py.exp
+++ b/tests/pybnative/while.py.exp
@@ -1,6 +1,6 @@
+0
 1
-2
+0
 1
 2
 3
-4