diff --git a/py/asmx86.c b/py/asmx86.c
index a0a38161ca4e4bcc4b9b05aeda3b0e05ed2dee86..70d2bfe79d4eef6dd3e7e5d88653d8668f8509ac 100644
--- a/py/asmx86.c
+++ b/py/asmx86.c
@@ -54,8 +54,8 @@
 #define OPCODE_LEA_MEM_TO_R32    (0x8d) /* /r */
 #define OPCODE_XOR_R32_TO_RM32   (0x31) /* /r */
 #define OPCODE_ADD_R32_TO_RM32   (0x01)
-//#define OPCODE_ADD_I32_TO_RM32   (0x81) /* /0 */
-//#define OPCODE_ADD_I8_TO_RM32    (0x83) /* /0 */
+#define OPCODE_ADD_I32_TO_RM32   (0x81) /* /0 */
+#define OPCODE_ADD_I8_TO_RM32    (0x83) /* /0 */
 //#define OPCODE_SUB_R32_FROM_RM32 (0x29)
 #define OPCODE_SUB_I32_FROM_RM32 (0x81) /* /5 */
 #define OPCODE_SUB_I8_FROM_RM32  (0x83) /* /5 */
@@ -275,21 +275,17 @@ void asm_x86_add_r32_to_r32(asm_x86_t *as, int src_r32, int dest_r32) {
     asm_x86_write_byte_2(as, OPCODE_ADD_R32_TO_RM32, MODRM_R32(src_r32) | MODRM_RM_REG | MODRM_RM_R32(dest_r32));
 }
 
-#if 0
-void asm_x86_add_i32_to_r32(asm_x86_t *as, int src_i32, int dest_r32)
-{
-    if (SIGNED_FIT8(src_i32))
-    {
+void asm_x86_add_i32_to_r32(asm_x86_t *as, int src_i32, int dest_r32) { // emit "add dest_r32, src_i32" (immediate added to a register)
+    if (SIGNED_FIT8(src_i32)) { // short form: opcode 0x83 /0 followed by a single immediate byte
         asm_x86_write_byte_2(as, OPCODE_ADD_I8_TO_RM32, MODRM_R32(0) | MODRM_RM_REG | MODRM_RM_R32(dest_r32));
         asm_x86_write_byte_1(as, src_i32 & 0xff);
-    }
-    else
-    {
+    } else { // long form: opcode 0x81 /0 followed by the full 32-bit immediate
         asm_x86_write_byte_2(as, OPCODE_ADD_I32_TO_RM32, MODRM_R32(0) | MODRM_RM_REG | MODRM_RM_R32(dest_r32));
         asm_x86_write_word32(as, src_i32);
     }
 }
 
+#if 0
 void asm_x86_sub_r32_from_r32(asm_x86_t *as, int src_r32, int dest_r32) {
     asm_x86_write_byte_2(as, OPCODE_SUB_R32_FROM_RM32, MODRM_R32(src_r32) | MODRM_RM_REG | MODRM_RM_R32(dest_r32));
 }
@@ -419,10 +415,15 @@ void asm_x86_entry(asm_x86_t *as, mp_uint_t num_locals) {
     asm_x86_mov_r32_to_r32(as, REG_ESP, REG_EBP);
     asm_x86_sub_i32_from_r32(as, num_locals * WORD_SIZE, REG_ESP);
     asm_x86_push_r32(as, REG_EBX);
+    asm_x86_push_r32(as, REG_ESI);
+    asm_x86_push_r32(as, REG_EDI);
+    // TODO align stack on 16-byte boundary
     as->num_locals = num_locals;
 }
 
 void asm_x86_exit(asm_x86_t *as) {
+    asm_x86_pop_r32(as, REG_EDI);
+    asm_x86_pop_r32(as, REG_ESI);
     asm_x86_pop_r32(as, REG_EBX);
     asm_x86_write_byte_1(as, OPCODE_LEAVE);
     asm_x86_ret(as);
@@ -430,18 +431,17 @@ void asm_x86_exit(asm_x86_t *as) {
 
 #if 0
 void asm_x86_push_arg(asm_x86_t *as, int src_arg_num) {
-    assert(0);
-    asm_x86_push_disp(as, REG_EBP, 8 + src_arg_num * WORD_SIZE);
+    asm_x86_push_disp(as, REG_EBP, 2 * WORD_SIZE + src_arg_num * WORD_SIZE); // arg n is at EBP + (2 + n) words; the 2 words presumably hold saved EBP and return address -- TODO confirm
 }
+#endif
 
 void asm_x86_mov_arg_to_r32(asm_x86_t *as, int src_arg_num, int dest_r32) {
-    assert(0);
-    //asm_x86_mov_disp_to_r32(as, REG_EBP, 8 + src_arg_num * WORD_SIZE, dest_r32);
+    asm_x86_mov_disp_to_r32(as, REG_EBP, 2 * WORD_SIZE + src_arg_num * WORD_SIZE, dest_r32); // arg n is at EBP + (2 + n) words; the 2 words presumably hold saved EBP and return address -- TODO confirm
 }
 
+#if 0
 void asm_x86_mov_r32_to_arg(asm_x86_t *as, int src_r32, int dest_arg_num) {
-    assert(0);
-    //asm_x86_mov_r32_to_disp(as, src_r32, REG_EBP, 8 + dest_arg_num * WORD_SIZE);
+    asm_x86_mov_r32_to_disp(as, src_r32, REG_EBP, 2 * WORD_SIZE + dest_arg_num * WORD_SIZE); // arg n is at EBP + (2 + n) words; the 2 words presumably hold saved EBP and return address -- TODO confirm
 }
 #endif
 
@@ -491,6 +491,7 @@ void asm_x86_push_local_addr(asm_x86_t *as, int local_num, int temp_r32)
 #endif
 
 void asm_x86_call_ind(asm_x86_t *as, void *ptr, mp_uint_t n_args, int temp_r32) {
+    // TODO align stack on 16-byte boundary before the call
     assert(n_args <= 3);
     if (n_args > 2) {
         asm_x86_push_r32(as, REG_ARG_3);
@@ -515,6 +516,11 @@ void asm_x86_call_ind(asm_x86_t *as, void *ptr, mp_uint_t n_args, int temp_r32)
     asm_x86_write_byte_1(as, OPCODE_CALL_REL32);
     asm_x86_write_word32(as, ptr - (void*)(as->code_base + as->code_offset + 4));
     */
+
+    // the caller must clean up the stack
+    if (n_args > 0) {
+        asm_x86_add_i32_to_r32(as, WORD_SIZE * n_args, REG_ESP);
+    }
 }
 
 #endif // MICROPY_EMIT_X86
diff --git a/py/asmx86.h b/py/asmx86.h
index 5d0fc70e3d32269ee36f42280ad3d698079fbdfa..1f4cfaf55242c4ea3ec012c7c15eacdadcf0f2df 100644
--- a/py/asmx86.h
+++ b/py/asmx86.h
@@ -24,6 +24,14 @@
  * THE SOFTWARE.
  */
 
+// The x86 cdecl calling convention is:
+//  - args are passed on the stack, pushed in reverse order (last argument first)
+//  - the return value is in EAX
+//  - the caller cleans up the stack after a call
+//  - the stack must be aligned to a 16-byte boundary before all calls
+//  - EAX, ECX, EDX are caller-saved
+//  - EBX, ESI, EDI, EBP, ESP, EIP are callee-saved
+
 #define ASM_X86_PASS_COMPUTE (1)
 #define ASM_X86_PASS_EMIT    (2)
 
@@ -45,8 +53,8 @@
 #define ASM_X86_CC_JL  (0xc) // less, signed
 
 #define REG_RET REG_EAX
-#define REG_ARG_1 REG_EDI
-#define REG_ARG_2 REG_ESI
+#define REG_ARG_1 REG_EBX
+#define REG_ARG_2 REG_ECX
 #define REG_ARG_3 REG_EDX
 
 typedef struct _asm_x86_t asm_x86_t;
@@ -71,6 +79,7 @@ void asm_x86_jmp_label(asm_x86_t* as, mp_uint_t label);
 void asm_x86_jcc_label(asm_x86_t* as, mp_uint_t jcc_type, mp_uint_t label);
 void asm_x86_entry(asm_x86_t* as, mp_uint_t num_locals);
 void asm_x86_exit(asm_x86_t* as);
+void asm_x86_mov_arg_to_r32(asm_x86_t *as, int src_arg_num, int dest_r32);
 void asm_x86_mov_local_to_r32(asm_x86_t* as, int src_local_num, int dest_r32);
 void asm_x86_mov_r32_to_local(asm_x86_t* as, int src_r32, int dest_local_num);
 void asm_x86_mov_local_addr_to_r32(asm_x86_t* as, int local_num, int dest_r32);
diff --git a/py/emitnative.c b/py/emitnative.c
index 782f4b60dd311e4f7a6abf8ea30351780cb18ed7..6afdf130a157cc41901a99ba2ba295d2b6359bf8 100644
--- a/py/emitnative.c
+++ b/py/emitnative.c
@@ -190,11 +190,12 @@ STATIC byte mp_f_n_args[MP_F_NUMBER_OF] = {
 #define EXPORT_FUN(name) emit_native_x86_##name
 
 #define REG_TEMP0 (REG_EAX)
-#define REG_TEMP1 (REG_EDI)
-#define REG_TEMP2 (REG_ESI)
+#define REG_TEMP1 (REG_EBX)
+#define REG_TEMP2 (REG_ECX)
 
-#define REG_LOCAL_1 (REG_EBX)
-#define REG_LOCAL_NUM (1)
+#define REG_LOCAL_1 (REG_ESI)
+#define REG_LOCAL_2 (REG_EDI)
+#define REG_LOCAL_NUM (2)
 
 #define ASM_PASS_COMPUTE    ASM_X86_PASS_COMPUTE
 #define ASM_PASS_EMIT       ASM_X86_PASS_EMIT
@@ -523,17 +524,13 @@ STATIC void emit_native_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scop
     }
 #elif N_X86
     for (int i = 0; i < scope->num_pos_args; i++) {
-        // TODO
-        assert(0);
         if (i == 0) {
-            asm_x86_mov_r32_to_r32(emit->as, REG_ARG_1, REG_LOCAL_1);
+            asm_x86_mov_arg_to_r32(emit->as, i, REG_LOCAL_1);
         } else if (i == 1) {
-            asm_x86_mov_r32_to_local(emit->as, REG_ARG_2, i - REG_LOCAL_NUM);
-        } else if (i == 2) {
-            asm_x86_mov_r32_to_local(emit->as, REG_ARG_3, i - REG_LOCAL_NUM);
+            asm_x86_mov_arg_to_r32(emit->as, i, REG_LOCAL_2);
         } else {
-            // TODO not implemented
-            assert(0);
+            asm_x86_mov_arg_to_r32(emit->as, i, REG_TEMP0);
+            asm_x86_mov_r32_to_local(emit->as, REG_TEMP0, i - REG_LOCAL_NUM);
         }
     }
 #elif N_THUMB
@@ -1023,6 +1020,8 @@ STATIC void emit_native_load_fast(emit_t *emit, qstr qstr, uint id_flags, int lo
 #elif N_X86
     if (local_num == 0) {
         emit_post_push_reg(emit, vtype, REG_LOCAL_1);
+    } else if (local_num == 1) {
+        emit_post_push_reg(emit, vtype, REG_LOCAL_2);
     } else {
         need_reg_single(emit, REG_EAX, 0);
         asm_x86_mov_local_to_r32(emit->as, local_num - REG_LOCAL_NUM, REG_EAX);
@@ -1124,6 +1123,8 @@ STATIC void emit_native_store_fast(emit_t *emit, qstr qstr, int local_num) {
 #elif N_X86
     if (local_num == 0) {
         emit_pre_pop_reg(emit, &vtype, REG_LOCAL_1);
+    } else if (local_num == 1) {
+        emit_pre_pop_reg(emit, &vtype, REG_LOCAL_2);
     } else {
         emit_pre_pop_reg(emit, &vtype, REG_EAX);
         asm_x86_mov_r32_to_local(emit->as, REG_EAX, local_num - REG_LOCAL_NUM);