From cd82e02e84df5f9f2f3082d865beae25217af2a1 Mon Sep 17 00:00:00 2001
From: Damien George <damien.p.george@gmail.com>
Date: Sun, 2 Feb 2014 13:11:48 +0000
Subject: [PATCH] py: Partially fix native emitter to work with latest runtime.

The native emitter has been broken since the stack order was changed
from reverse to standard.  This fix gets it partially working.
---
 py/asmx64.c     | 146 +++++++++++++++++++++++++-----------------------
 py/emitnative.c |  60 +++++++++++++-------
 py/objtype.c    |   2 +-
 py/runtime.c    |  17 +++---
 py/runtime.h    |   1 +
 py/runtime0.h   |   6 +-
 6 files changed, 129 insertions(+), 103 deletions(-)

diff --git a/py/asmx64.c b/py/asmx64.c
index de3433248..197ccd883 100644
--- a/py/asmx64.c
+++ b/py/asmx64.c
@@ -94,6 +94,7 @@ struct _asm_x64_t {
 
     uint max_num_labels;
     int *label_offsets;
+    int num_locals;
 };
 
 // for allocating memory, see src/v8/src/platform-linux.cc
@@ -108,8 +109,8 @@ void *alloc_mem(uint req_size, uint *alloc_size, bool is_exec) {
     return ptr;
 }
 
-asm_x64_t* asm_x64_new(uint max_num_labels) {
-    asm_x64_t* as;
+asm_x64_t *asm_x64_new(uint max_num_labels) {
+    asm_x64_t *as;
 
     as = m_new(asm_x64_t, 1);
     as->pass = 0;
@@ -118,11 +119,12 @@ asm_x64_t* asm_x64_new(uint max_num_labels) {
     as->code_base = NULL;
     as->max_num_labels = max_num_labels;
     as->label_offsets = m_new(int, max_num_labels);
+    as->num_locals = 0;
 
     return as;
 }
 
-void asm_x64_free(asm_x64_t* as, bool free_code) {
+void asm_x64_free(asm_x64_t *as, bool free_code) {
     if (free_code) {
         // need to un-mmap
         //m_free(as->code_base);
@@ -174,7 +176,7 @@ void asm_x64_end_pass(asm_x64_t *as) {
 }
 
 // all functions must go through this one to emit bytes
-static byte* asm_x64_get_cur_to_write_bytes(asm_x64_t* as, int num_bytes_to_write) {
+static byte *asm_x64_get_cur_to_write_bytes(asm_x64_t *as, int num_bytes_to_write) {
     //printf("emit %d\n", num_bytes_to_write);
     if (as->pass < ASM_X64_PASS_3) {
         as->code_offset += num_bytes_to_write;
@@ -187,33 +189,33 @@ static byte* asm_x64_get_cur_to_write_bytes(asm_x64_t* as, int num_bytes_to_writ
     }
 }
 
-uint asm_x64_get_code_size(asm_x64_t* as) {
+uint asm_x64_get_code_size(asm_x64_t *as) {
     return as->code_size;
 }
 
-void* asm_x64_get_code(asm_x64_t* as) {
+void *asm_x64_get_code(asm_x64_t *as) {
     return as->code_base;
 }
 
-static void asm_x64_write_byte_1(asm_x64_t* as, byte b1) {
+static void asm_x64_write_byte_1(asm_x64_t *as, byte b1) {
     byte* c = asm_x64_get_cur_to_write_bytes(as, 1);
     c[0] = b1;
 }
 
-static void asm_x64_write_byte_2(asm_x64_t* as, byte b1, byte b2) {
+static void asm_x64_write_byte_2(asm_x64_t *as, byte b1, byte b2) {
     byte* c = asm_x64_get_cur_to_write_bytes(as, 2);
     c[0] = b1;
     c[1] = b2;
 }
 
-static void asm_x64_write_byte_3(asm_x64_t* as, byte b1, byte b2, byte b3) {
+static void asm_x64_write_byte_3(asm_x64_t *as, byte b1, byte b2, byte b3) {
     byte* c = asm_x64_get_cur_to_write_bytes(as, 3);
     c[0] = b1;
     c[1] = b2;
     c[2] = b3;
 }
 
-static void asm_x64_write_word32(asm_x64_t* as, int w32) {
+static void asm_x64_write_word32(asm_x64_t *as, int w32) {
     byte* c = asm_x64_get_cur_to_write_bytes(as, 4);
     c[0] = IMM32_L0(w32);
     c[1] = IMM32_L1(w32);
@@ -221,7 +223,7 @@ static void asm_x64_write_word32(asm_x64_t* as, int w32) {
     c[3] = IMM32_L3(w32);
 }
 
-static void asm_x64_write_word64(asm_x64_t* as, int64_t w64) {
+static void asm_x64_write_word64(asm_x64_t *as, int64_t w64) {
     byte* c = asm_x64_get_cur_to_write_bytes(as, 8);
     c[0] = IMM32_L0(w64);
     c[1] = IMM32_L1(w64);
@@ -234,7 +236,7 @@ static void asm_x64_write_word64(asm_x64_t* as, int64_t w64) {
 }
 
 /* unused
-static void asm_x64_write_word32_to(asm_x64_t* as, int offset, int w32) {
+static void asm_x64_write_word32_to(asm_x64_t *as, int offset, int w32) {
     byte* c;
     assert(offset + 4 <= as->code_size);
     c = as->code_base + offset;
@@ -245,7 +247,7 @@ static void asm_x64_write_word32_to(asm_x64_t* as, int offset, int w32) {
 }
 */
 
-static void asm_x64_write_r64_disp(asm_x64_t* as, int r64, int disp_r64, int disp_offset) {
+static void asm_x64_write_r64_disp(asm_x64_t *as, int r64, int disp_r64, int disp_offset) {
     assert(disp_r64 != REG_RSP);
 
     if (disp_offset == 0 && disp_r64 != REG_RBP) {
@@ -258,60 +260,55 @@ static void asm_x64_write_r64_disp(asm_x64_t* as, int r64, int disp_r64, int dis
     }
 }
 
-void asm_x64_nop(asm_x64_t* as)
-{
+void asm_x64_nop(asm_x64_t *as) {
     asm_x64_write_byte_1(as, OPCODE_NOP);
 }
 
-void asm_x64_push_r64(asm_x64_t* as, int src_r64)
-{
+void asm_x64_push_r64(asm_x64_t *as, int src_r64) {
     asm_x64_write_byte_1(as, OPCODE_PUSH_R64 | src_r64);
 }
 
-void asm_x64_push_i32(asm_x64_t* as, int src_i32)
-{
+void asm_x64_push_i32(asm_x64_t *as, int src_i32) {
     asm_x64_write_byte_1(as, OPCODE_PUSH_I64);
     asm_x64_write_word32(as, src_i32); // will be sign extended to 64 bits
 }
 
-void asm_x64_push_disp(asm_x64_t* as, int src_r64, int src_offset) {
+void asm_x64_push_disp(asm_x64_t *as, int src_r64, int src_offset) {
     asm_x64_write_byte_1(as, OPCODE_PUSH_M64);
     asm_x64_write_r64_disp(as, 6, src_r64, src_offset);
 }
 
-void asm_x64_pop_r64(asm_x64_t* as, int dest_r64)
-{
+void asm_x64_pop_r64(asm_x64_t *as, int dest_r64) {
     asm_x64_write_byte_1(as, OPCODE_POP_R64 | dest_r64);
 }
 
-static void asm_x64_ret(asm_x64_t* as)
-{
+static void asm_x64_ret(asm_x64_t *as) {
     asm_x64_write_byte_1(as, OPCODE_RET);
 }
 
-void asm_x64_mov_r32_to_r32(asm_x64_t* as, int src_r32, int dest_r32) {
+void asm_x64_mov_r32_to_r32(asm_x64_t *as, int src_r32, int dest_r32) {
     // defaults to 32 bit operation
     asm_x64_write_byte_2(as, OPCODE_MOV_R64_TO_RM64, MODRM_R64(src_r32) | MODRM_RM_REG | MODRM_RM_R64(dest_r32));
 }
 
-void asm_x64_mov_r64_to_r64(asm_x64_t* as, int src_r64, int dest_r64) {
+void asm_x64_mov_r64_to_r64(asm_x64_t *as, int src_r64, int dest_r64) {
     // use REX prefix for 64 bit operation
     asm_x64_write_byte_3(as, REX_PREFIX | REX_W, OPCODE_MOV_R64_TO_RM64, MODRM_R64(src_r64) | MODRM_RM_REG | MODRM_RM_R64(dest_r64));
 }
 
-void asm_x64_mov_r64_to_disp(asm_x64_t* as, int src_r64, int dest_r64, int dest_disp) {
+void asm_x64_mov_r64_to_disp(asm_x64_t *as, int src_r64, int dest_r64, int dest_disp) {
     // use REX prefix for 64 bit operation
     asm_x64_write_byte_2(as, REX_PREFIX | REX_W, OPCODE_MOV_R64_TO_RM64);
     asm_x64_write_r64_disp(as, src_r64, dest_r64, dest_disp);
 }
 
-void asm_x64_mov_disp_to_r64(asm_x64_t* as, int src_r64, int src_disp, int dest_r64) {
+void asm_x64_mov_disp_to_r64(asm_x64_t *as, int src_r64, int src_disp, int dest_r64) {
     // use REX prefix for 64 bit operation
     asm_x64_write_byte_2(as, REX_PREFIX | REX_W, OPCODE_MOV_RM64_TO_R64);
     asm_x64_write_r64_disp(as, dest_r64, src_r64, src_disp);
 }
 
-void asm_x64_lea_disp_to_r64(asm_x64_t* as, int src_r64, int src_disp, int dest_r64) {
+void asm_x64_lea_disp_to_r64(asm_x64_t *as, int src_r64, int src_disp, int dest_r64) {
     // use REX prefix for 64 bit operation
     asm_x64_write_byte_2(as, REX_PREFIX | REX_W, OPCODE_LEA_MEM_TO_R64);
     asm_x64_write_r64_disp(as, dest_r64, src_r64, src_disp);
@@ -321,13 +318,13 @@ void asm_x64_mov_i8_to_r8(asm_x64_t *as, int src_i8, int dest_r64) {
     asm_x64_write_byte_2(as, OPCODE_MOV_I8_TO_R8 | dest_r64, src_i8);
 }
 
-void asm_x64_mov_i32_to_r64(asm_x64_t* as, int src_i32, int dest_r64) {
+void asm_x64_mov_i32_to_r64(asm_x64_t *as, int src_i32, int dest_r64) {
     // cpu defaults to i32 to r64, with zero extension
     asm_x64_write_byte_1(as, OPCODE_MOV_I64_TO_R64 | dest_r64);
     asm_x64_write_word32(as, src_i32);
 }
 
-void asm_x64_mov_i64_to_r64(asm_x64_t* as, int64_t src_i64, int dest_r64) {
+void asm_x64_mov_i64_to_r64(asm_x64_t *as, int64_t src_i64, int dest_r64) {
     // cpu defaults to i32 to r64
     // to mov i64 to r64 need to use REX prefix
     asm_x64_write_byte_2(as, REX_PREFIX | REX_W, OPCODE_MOV_I64_TO_R64 | dest_r64);
@@ -344,7 +341,7 @@ void asm_x64_mov_i64_to_r64_optimised(asm_x64_t *as, int64_t src_i64, int dest_r
     }
 }
 
-void asm_x64_mov_i32_to_disp(asm_x64_t* as, int src_i32, int dest_r32, int dest_disp)
+void asm_x64_mov_i32_to_disp(asm_x64_t *as, int src_i32, int dest_r32, int dest_disp)
 {
     assert(0);
     asm_x64_write_byte_1(as, OPCODE_MOV_I32_TO_RM32);
@@ -356,11 +353,11 @@ void asm_x64_xor_r64_to_r64(asm_x64_t *as, int src_r64, int dest_r64) {
     asm_x64_write_byte_3(as, REX_PREFIX | REX_W, OPCODE_XOR_R64_TO_RM64, MODRM_R64(src_r64) | MODRM_RM_REG | MODRM_RM_R64(dest_r64));
 }
 
-void asm_x64_add_r64_to_r64(asm_x64_t* as, int src_r64, int dest_r64) {
+void asm_x64_add_r64_to_r64(asm_x64_t *as, int src_r64, int dest_r64) {
     asm_x64_write_byte_3(as, REX_PREFIX | REX_W, OPCODE_ADD_R64_TO_RM64, MODRM_R64(src_r64) | MODRM_RM_REG | MODRM_RM_R64(dest_r64));
 }
 
-void asm_x64_add_i32_to_r32(asm_x64_t* as, int src_i32, int dest_r32)
+void asm_x64_add_i32_to_r32(asm_x64_t *as, int src_i32, int dest_r32)
 {
     assert(dest_r32 != REG_RSP); // in this case i think src_i32 must be 64 bits
     if (SIGNED_FIT8(src_i32))
@@ -375,17 +372,17 @@ void asm_x64_add_i32_to_r32(asm_x64_t* as, int src_i32, int dest_r32)
     }
 }
 
-void asm_x64_sub_r32_from_r32(asm_x64_t* as, int src_r32, int dest_r32) {
+void asm_x64_sub_r32_from_r32(asm_x64_t *as, int src_r32, int dest_r32) {
     // defaults to 32 bit operation
     asm_x64_write_byte_2(as, OPCODE_SUB_R64_FROM_RM64, MODRM_R64(src_r32) | MODRM_RM_REG | MODRM_RM_R64(dest_r32));
 }
 
-void asm_x64_sub_r64_from_r64(asm_x64_t* as, int src_r64, int dest_r64) {
+void asm_x64_sub_r64_from_r64(asm_x64_t *as, int src_r64, int dest_r64) {
     // use REX prefix for 64 bit operation
     asm_x64_write_byte_3(as, REX_PREFIX | REX_W, OPCODE_SUB_R64_FROM_RM64, MODRM_R64(src_r64) | MODRM_RM_REG | MODRM_RM_R64(dest_r64));
 }
 
-void asm_x64_sub_i32_from_r32(asm_x64_t* as, int src_i32, int dest_r32) {
+void asm_x64_sub_i32_from_r32(asm_x64_t *as, int src_i32, int dest_r32) {
     if (SIGNED_FIT8(src_i32)) {
         // defaults to 32 bit operation
         asm_x64_write_byte_2(as, OPCODE_SUB_I8_FROM_RM64, MODRM_R64(5) | MODRM_RM_REG | MODRM_RM_R64(dest_r32));
@@ -397,7 +394,7 @@ void asm_x64_sub_i32_from_r32(asm_x64_t* as, int src_i32, int dest_r32) {
     }
 }
 
-void asm_x64_sub_i32_from_r64(asm_x64_t* as, int src_i32, int dest_r64) {
+void asm_x64_sub_i32_from_r64(asm_x64_t *as, int src_i32, int dest_r64) {
     if (SIGNED_FIT8(src_i32)) {
         // use REX prefix for 64 bit operation
         asm_x64_write_byte_3(as, REX_PREFIX | REX_W, OPCODE_SUB_I8_FROM_RM64, MODRM_R64(5) | MODRM_RM_REG | MODRM_RM_R64(dest_r64));
@@ -410,38 +407,38 @@ void asm_x64_sub_i32_from_r64(asm_x64_t* as, int src_i32, int dest_r64) {
 }
 
 /* shifts not tested */
-void asm_x64_shl_r32_by_imm(asm_x64_t* as, int r32, int imm) {
+void asm_x64_shl_r32_by_imm(asm_x64_t *as, int r32, int imm) {
     asm_x64_write_byte_2(as, OPCODE_SHL_RM32_BY_I8, MODRM_R64(4) | MODRM_RM_REG | MODRM_RM_R64(r32));
     asm_x64_write_byte_1(as, imm);
 }
 
-void asm_x64_shr_r32_by_imm(asm_x64_t* as, int r32, int imm) {
+void asm_x64_shr_r32_by_imm(asm_x64_t *as, int r32, int imm) {
     asm_x64_write_byte_2(as, OPCODE_SHR_RM32_BY_I8, MODRM_R64(5) | MODRM_RM_REG | MODRM_RM_R64(r32));
     asm_x64_write_byte_1(as, imm);
 }
 
-void asm_x64_sar_r32_by_imm(asm_x64_t* as, int r32, int imm) {
+void asm_x64_sar_r32_by_imm(asm_x64_t *as, int r32, int imm) {
     asm_x64_write_byte_2(as, OPCODE_SAR_RM32_BY_I8, MODRM_R64(7) | MODRM_RM_REG | MODRM_RM_R64(r32));
     asm_x64_write_byte_1(as, imm);
 }
 
-void asm_x64_cmp_r64_with_r64(asm_x64_t* as, int src_r64_a, int src_r64_b) {
+void asm_x64_cmp_r64_with_r64(asm_x64_t *as, int src_r64_a, int src_r64_b) {
     asm_x64_write_byte_3(as, REX_PREFIX | REX_W, OPCODE_CMP_R64_WITH_RM64, MODRM_R64(src_r64_a) | MODRM_RM_REG | MODRM_RM_R64(src_r64_b));
 }
 
-void asm_x64_cmp_r32_with_disp(asm_x64_t* as, int src_r32_a, int src_r32_b, int src_disp_b) {
+void asm_x64_cmp_r32_with_disp(asm_x64_t *as, int src_r32_a, int src_r32_b, int src_disp_b) {
     assert(0);
     asm_x64_write_byte_1(as, OPCODE_CMP_R64_WITH_RM64);
     //asm_x64_write_r32_disp(as, src_r32_a, src_r32_b, src_disp_b);
 }
 
-void asm_x64_cmp_disp_with_r32(asm_x64_t* as, int src_r32_a, int src_disp_a, int src_r32_b) {
+void asm_x64_cmp_disp_with_r32(asm_x64_t *as, int src_r32_a, int src_disp_a, int src_r32_b) {
     assert(0);
     asm_x64_write_byte_1(as, OPCODE_CMP_RM32_WITH_R32);
     //asm_x64_write_r32_disp(as, src_r32_b, src_r32_a, src_disp_a);
 }
 
-void asm_x64_cmp_i32_with_r32(asm_x64_t* as, int src_i32, int src_r32) {
+void asm_x64_cmp_i32_with_r32(asm_x64_t *as, int src_i32, int src_r32) {
     if (SIGNED_FIT8(src_i32)) {
         asm_x64_write_byte_2(as, OPCODE_CMP_I8_WITH_RM32, MODRM_R64(7) | MODRM_RM_REG | MODRM_RM_R64(src_r32));
         asm_x64_write_byte_1(as, src_i32 & 0xff);
@@ -451,18 +448,18 @@ void asm_x64_cmp_i32_with_r32(asm_x64_t* as, int src_i32, int src_r32) {
     }
 }
 
-void asm_x64_test_r8_with_r8(asm_x64_t* as, int src_r64_a, int src_r64_b) {
+void asm_x64_test_r8_with_r8(asm_x64_t *as, int src_r64_a, int src_r64_b) {
     // TODO implement for other registers
     assert(src_r64_a == REG_RAX);
     assert(src_r64_b == REG_RAX);
     asm_x64_write_byte_2(as, OPCODE_TEST_R8_WITH_RM8, MODRM_R64(src_r64_a) | MODRM_RM_REG | MODRM_RM_R64(src_r64_b));
 }
 
-void asm_x64_setcc_r8(asm_x64_t* as, int jcc_type, int dest_r8) {
+void asm_x64_setcc_r8(asm_x64_t *as, int jcc_type, int dest_r8) {
     asm_x64_write_byte_3(as, OPCODE_SETCC_RM8_A, OPCODE_SETCC_RM8_B | jcc_type, MODRM_R64(0) | MODRM_RM_REG | MODRM_RM_R64(dest_r8));
 }
 
-void asm_x64_label_assign(asm_x64_t* as, int label) {
+void asm_x64_label_assign(asm_x64_t *as, int label) {
     assert(label < as->max_num_labels);
     if (as->pass == ASM_X64_PASS_2) {
         // assign label offset
@@ -524,7 +521,7 @@ void asm_x64_jcc_label(asm_x64_t *as, int jcc_type, int label) {
     }
 }
 
-void asm_x64_entry(asm_x64_t* as, int num_locals) {
+void asm_x64_entry(asm_x64_t *as, int num_locals) {
     asm_x64_push_r64(as, REG_RBP);
     asm_x64_mov_r64_to_r64(as, REG_RSP, REG_RBP);
     if (num_locals < 0) {
@@ -533,44 +530,55 @@ void asm_x64_entry(asm_x64_t* as, int num_locals) {
     num_locals |= 1; // make it odd so stack is aligned on 16 byte boundary
     asm_x64_sub_i32_from_r64(as, num_locals * WORD_SIZE, REG_RSP);
     asm_x64_push_r64(as, REG_RBX);
+    as->num_locals = num_locals;
 }
 
-void asm_x64_exit(asm_x64_t* as) {
+void asm_x64_exit(asm_x64_t *as) {
     asm_x64_pop_r64(as, REG_RBX);
     asm_x64_write_byte_1(as, OPCODE_LEAVE);
     asm_x64_ret(as);
 }
 
-void asm_x64_push_arg(asm_x64_t* as, int src_arg_num) {
+void asm_x64_push_arg(asm_x64_t *as, int src_arg_num) {
     assert(0);
     asm_x64_push_disp(as, REG_RBP, 8 + src_arg_num * WORD_SIZE);
 }
 
-void asm_x64_mov_arg_to_r32(asm_x64_t* as, int src_arg_num, int dest_r32) {
+void asm_x64_mov_arg_to_r32(asm_x64_t *as, int src_arg_num, int dest_r32) {
     assert(0);
     //asm_x64_mov_disp_to_r32(as, REG_RBP, 8 + src_arg_num * WORD_SIZE, dest_r32);
 }
 
-void asm_x64_mov_r32_to_arg(asm_x64_t* as, int src_r32, int dest_arg_num) {
+void asm_x64_mov_r32_to_arg(asm_x64_t *as, int src_r32, int dest_arg_num) {
     assert(0);
     //asm_x64_mov_r32_to_disp(as, src_r32, REG_RBP, 8 + dest_arg_num * WORD_SIZE);
 }
 
-static int asm_x64_local_offset_from_ebp(int local_num)
-{
-    return -(local_num + 1) * WORD_SIZE;
+// locals:
+//  - stored on the stack in ascending order
+//  - numbered 0 through as->num_locals-1
+//  - RBP points above the last local
+//
+//                          | RBP
+//                          v
+//  l0  l1  l2  ...  l(n-1)
+//  ^                ^
+//  | low address    | high address in RAM
+//
+static int asm_x64_local_offset_from_ebp(asm_x64_t *as, int local_num) {
+    return (-as->num_locals + local_num) * WORD_SIZE;
 }
 
-void asm_x64_mov_local_to_r64(asm_x64_t* as, int src_local_num, int dest_r64) {
-    asm_x64_mov_disp_to_r64(as, REG_RBP, asm_x64_local_offset_from_ebp(src_local_num), dest_r64);
+void asm_x64_mov_local_to_r64(asm_x64_t *as, int src_local_num, int dest_r64) {
+    asm_x64_mov_disp_to_r64(as, REG_RBP, asm_x64_local_offset_from_ebp(as, src_local_num), dest_r64);
 }
 
-void asm_x64_mov_r64_to_local(asm_x64_t* as, int src_r64, int dest_local_num) {
-    asm_x64_mov_r64_to_disp(as, src_r64, REG_RBP, asm_x64_local_offset_from_ebp(dest_local_num));
+void asm_x64_mov_r64_to_local(asm_x64_t *as, int src_r64, int dest_local_num) {
+    asm_x64_mov_r64_to_disp(as, src_r64, REG_RBP, asm_x64_local_offset_from_ebp(as, dest_local_num));
 }
 
-void asm_x64_mov_local_addr_to_r64(asm_x64_t* as, int local_num, int dest_r64) {
-    int offset = asm_x64_local_offset_from_ebp(local_num);
+void asm_x64_mov_local_addr_to_r64(asm_x64_t *as, int local_num, int dest_r64) {
+    int offset = asm_x64_local_offset_from_ebp(as, local_num);
     if (offset == 0) {
         asm_x64_mov_r64_to_r64(as, REG_RBP, dest_r64);
     } else {
@@ -578,21 +586,21 @@ void asm_x64_mov_local_addr_to_r64(asm_x64_t* as, int local_num, int dest_r64) {
     }
 }
 
-void asm_x64_push_local(asm_x64_t* as, int local_num) {
-    asm_x64_push_disp(as, REG_RBP, asm_x64_local_offset_from_ebp(local_num));
+void asm_x64_push_local(asm_x64_t *as, int local_num) {
+    asm_x64_push_disp(as, REG_RBP, asm_x64_local_offset_from_ebp(as, local_num));
 }
 
-void asm_x64_push_local_addr(asm_x64_t* as, int local_num, int temp_r64)
+void asm_x64_push_local_addr(asm_x64_t *as, int local_num, int temp_r64)
 {
     asm_x64_mov_r64_to_r64(as, REG_RBP, temp_r64);
-    asm_x64_add_i32_to_r32(as, asm_x64_local_offset_from_ebp(local_num), temp_r64);
+    asm_x64_add_i32_to_r32(as, asm_x64_local_offset_from_ebp(as, local_num), temp_r64);
     asm_x64_push_r64(as, temp_r64);
 }
 
 /*
    can't use these because code might be relocated when resized
 
-void asm_x64_call(asm_x64_t* as, void* func)
+void asm_x64_call(asm_x64_t *as, void* func)
 {
     asm_x64_sub_i32_from_r32(as, 8, REG_RSP);
     asm_x64_write_byte_1(as, OPCODE_CALL_REL32);
@@ -600,7 +608,7 @@ void asm_x64_call(asm_x64_t* as, void* func)
     asm_x64_mov_r64_to_r64(as, REG_RBP, REG_RSP);
 }
 
-void asm_x64_call_i1(asm_x64_t* as, void* func, int i1)
+void asm_x64_call_i1(asm_x64_t *as, void* func, int i1)
 {
     asm_x64_sub_i32_from_r32(as, 8, REG_RSP);
     asm_x64_sub_i32_from_r32(as, 12, REG_RSP);
@@ -612,7 +620,7 @@ void asm_x64_call_i1(asm_x64_t* as, void* func, int i1)
 }
 */
 
-void asm_x64_call_ind(asm_x64_t* as, void *ptr, int temp_r64) {
+void asm_x64_call_ind(asm_x64_t *as, void *ptr, int temp_r64) {
 #ifdef __LP64__
     asm_x64_mov_i64_to_r64_optimised(as, (int64_t)ptr, temp_r64);
 #else
diff --git a/py/emitnative.c b/py/emitnative.c
index 8968db4be..98d743a8b 100644
--- a/py/emitnative.c
+++ b/py/emitnative.c
@@ -210,6 +210,11 @@ static void emit_native_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scop
         emit->stack_start = num_locals;
         num_locals += scope->stack_size;
     }
+    if (pass == PASS_2) {
+        // XXX big hack to make sure we have some locals in PASS_2
+        // this is so that on PASS_2 the code emitted in x64 has the right size
+        num_locals += 2;
+    }
 #if N_X64
     asm_x64_entry(emit->as, num_locals);
 #elif N_THUMB
@@ -492,8 +497,8 @@ static void emit_get_stack_pointer_to_reg_for_pop(emit_t *emit, int reg_dest, in
         assert(si->kind == STACK_VALUE);
         assert(si->vtype == VTYPE_PYOBJ);
     }
-    ASM_MOV_LOCAL_ADDR_TO_REG(emit->stack_start + emit->stack_size - 1, reg_dest);
     adjust_stack(emit, -n_pop);
+    ASM_MOV_LOCAL_ADDR_TO_REG(emit->stack_start + emit->stack_size, reg_dest);
 }
 
 // vtype of all n_push objects is VTYPE_PYOBJ
@@ -503,7 +508,7 @@ static void emit_get_stack_pointer_to_reg_for_push(emit_t *emit, int reg_dest, i
         emit->stack_info[emit->stack_size + i].kind = STACK_VALUE;
         emit->stack_info[emit->stack_size + i].vtype = VTYPE_PYOBJ;
     }
-    ASM_MOV_LOCAL_ADDR_TO_REG(emit->stack_start + emit->stack_size + n_push - 1, reg_dest);
+    ASM_MOV_LOCAL_ADDR_TO_REG(emit->stack_start + emit->stack_size, reg_dest);
     adjust_stack(emit, n_push);
 }
 
@@ -526,6 +531,17 @@ static void emit_call_with_imm_arg(emit_t *emit, rt_fun_kind_t fun_kind, void *f
 #endif
 }
 
+static void emit_call_with_2_imm_args(emit_t *emit, rt_fun_kind_t fun_kind, void *fun, machine_int_t arg_val1, int arg_reg1, machine_int_t arg_val2, int arg_reg2) {
+    need_reg_all(emit);
+    ASM_MOV_IMM_TO_REG(arg_val1, arg_reg1);
+    ASM_MOV_IMM_TO_REG(arg_val2, arg_reg2);
+#if N_X64
+    asm_x64_call_ind(emit->as, fun, REG_RAX);
+#elif N_THUMB
+    asm_thumb_bl_ind(emit->as, rt_fun_table[fun_kind], fun_kind, REG_R3);
+#endif
+}
+
 static void emit_native_load_id(emit_t *emit, qstr qstr) {
     // check for built-ins
     if (strcmp(qstr_str(qstr), "v_int") == 0) {
@@ -1046,14 +1062,14 @@ static void emit_native_build_tuple(emit_t *emit, int n_args) {
     // for viper: call runtime, with types of args
     //   if wrapped in byte_array, or something, allocates memory and fills it
     emit_pre(emit);
-    emit_get_stack_pointer_to_reg_for_pop(emit, REG_ARG_2, n_args); // pointer to items in reverse order
+    emit_get_stack_pointer_to_reg_for_pop(emit, REG_ARG_2, n_args); // pointer to items
     emit_call_with_imm_arg(emit, RT_F_BUILD_TUPLE, rt_build_tuple, n_args, REG_ARG_1);
     emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET); // new tuple
 }
 
 static void emit_native_build_list(emit_t *emit, int n_args) {
     emit_pre(emit);
-    emit_get_stack_pointer_to_reg_for_pop(emit, REG_ARG_2, n_args); // pointer to items in reverse order
+    emit_get_stack_pointer_to_reg_for_pop(emit, REG_ARG_2, n_args); // pointer to items
     emit_call_with_imm_arg(emit, RT_F_BUILD_LIST, rt_build_list, n_args, REG_ARG_1);
     emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET); // new list
 }
@@ -1099,7 +1115,7 @@ static void emit_native_map_add(emit_t *emit, int map_index) {
 
 static void emit_native_build_set(emit_t *emit, int n_args) {
     emit_pre(emit);
-    emit_get_stack_pointer_to_reg_for_pop(emit, REG_ARG_2, n_args); // pointer to items in reverse order
+    emit_get_stack_pointer_to_reg_for_pop(emit, REG_ARG_2, n_args); // pointer to items
     emit_call_with_imm_arg(emit, RT_F_BUILD_SET, rt_build_set, n_args, REG_ARG_1);
     emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET); // new set
 }
@@ -1141,7 +1157,10 @@ static void emit_native_make_closure(emit_t *emit, scope_t *scope, int n_dict_pa
 static void emit_native_call_function(emit_t *emit, int n_positional, int n_keyword, bool have_star_arg, bool have_dbl_star_arg) {
     // call special viper runtime routine with type info for args, and wanted type info for return
     assert(n_keyword == 0 && !have_star_arg && !have_dbl_star_arg);
-    /*
+
+    /* we no longer have these _n-specific call_function variants:
+     * they push their args into an array anyway,
+     * and they would take too much room in the native dispatch table
     if (n_positional == 0) {
         vtype_kind_t vtype_fun;
         emit_pre_pop_reg(emit, &vtype_fun, REG_ARG_1); // the function
@@ -1162,21 +1181,21 @@ static void emit_native_call_function(emit_t *emit, int n_positional, int n_keyw
         emit_call(emit, RT_F_CALL_FUNCTION_2, rt_call_function_2);
     } else {
     */
-        emit_pre(emit);
-        if (n_positional != 0) {
-            emit_get_stack_pointer_to_reg_for_pop(emit, REG_ARG_3, n_positional); // pointer to args in reverse order
-        }
-        vtype_kind_t vtype_fun;
-        emit_pre_pop_reg(emit, &vtype_fun, REG_ARG_1); // the function
-        assert(vtype_fun == VTYPE_PYOBJ);
-        // XXX rt_call_function_n now merged with rt_call_function_n_kw
-        //emit_call_with_imm_arg(emit, RT_F_CALL_FUNCTION_N, rt_call_function_n, n_positional, REG_ARG_2);
-    //}
+
+    emit_pre(emit);
+    if (n_positional != 0) {
+        emit_get_stack_pointer_to_reg_for_pop(emit, REG_ARG_3, n_positional); // pointer to args
+    }
+    vtype_kind_t vtype_fun;
+    emit_pre_pop_reg(emit, &vtype_fun, REG_ARG_1); // the function
+    assert(vtype_fun == VTYPE_PYOBJ);
+    emit_call_with_imm_arg(emit, RT_F_CALL_FUNCTION_N_KW_FOR_NATIVE, rt_call_function_n_kw_for_native, n_positional, REG_ARG_2);
     emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET);
 }
 
 static void emit_native_call_method(emit_t *emit, int n_positional, int n_keyword, bool have_star_arg, bool have_dbl_star_arg) {
     assert(n_keyword == 0 && !have_star_arg && !have_dbl_star_arg);
+
     /*
     if (n_positional == 0) {
         vtype_kind_t vtype_meth, vtype_self;
@@ -1193,11 +1212,10 @@ static void emit_native_call_method(emit_t *emit, int n_positional, int n_keywor
         emit_call(emit, RT_F_CALL_METHOD_2, rt_call_method_2);
     } else {
     */
-        emit_pre(emit);
-        emit_get_stack_pointer_to_reg_for_pop(emit, REG_ARG_2, n_positional + 2); // pointer to items in reverse order, including meth and self
-        // XXX rt_call_method_n now merged with rt_call_method_n_kw
-        //emit_call_with_imm_arg(emit, RT_F_CALL_METHOD_N, rt_call_method_n, n_positional, REG_ARG_1);
-    //}
+
+    emit_pre(emit);
+    emit_get_stack_pointer_to_reg_for_pop(emit, REG_ARG_3, n_positional + 2); // pointer to items, including meth and self
+    emit_call_with_2_imm_args(emit, RT_F_CALL_METHOD_N_KW, rt_call_method_n_kw, n_positional, REG_ARG_1, n_keyword, REG_ARG_2);
     emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET);
 }
 
diff --git a/py/objtype.c b/py/objtype.c
index 601632534..67d4f5869 100644
--- a/py/objtype.c
+++ b/py/objtype.c
@@ -275,7 +275,7 @@ static mp_obj_t type_make_new(mp_obj_t type_in, uint n_args, uint n_kw, const mp
             return mp_obj_new_type(mp_obj_str_get_str(args[0]), args[1], args[2]);
 
         default:
-            nlr_jump(mp_obj_new_exception_msg(MP_QSTR_TypeError, "type takes at 1 or 3 arguments"));
+            nlr_jump(mp_obj_new_exception_msg(MP_QSTR_TypeError, "type takes 1 or 3 arguments"));
     }
 }
 
diff --git a/py/runtime.c b/py/runtime.c
index 6f6e3c903..b524e6520 100644
--- a/py/runtime.c
+++ b/py/runtime.c
@@ -231,13 +231,6 @@ void rt_assign_byte_code(uint unique_code_id, byte *code, uint len, int n_args,
 #if MICROPY_DEBUG_PRINTERS
     mp_byte_code_print(code, len);
 #endif
-
-#ifdef WRITE_CODE
-    if (fp_write_code != NULL) {
-        fwrite(code, len, 1, fp_write_code);
-        fflush(fp_write_code);
-    }
-#endif
 #endif
 }
 
@@ -724,6 +717,12 @@ mp_obj_t rt_call_function_2(mp_obj_t fun, mp_obj_t arg1, mp_obj_t arg2) {
     return rt_call_function_n_kw(fun, 2, 0, args);
 }
 
+// wrapper that accepts n_args (low 8 bits) and n_kw (next 8 bits) packed into one argument,
+// because the native emitter can pass at most 3 arguments to a function
+mp_obj_t rt_call_function_n_kw_for_native(mp_obj_t fun_in, uint n_args_kw, const mp_obj_t *args) {
+    return rt_call_function_n_kw(fun_in, n_args_kw & 0xff, (n_args_kw >> 8) & 0xff, args);
+}
+
 // args contains, eg: arg0  arg1  key0  value0  key1  value1
 mp_obj_t rt_call_function_n_kw(mp_obj_t fun_in, uint n_args, uint n_kw, const mp_obj_t *args) {
     // TODO improve this: fun object can specify its type and we parse here the arguments,
@@ -998,6 +997,7 @@ void *const rt_fun_table[RT_F_NUMBER_OF] = {
     rt_store_subscr,
     rt_is_true,
     rt_unary_op,
+    rt_binary_op,
     rt_build_tuple,
     rt_build_list,
     rt_list_append,
@@ -1006,9 +1006,8 @@ void *const rt_fun_table[RT_F_NUMBER_OF] = {
     rt_build_set,
     rt_store_set,
     rt_make_function_from_id,
-    rt_call_function_n_kw,
+    rt_call_function_n_kw_for_native,
     rt_call_method_n_kw,
-    rt_binary_op,
     rt_getiter,
     rt_iternext,
 };
diff --git a/py/runtime.h b/py/runtime.h
index 10c262a60..aafe1a06a 100644
--- a/py/runtime.h
+++ b/py/runtime.h
@@ -20,6 +20,7 @@ mp_obj_t rt_make_closure_from_id(int unique_code_id, mp_obj_t closure_tuple);
 mp_obj_t rt_call_function_0(mp_obj_t fun);
 mp_obj_t rt_call_function_1(mp_obj_t fun, mp_obj_t arg);
 mp_obj_t rt_call_function_2(mp_obj_t fun, mp_obj_t arg1, mp_obj_t arg2);
+mp_obj_t rt_call_function_n_kw_for_native(mp_obj_t fun_in, uint n_args_kw, const mp_obj_t *args);
 mp_obj_t rt_call_function_n_kw(mp_obj_t fun, uint n_args, uint n_kw, const mp_obj_t *args);
 mp_obj_t rt_call_method_n_kw(uint n_args, uint n_kw, const mp_obj_t *args);
 mp_obj_t rt_build_tuple(int n_args, mp_obj_t *items);
diff --git a/py/runtime0.h b/py/runtime0.h
index 9edf7ec0e..33fd80e64 100644
--- a/py/runtime0.h
+++ b/py/runtime0.h
@@ -62,6 +62,7 @@ typedef enum {
     RT_F_STORE_SUBSCR,
     RT_F_IS_TRUE,
     RT_F_UNARY_OP,
+    RT_F_BINARY_OP,
     RT_F_BUILD_TUPLE,
     RT_F_BUILD_LIST,
     RT_F_LIST_APPEND,
@@ -70,9 +71,8 @@ typedef enum {
     RT_F_BUILD_SET,
     RT_F_STORE_SET,
     RT_F_MAKE_FUNCTION_FROM_ID,
-    RT_F_CALL_FUNCTION_N,
-    RT_F_CALL_METHOD_N,
-    RT_F_BINARY_OP,
+    RT_F_CALL_FUNCTION_N_KW_FOR_NATIVE,
+    RT_F_CALL_METHOD_N_KW,
     RT_F_GETITER,
     RT_F_ITERNEXT,
     RT_F_NUMBER_OF,
-- 
GitLab