From 3112cde9006809a1ffa7f19e96fa8ee28311f411 Mon Sep 17 00:00:00 2001
From: Damien George <damien.p.george@gmail.com>
Date: Mon, 29 Sep 2014 18:45:42 +0100
Subject: [PATCH] py: Implement more binary ops for viper emitter.

This includes some restructuring of the assembler backends.  Note
that the ARM backend is missing a few functions and won't compile.
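
As an illustration only (not part of this patch), a viper function
along the following lines should now be compilable to native code on
the x64, x86 and Thumb backends, since integer subtract, register
shifts and the full set of comparisons are emitted inline (ARM still
hits the #error for generic comparisons).  The decorator and type
annotations are the usual viper syntax, assumed here for the example:

    @micropython.viper
    def f(x: int, y: int) -> int:
        a = (x - y) << 1   # subtract and left shift, now emitted inline
        b = x >> y         # arithmetic right shift by a register value
        if a <= b:         # comparison, via setcc on x86/x64, IT on Thumb
            return a
        return b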
---
 py/asmarm.c     |  12 ++-
 py/asmarm.h     |   3 +-
 py/asmx64.c     |  75 +++++++++--------
 py/asmx64.h     |  16 +++-
 py/asmx86.c     |  44 ++++++----
 py/asmx86.h     |  16 +++-
 py/emitnative.c | 211 +++++++++++++++++++++++++++++++++++-------------
 7 files changed, 260 insertions(+), 117 deletions(-)

diff --git a/py/asmarm.c b/py/asmarm.c
index 0d5568692..8dbd9ad20 100644
--- a/py/asmarm.c
+++ b/py/asmarm.c
@@ -170,6 +170,11 @@ STATIC uint asm_arm_op_sub_imm(uint rd, uint rn, uint imm) {
     return 0x2400000 | (rn << 16) | (rd << 12) | (imm & 0xFF);
 }
 
+STATIC uint asm_arm_op_sub_reg(uint rd, uint rn, uint rm) {
+    // sub rd, rn, rm
+    return 0x0400000 | (rn << 16) | (rd << 12) | rm;
+}
+
 void asm_arm_bkpt(asm_arm_t *as) {
     // bkpt #0
     emit_al(as, 0x1200070); 
@@ -298,11 +303,16 @@ void asm_arm_less_op(asm_arm_t *as, uint rd, uint rn, uint rm) {
     emit(as, asm_arm_op_mov_imm(rd, 0) | ASM_ARM_CC_GE); // movge rd, #0
 }
 
-void asm_arm_add_reg(asm_arm_t *as, uint rd, uint rn, uint rm) {
+void asm_arm_add_reg_reg_reg(asm_arm_t *as, uint rd, uint rn, uint rm) {
     // add rd, rn, rm
     emit_al(as, asm_arm_op_add_reg(rd, rn, rm));
 }
 
+void asm_arm_sub_reg_reg_reg(asm_arm_t *as, uint rd, uint rn, uint rm) {
+    // sub rd, rn, rm
+    emit_al(as, asm_arm_op_sub_reg(rd, rn, rm));
+}
+
 void asm_arm_mov_reg_local_addr(asm_arm_t *as, uint rd, int local_num) {
     // add rd, sp, #local_num*4
     emit_al(as, asm_arm_op_add_imm(rd, ASM_ARM_REG_SP, local_num << 2));
diff --git a/py/asmarm.h b/py/asmarm.h
index e0c8efe1f..d977102ba 100644
--- a/py/asmarm.h
+++ b/py/asmarm.h
@@ -89,7 +89,8 @@ void asm_arm_mov_reg_local(asm_arm_t *as, uint rd, int local_num);
 void asm_arm_cmp_reg_i8(asm_arm_t *as, uint rd, int imm);
 void asm_arm_cmp_reg_reg(asm_arm_t *as, uint rd, uint rn);
 void asm_arm_less_op(asm_arm_t *as, uint rd, uint rn, uint rm);
-void asm_arm_add_reg(asm_arm_t *as, uint rd, uint rn, uint rm);
+void asm_arm_add_reg_reg_reg(asm_arm_t *as, uint rd, uint rn, uint rm);
+void asm_arm_sub_reg_reg_reg(asm_arm_t *as, uint rd, uint rn, uint rm);
 void asm_arm_mov_reg_local_addr(asm_arm_t *as, uint rd, int local_num);
 
 void asm_arm_bcc_label(asm_arm_t *as, int cond, uint label);
diff --git a/py/asmx64.c b/py/asmx64.c
index 8d074dc40..3f111781f 100644
--- a/py/asmx64.c
+++ b/py/asmx64.c
@@ -54,19 +54,21 @@
 #define OPCODE_MOV_RM64_TO_R64   (0x8b)
 #define OPCODE_LEA_MEM_TO_R64    (0x8d) /* /r */
 #define OPCODE_XOR_R64_TO_RM64   (0x31) /* /r */
-#define OPCODE_ADD_R64_TO_RM64   (0x01)
+#define OPCODE_ADD_R64_TO_RM64   (0x01) /* /r */
 #define OPCODE_ADD_I32_TO_RM32   (0x81) /* /0 */
 #define OPCODE_ADD_I8_TO_RM32    (0x83) /* /0 */
 #define OPCODE_SUB_R64_FROM_RM64 (0x29)
 #define OPCODE_SUB_I32_FROM_RM64 (0x81) /* /5 */
 #define OPCODE_SUB_I8_FROM_RM64  (0x83) /* /5 */
-#define OPCODE_SHL_RM32_BY_I8    (0xc1) /* /4 */
-#define OPCODE_SHR_RM32_BY_I8    (0xc1) /* /5 */
-#define OPCODE_SAR_RM32_BY_I8    (0xc1) /* /7 */
-#define OPCODE_CMP_I32_WITH_RM32 (0x81) /* /7 */
-#define OPCODE_CMP_I8_WITH_RM32  (0x83) /* /7 */
-#define OPCODE_CMP_R64_WITH_RM64 (0x39)
-#define OPCODE_CMP_RM32_WITH_R32 (0x3b)
+//#define OPCODE_SHL_RM32_BY_I8    (0xc1) /* /4 */
+//#define OPCODE_SHR_RM32_BY_I8    (0xc1) /* /5 */
+//#define OPCODE_SAR_RM32_BY_I8    (0xc1) /* /7 */
+#define OPCODE_SHL_RM64_CL       (0xd3) /* /4 */
+#define OPCODE_SAR_RM64_CL       (0xd3) /* /7 */
+//#define OPCODE_CMP_I32_WITH_RM32 (0x81) /* /7 */
+//#define OPCODE_CMP_I8_WITH_RM32  (0x83) /* /7 */
+#define OPCODE_CMP_R64_WITH_RM64 (0x39) /* /r */
+//#define OPCODE_CMP_RM32_WITH_R32 (0x3b)
 #define OPCODE_TEST_R8_WITH_RM8  (0x84) /* /r */
 #define OPCODE_JMP_REL8          (0xeb)
 #define OPCODE_JMP_REL32         (0xe9)
@@ -253,6 +255,10 @@ STATIC void asm_x64_write_r64_disp(asm_x64_t *as, int r64, int disp_r64, int dis
     }
 }
 
+STATIC void asm_x64_generic_r64_r64(asm_x64_t *as, int dest_r64, int src_r64, int op) {
+    asm_x64_write_byte_3(as, REX_PREFIX | REX_W | (src_r64 < 8 ? 0 : REX_R) | (dest_r64 < 8 ? 0 : REX_B), op, MODRM_R64(src_r64) | MODRM_RM_REG | MODRM_RM_R64(dest_r64));
+}
+
 void asm_x64_nop(asm_x64_t *as) {
     asm_x64_write_byte_1(as, OPCODE_NOP);
 }
@@ -290,9 +296,8 @@ STATIC void asm_x64_ret(asm_x64_t *as) {
     asm_x64_write_byte_1(as, OPCODE_RET);
 }
 
-void asm_x64_mov_r64_to_r64(asm_x64_t *as, int src_r64, int dest_r64) {
-    // use REX prefix for 64 bit operation
-    asm_x64_write_byte_3(as, REX_PREFIX | REX_W | (src_r64 < 8 ? 0 : REX_R) | (dest_r64 < 8 ? 0 : REX_B), OPCODE_MOV_R64_TO_RM64, MODRM_R64(src_r64) | MODRM_RM_REG | MODRM_RM_R64(dest_r64));
+void asm_x64_mov_r64_r64(asm_x64_t *as, int dest_r64, int src_r64) {
+    asm_x64_generic_r64_r64(as, dest_r64, src_r64, OPCODE_MOV_R64_TO_RM64);
 }
 
 void asm_x64_mov_r8_to_disp(asm_x64_t *as, int src_r64, int dest_r64, int dest_disp) {
@@ -377,30 +382,24 @@ void asm_x64_mov_i64_to_r64_aligned(asm_x64_t *as, int64_t src_i64, int dest_r64
     asm_x64_mov_i64_to_r64(as, src_i64, dest_r64);
 }
 
-void asm_x64_xor_r64_to_r64(asm_x64_t *as, int src_r64, int dest_r64) {
-    assert(src_r64 < 8);
-    assert(dest_r64 < 8);
-    asm_x64_write_byte_3(as, REX_PREFIX | REX_W, OPCODE_XOR_R64_TO_RM64, MODRM_R64(src_r64) | MODRM_RM_REG | MODRM_RM_R64(dest_r64));
+void asm_x64_xor_r64_r64(asm_x64_t *as, int dest_r64, int src_r64) {
+    asm_x64_generic_r64_r64(as, dest_r64, src_r64, OPCODE_XOR_R64_TO_RM64);
 }
 
-void asm_x64_add_r64_to_r64(asm_x64_t *as, int src_r64, int dest_r64) {
-    assert(src_r64 < 8);
-    assert(dest_r64 < 8);
-    asm_x64_write_byte_3(as, REX_PREFIX | REX_W, OPCODE_ADD_R64_TO_RM64, MODRM_R64(src_r64) | MODRM_RM_REG | MODRM_RM_R64(dest_r64));
+void asm_x64_shl_r64_cl(asm_x64_t* as, int dest_r64) {
+    asm_x64_generic_r64_r64(as, dest_r64, 4, OPCODE_SHL_RM64_CL);
 }
 
-/*
-void asm_x64_sub_r32_from_r32(asm_x64_t *as, int src_r32, int dest_r32) {
-    // defaults to 32 bit operation
-    asm_x64_write_byte_2(as, OPCODE_SUB_R64_FROM_RM64, MODRM_R64(src_r32) | MODRM_RM_REG | MODRM_RM_R64(dest_r32));
+void asm_x64_sar_r64_cl(asm_x64_t* as, int dest_r64) {
+    asm_x64_generic_r64_r64(as, dest_r64, 7, OPCODE_SAR_RM64_CL);
 }
-*/
 
-void asm_x64_sub_r64_from_r64(asm_x64_t *as, int src_r64, int dest_r64) {
-    // use REX prefix for 64 bit operation
-    assert(src_r64 < 8);
-    assert(dest_r64 < 8);
-    asm_x64_write_byte_3(as, REX_PREFIX | REX_W, OPCODE_SUB_R64_FROM_RM64, MODRM_R64(src_r64) | MODRM_RM_REG | MODRM_RM_R64(dest_r64));
+void asm_x64_add_r64_r64(asm_x64_t *as, int dest_r64, int src_r64) {
+    asm_x64_generic_r64_r64(as, dest_r64, src_r64, OPCODE_ADD_R64_TO_RM64);
+}
+
+void asm_x64_sub_r64_r64(asm_x64_t *as, int dest_r64, int src_r64) {
+    asm_x64_generic_r64_r64(as, dest_r64, src_r64, OPCODE_SUB_R64_FROM_RM64);
 }
 
 /*
@@ -417,7 +416,7 @@ void asm_x64_sub_i32_from_r32(asm_x64_t *as, int src_i32, int dest_r32) {
 }
 */
 
-void asm_x64_sub_i32_from_r64(asm_x64_t *as, int src_i32, int dest_r64) {
+STATIC void asm_x64_sub_r64_i32(asm_x64_t *as, int dest_r64, int src_i32) {
     assert(dest_r64 < 8);
     if (SIGNED_FIT8(src_i32)) {
         // use REX prefix for 64 bit operation
@@ -448,9 +447,7 @@ void asm_x64_sar_r32_by_imm(asm_x64_t *as, int r32, int imm) {
 */
 
 void asm_x64_cmp_r64_with_r64(asm_x64_t *as, int src_r64_a, int src_r64_b) {
-    assert(src_r64_a < 8);
-    assert(src_r64_b < 8);
-    asm_x64_write_byte_3(as, REX_PREFIX | REX_W, OPCODE_CMP_R64_WITH_RM64, MODRM_R64(src_r64_a) | MODRM_RM_REG | MODRM_RM_R64(src_r64_b));
+    asm_x64_generic_r64_r64(as, src_r64_b, src_r64_a, OPCODE_CMP_R64_WITH_RM64);
 }
 
 /*
@@ -541,12 +538,12 @@ void asm_x64_jcc_label(asm_x64_t *as, int jcc_type, int label) {
 
 void asm_x64_entry(asm_x64_t *as, int num_locals) {
     asm_x64_push_r64(as, ASM_X64_REG_RBP);
-    asm_x64_mov_r64_to_r64(as, ASM_X64_REG_RSP, ASM_X64_REG_RBP);
+    asm_x64_mov_r64_r64(as, ASM_X64_REG_RBP, ASM_X64_REG_RSP);
     if (num_locals < 0) {
         num_locals = 0;
     }
     num_locals |= 1; // make it odd so stack is aligned on 16 byte boundary
-    asm_x64_sub_i32_from_r64(as, num_locals * WORD_SIZE, ASM_X64_REG_RSP);
+    asm_x64_sub_r64_i32(as, ASM_X64_REG_RSP, num_locals * WORD_SIZE);
     asm_x64_push_r64(as, ASM_X64_REG_RBX);
     asm_x64_push_r64(as, ASM_X64_REG_R12);
     asm_x64_push_r64(as, ASM_X64_REG_R13);
@@ -587,7 +584,7 @@ void asm_x64_mov_r64_to_local(asm_x64_t *as, int src_r64, int dest_local_num) {
 void asm_x64_mov_local_addr_to_r64(asm_x64_t *as, int local_num, int dest_r64) {
     int offset = asm_x64_local_offset_from_ebp(as, local_num);
     if (offset == 0) {
-        asm_x64_mov_r64_to_r64(as, ASM_X64_REG_RBP, dest_r64);
+        asm_x64_mov_r64_r64(as, dest_r64, ASM_X64_REG_RBP);
     } else {
         asm_x64_lea_disp_to_r64(as, ASM_X64_REG_RBP, offset, dest_r64);
     }
@@ -600,7 +597,7 @@ void asm_x64_push_local(asm_x64_t *as, int local_num) {
 
 void asm_x64_push_local_addr(asm_x64_t *as, int local_num, int temp_r64)
 {
-    asm_x64_mov_r64_to_r64(as, ASM_X64_REG_RBP, temp_r64);
+    asm_x64_mov_r64_r64(as, temp_r64, ASM_X64_REG_RBP);
     asm_x64_add_i32_to_r32(as, asm_x64_local_offset_from_ebp(as, local_num), temp_r64);
     asm_x64_push_r64(as, temp_r64);
 }
@@ -614,7 +611,7 @@ void asm_x64_call(asm_x64_t *as, void* func)
     asm_x64_sub_i32_from_r32(as, 8, ASM_X64_REG_RSP);
     asm_x64_write_byte_1(as, OPCODE_CALL_REL32);
     asm_x64_write_word32(as, func - (void*)(as->code_cur + 4));
-    asm_x64_mov_r64_to_r64(as, ASM_X64_REG_RBP, ASM_X64_REG_RSP);
+    asm_x64_mov_r64_r64(as, ASM_X64_REG_RSP, ASM_X64_REG_RBP);
 }
 
 void asm_x64_call_i1(asm_x64_t *as, void* func, int i1)
@@ -625,7 +622,7 @@ void asm_x64_call_i1(asm_x64_t *as, void* func, int i1)
     asm_x64_write_byte_1(as, OPCODE_CALL_REL32);
     asm_x64_write_word32(as, func - (void*)(as->code_cur + 4));
     asm_x64_add_i32_to_r32(as, 16, ASM_X64_REG_RSP);
-    asm_x64_mov_r64_to_r64(as, ASM_X64_REG_RBP, ASM_X64_REG_RSP);
+    asm_x64_mov_r64_r64(as, ASM_X64_REG_RSP, ASM_X64_REG_RBP);
 }
 */
 
diff --git a/py/asmx64.h b/py/asmx64.h
index 3b138a753..0d3f58ecd 100644
--- a/py/asmx64.h
+++ b/py/asmx64.h
@@ -31,6 +31,11 @@
 //  - RAX, RCX, RDX, RSI, RDI, R08, R09, R10, R11 are caller-save
 //  - RBX, RBP, R12, R13, R14, R15 are callee-save
 
+// In the functions below, argument order follows x86 docs and generally
+// the destination is the first argument.
+// NOTE: this is a change from the old convention used in this file and
+// some functions still use the old (reverse) convention.
+
 #define ASM_X64_PASS_COMPUTE (1)
 #define ASM_X64_PASS_EMIT    (2)
 
@@ -58,6 +63,8 @@
 #define ASM_X64_CC_JNZ (0x5)
 #define ASM_X64_CC_JNE (0x5)
 #define ASM_X64_CC_JL  (0xc) // less, signed
+#define ASM_X64_CC_JGE (0xd) // greater or equal, signed
+#define ASM_X64_CC_JLE (0xe) // less or equal, signed
 #define ASM_X64_CC_JG  (0xf) // greater, signed
 
 typedef struct _asm_x64_t asm_x64_t;
@@ -72,15 +79,18 @@ void* asm_x64_get_code(asm_x64_t* as);
 void asm_x64_nop(asm_x64_t* as);
 void asm_x64_push_r64(asm_x64_t* as, int src_r64);
 void asm_x64_pop_r64(asm_x64_t* as, int dest_r64);
-void asm_x64_mov_r64_to_r64(asm_x64_t* as, int src_r64, int dest_r64);
+void asm_x64_mov_r64_r64(asm_x64_t* as, int dest_r64, int src_r64);
 void asm_x64_mov_i64_to_r64(asm_x64_t* as, int64_t src_i64, int dest_r64);
 void asm_x64_mov_i64_to_r64_optimised(asm_x64_t *as, int64_t src_i64, int dest_r64);
 void asm_x64_mov_i64_to_r64_aligned(asm_x64_t *as, int64_t src_i64, int dest_r64);
 void asm_x64_mov_r8_to_disp(asm_x64_t *as, int src_r64, int dest_r64, int dest_disp);
 void asm_x64_mov_r16_to_disp(asm_x64_t *as, int src_r64, int dest_r64, int dest_disp);
 void asm_x64_mov_r64_to_disp(asm_x64_t *as, int src_r64, int dest_r64, int dest_disp);
-void asm_x64_xor_r64_to_r64(asm_x64_t *as, int src_r64, int dest_r64);
-void asm_x64_add_r64_to_r64(asm_x64_t* as, int src_r64, int dest_r64);
+void asm_x64_xor_r64_r64(asm_x64_t *as, int dest_r64, int src_r64);
+void asm_x64_shl_r64_cl(asm_x64_t* as, int dest_r64);
+void asm_x64_sar_r64_cl(asm_x64_t* as, int dest_r64);
+void asm_x64_add_r64_r64(asm_x64_t* as, int dest_r64, int src_r64);
+void asm_x64_sub_r64_r64(asm_x64_t* as, int dest_r64, int src_r64);
 void asm_x64_cmp_r64_with_r64(asm_x64_t* as, int src_r64_a, int src_r64_b);
 void asm_x64_test_r8_with_r8(asm_x64_t* as, int src_r64_a, int src_r64_b);
 void asm_x64_setcc_r8(asm_x64_t* as, int jcc_type, int dest_r8);
diff --git a/py/asmx86.c b/py/asmx86.c
index 08299f851..072998c67 100644
--- a/py/asmx86.c
+++ b/py/asmx86.c
@@ -57,12 +57,14 @@
 #define OPCODE_ADD_R32_TO_RM32   (0x01)
 #define OPCODE_ADD_I32_TO_RM32   (0x81) /* /0 */
 #define OPCODE_ADD_I8_TO_RM32    (0x83) /* /0 */
-//#define OPCODE_SUB_R32_FROM_RM32 (0x29)
+#define OPCODE_SUB_R32_FROM_RM32 (0x29)
 #define OPCODE_SUB_I32_FROM_RM32 (0x81) /* /5 */
 #define OPCODE_SUB_I8_FROM_RM32  (0x83) /* /5 */
 //#define OPCODE_SHL_RM32_BY_I8    (0xc1) /* /4 */
 //#define OPCODE_SHR_RM32_BY_I8    (0xc1) /* /5 */
 //#define OPCODE_SAR_RM32_BY_I8    (0xc1) /* /7 */
+#define OPCODE_SHL_RM32_CL       (0xd3) /* /4 */
+#define OPCODE_SAR_RM32_CL       (0xd3) /* /7 */
 //#define OPCODE_CMP_I32_WITH_RM32 (0x81) /* /7 */
 //#define OPCODE_CMP_I8_WITH_RM32  (0x83) /* /7 */
 #define OPCODE_CMP_R32_WITH_RM32 (0x39)
@@ -204,6 +206,10 @@ STATIC void asm_x86_write_r32_disp(asm_x86_t *as, int r32, int disp_r32, int dis
     }
 }
 
+STATIC void asm_x86_generic_r32_r32(asm_x86_t *as, int dest_r32, int src_r32, int op) {
+    asm_x86_write_byte_2(as, op, MODRM_R32(src_r32) | MODRM_RM_REG | MODRM_RM_R32(dest_r32));
+}
+
 STATIC void asm_x86_nop(asm_x86_t *as) {
     asm_x86_write_byte_1(as, OPCODE_NOP);
 }
@@ -232,8 +238,8 @@ STATIC void asm_x86_ret(asm_x86_t *as) {
     asm_x86_write_byte_1(as, OPCODE_RET);
 }
 
-void asm_x86_mov_r32_to_r32(asm_x86_t *as, int src_r32, int dest_r32) {
-    asm_x86_write_byte_2(as, OPCODE_MOV_R32_TO_RM32, MODRM_R32(src_r32) | MODRM_RM_REG | MODRM_RM_R32(dest_r32));
+void asm_x86_mov_r32_r32(asm_x86_t *as, int dest_r32, int src_r32) {
+    asm_x86_generic_r32_r32(as, dest_r32, src_r32, OPCODE_MOV_R32_TO_RM32);
 }
 
 void asm_x86_mov_r8_to_disp(asm_x86_t *as, int src_r32, int dest_r32, int dest_disp) {
@@ -281,12 +287,20 @@ void asm_x86_mov_i32_to_r32_aligned(asm_x86_t *as, int32_t src_i32, int dest_r32
     asm_x86_mov_i32_to_r32(as, src_i32, dest_r32);
 }
 
-void asm_x86_xor_r32_to_r32(asm_x86_t *as, int src_r32, int dest_r32) {
-    asm_x86_write_byte_2(as, OPCODE_XOR_R32_TO_RM32, MODRM_R32(src_r32) | MODRM_RM_REG | MODRM_RM_R32(dest_r32));
+void asm_x86_xor_r32_r32(asm_x86_t *as, int dest_r32, int src_r32) {
+    asm_x86_generic_r32_r32(as, dest_r32, src_r32, OPCODE_XOR_R32_TO_RM32);
+}
+
+void asm_x86_shl_r32_cl(asm_x86_t* as, int dest_r32) {
+    asm_x86_generic_r32_r32(as, dest_r32, 4, OPCODE_SHL_RM32_CL);
 }
 
-void asm_x86_add_r32_to_r32(asm_x86_t *as, int src_r32, int dest_r32) {
-    asm_x86_write_byte_2(as, OPCODE_ADD_R32_TO_RM32, MODRM_R32(src_r32) | MODRM_RM_REG | MODRM_RM_R32(dest_r32));
+void asm_x86_sar_r32_cl(asm_x86_t* as, int dest_r32) {
+    asm_x86_generic_r32_r32(as, dest_r32, 7, OPCODE_SAR_RM32_CL);
+}
+
+void asm_x86_add_r32_r32(asm_x86_t *as, int dest_r32, int src_r32) {
+    asm_x86_generic_r32_r32(as, dest_r32, src_r32, OPCODE_ADD_R32_TO_RM32);
 }
 
 void asm_x86_add_i32_to_r32(asm_x86_t *as, int src_i32, int dest_r32) {
@@ -299,13 +313,11 @@ void asm_x86_add_i32_to_r32(asm_x86_t *as, int src_i32, int dest_r32) {
     }
 }
 
-#if 0
-void asm_x86_sub_r32_from_r32(asm_x86_t *as, int src_r32, int dest_r32) {
-    asm_x86_write_byte_2(as, OPCODE_SUB_R32_FROM_RM32, MODRM_R32(src_r32) | MODRM_RM_REG | MODRM_RM_R32(dest_r32));
+void asm_x86_sub_r32_r32(asm_x86_t *as, int dest_r32, int src_r32) {
+    asm_x86_generic_r32_r32(as, dest_r32, src_r32, OPCODE_SUB_R32_FROM_RM32);
 }
-#endif
 
-void asm_x86_sub_i32_from_r32(asm_x86_t *as, int src_i32, int dest_r32) {
+STATIC void asm_x86_sub_r32_i32(asm_x86_t *as, int dest_r32, int src_i32) {
     if (SIGNED_FIT8(src_i32)) {
         // defaults to 32 bit operation
         asm_x86_write_byte_2(as, OPCODE_SUB_I8_FROM_RM32, MODRM_R32(5) | MODRM_RM_REG | MODRM_RM_R32(dest_r32));
@@ -426,9 +438,9 @@ void asm_x86_jcc_label(asm_x86_t *as, mp_uint_t jcc_type, mp_uint_t label) {
 
 void asm_x86_entry(asm_x86_t *as, mp_uint_t num_locals) {
     asm_x86_push_r32(as, ASM_X86_REG_EBP);
-    asm_x86_mov_r32_to_r32(as, ASM_X86_REG_ESP, ASM_X86_REG_EBP);
+    asm_x86_mov_r32_r32(as, ASM_X86_REG_EBP, ASM_X86_REG_ESP);
     if (num_locals > 0) {
-        asm_x86_sub_i32_from_r32(as, num_locals * WORD_SIZE, ASM_X86_REG_ESP);
+        asm_x86_sub_r32_i32(as, ASM_X86_REG_ESP, num_locals * WORD_SIZE);
     }
     asm_x86_push_r32(as, ASM_X86_REG_EBX);
     asm_x86_push_r32(as, ASM_X86_REG_ESI);
@@ -487,7 +499,7 @@ void asm_x86_mov_r32_to_local(asm_x86_t *as, int src_r32, int dest_local_num) {
 void asm_x86_mov_local_addr_to_r32(asm_x86_t *as, int local_num, int dest_r32) {
     int offset = asm_x86_local_offset_from_ebp(as, local_num);
     if (offset == 0) {
-        asm_x86_mov_r32_to_r32(as, ASM_X86_REG_EBP, dest_r32);
+        asm_x86_mov_r32_r32(as, dest_r32, ASM_X86_REG_EBP);
     } else {
         asm_x86_lea_disp_to_r32(as, ASM_X86_REG_EBP, offset, dest_r32);
     }
@@ -500,7 +512,7 @@ void asm_x86_push_local(asm_x86_t *as, int local_num) {
 
 void asm_x86_push_local_addr(asm_x86_t *as, int local_num, int temp_r32)
 {
-    asm_x86_mov_r32_to_r32(as, ASM_X86_REG_EBP, temp_r32);
+    asm_x86_mov_r32_r32(as, temp_r32, ASM_X86_REG_EBP);
     asm_x86_add_i32_to_r32(as, asm_x86_local_offset_from_ebp(as, local_num), temp_r32);
     asm_x86_push_r32(as, temp_r32);
 }
diff --git a/py/asmx86.h b/py/asmx86.h
index 0ee192378..2d83f3a65 100644
--- a/py/asmx86.h
+++ b/py/asmx86.h
@@ -32,6 +32,11 @@
 //  - EAX, ECX, EDX are caller-save
 //  - EBX, ESI, EDI, EBP, ESP, EIP are callee-save
 
+// In the functions below, argument order follows x86 docs and generally
+// the destination is the first argument.
+// NOTE: this is a change from the old convention used in this file and
+// some functions still use the old (reverse) convention.
+
 #define ASM_X86_PASS_COMPUTE (1)
 #define ASM_X86_PASS_EMIT    (2)
 
@@ -59,6 +64,8 @@
 #define ASM_X86_CC_JNZ (0x5)
 #define ASM_X86_CC_JNE (0x5)
 #define ASM_X86_CC_JL  (0xc) // less, signed
+#define ASM_X86_CC_JGE (0xd) // greater or equal, signed
+#define ASM_X86_CC_JLE (0xe) // less or equal, signed
 #define ASM_X86_CC_JG  (0xf) // greater, signed
 
 typedef struct _asm_x86_t asm_x86_t;
@@ -70,14 +77,17 @@ void asm_x86_end_pass(asm_x86_t *as);
 mp_uint_t asm_x86_get_code_size(asm_x86_t* as);
 void* asm_x86_get_code(asm_x86_t* as);
 
-void asm_x86_mov_r32_to_r32(asm_x86_t* as, int src_r32, int dest_r32);
+void asm_x86_mov_r32_r32(asm_x86_t* as, int dest_r32, int src_r32);
 void asm_x86_mov_i32_to_r32(asm_x86_t *as, int32_t src_i32, int dest_r32);
 void asm_x86_mov_i32_to_r32_aligned(asm_x86_t *as, int32_t src_i32, int dest_r32);
 void asm_x86_mov_r8_to_disp(asm_x86_t *as, int src_r32, int dest_r32, int dest_disp);
 void asm_x86_mov_r16_to_disp(asm_x86_t *as, int src_r32, int dest_r32, int dest_disp);
 void asm_x86_mov_r32_to_disp(asm_x86_t *as, int src_r32, int dest_r32, int dest_disp);
-void asm_x86_xor_r32_to_r32(asm_x86_t *as, int src_r32, int dest_r32);
-void asm_x86_add_r32_to_r32(asm_x86_t* as, int src_r32, int dest_r32);
+void asm_x86_xor_r32_r32(asm_x86_t *as, int dest_r32, int src_r32);
+void asm_x86_shl_r32_cl(asm_x86_t* as, int dest_r32);
+void asm_x86_sar_r32_cl(asm_x86_t* as, int dest_r32);
+void asm_x86_add_r32_r32(asm_x86_t* as, int dest_r32, int src_r32);
+void asm_x86_sub_r32_r32(asm_x86_t* as, int dest_r32, int src_r32);
 void asm_x86_cmp_r32_with_r32(asm_x86_t* as, int src_r32_a, int src_r32_b);
 void asm_x86_test_r8_with_r8(asm_x86_t* as, int src_r32_a, int src_r32_b);
 void asm_x86_setcc_r8(asm_x86_t* as, mp_uint_t jcc_type, int dest_r8);
diff --git a/py/emitnative.c b/py/emitnative.c
index cfcba27a1..d58da7e78 100644
--- a/py/emitnative.c
+++ b/py/emitnative.c
@@ -140,9 +140,14 @@
         asm_x64_mov_r64_to_local(as, (reg_temp), (local_num)); \
     } while (false)
 #define ASM_MOV_LOCAL_TO_REG        asm_x64_mov_local_to_r64
-#define ASM_MOV_REG_TO_REG          asm_x64_mov_r64_to_r64
+#define ASM_MOV_REG_REG(as, reg_dest, reg_src) asm_x64_mov_r64_r64((as), (reg_dest), (reg_src))
 #define ASM_MOV_LOCAL_ADDR_TO_REG   asm_x64_mov_local_addr_to_r64
 
+#define ASM_LSL_REG(as, reg) asm_x64_shl_r64_cl((as), (reg))
+#define ASM_ASR_REG(as, reg) asm_x64_sar_r64_cl((as), (reg))
+#define ASM_ADD_REG_REG(as, reg_dest, reg_src) asm_x64_add_r64_r64((as), (reg_dest), (reg_src))
+#define ASM_SUB_REG_REG(as, reg_dest, reg_src) asm_x64_sub_r64_r64((as), (reg_dest), (reg_src))
+
 #elif N_X86
 
 // x86 specific stuff
@@ -256,9 +261,14 @@ STATIC byte mp_f_n_args[MP_F_NUMBER_OF] = {
         asm_x86_mov_r32_to_local(as, (reg_temp), (local_num)); \
     } while (false)
 #define ASM_MOV_LOCAL_TO_REG        asm_x86_mov_local_to_r32
-#define ASM_MOV_REG_TO_REG          asm_x86_mov_r32_to_r32
+#define ASM_MOV_REG_REG(as, reg_dest, reg_src) asm_x86_mov_r32_r32((as), (reg_dest), (reg_src))
 #define ASM_MOV_LOCAL_ADDR_TO_REG   asm_x86_mov_local_addr_to_r32
 
+#define ASM_LSL_REG(as, reg) asm_x86_shl_r32_cl((as), (reg))
+#define ASM_ASR_REG(as, reg) asm_x86_sar_r32_cl((as), (reg))
+#define ASM_ADD_REG_REG(as, reg_dest, reg_src) asm_x86_add_r32_r32((as), (reg_dest), (reg_src))
+#define ASM_SUB_REG_REG(as, reg_dest, reg_src) asm_x86_sub_r32_r32((as), (reg_dest), (reg_src))
+
 #elif N_THUMB
 
 // thumb specific stuff
@@ -323,9 +333,14 @@ STATIC byte mp_f_n_args[MP_F_NUMBER_OF] = {
         asm_thumb_mov_local_reg(as, (local_num), (reg_temp)); \
     } while (false)
 #define ASM_MOV_LOCAL_TO_REG(as, local_num, reg) asm_thumb_mov_reg_local(as, (reg), (local_num))
-#define ASM_MOV_REG_TO_REG(as, reg_src, reg_dest) asm_thumb_mov_reg_reg(as, (reg_dest), (reg_src))
+#define ASM_MOV_REG_REG(as, reg_dest, reg_src) asm_thumb_mov_reg_reg((as), (reg_dest), (reg_src))
 #define ASM_MOV_LOCAL_ADDR_TO_REG(as, local_num, reg) asm_thumb_mov_reg_local_addr(as, (reg), (local_num))
 
+#define ASM_LSL_REG_REG(as, reg_dest, reg_shift) asm_thumb_format_4((as), ASM_THUMB_FORMAT_4_LSL, (reg_dest), (reg_shift))
+#define ASM_ASR_REG_REG(as, reg_dest, reg_shift) asm_thumb_format_4((as), ASM_THUMB_FORMAT_4_ASR, (reg_dest), (reg_shift))
+#define ASM_ADD_REG_REG(as, reg_dest, reg_src) asm_thumb_add_rlo_rlo_rlo((as), (reg_dest), (reg_dest), (reg_src))
+#define ASM_SUB_REG_REG(as, reg_dest, reg_src) asm_thumb_sub_rlo_rlo_rlo((as), (reg_dest), (reg_dest), (reg_src))
+
 #elif N_ARM
 
 // ARM specific stuff
@@ -390,9 +405,15 @@ STATIC byte mp_f_n_args[MP_F_NUMBER_OF] = {
         asm_arm_mov_local_reg(as, (local_num), (reg_temp)); \
     } while (false)
 #define ASM_MOV_LOCAL_TO_REG(as, local_num, reg) asm_arm_mov_reg_local(as, (reg), (local_num))
-#define ASM_MOV_REG_TO_REG(as, reg_src, reg_dest) asm_arm_mov_reg_reg(as, (reg_dest), (reg_src))
+#define ASM_MOV_REG_REG(as, reg_dest, reg_src) asm_arm_mov_reg_reg((as), (reg_dest), (reg_src))
 #define ASM_MOV_LOCAL_ADDR_TO_REG(as, local_num, reg) asm_arm_mov_reg_local_addr(as, (reg), (local_num))
 
+// TODO someone please implement lsl and asr
+#define ASM_LSL_REG_REG(as, reg_dest, reg_shift) asm_arm_lsl_((as), (reg_dest), (reg_shift))
+#define ASM_ASR_REG_REG(as, reg_dest, reg_shift) asm_arm_asr_((as), (reg_dest), (reg_shift))
+#define ASM_ADD_REG_REG(as, reg_dest, reg_src) asm_arm_add_reg_reg_reg((as), (reg_dest), (reg_dest), (reg_src))
+#define ASM_SUB_REG_REG(as, reg_dest, reg_src) asm_arm_sub_reg_reg_reg((as), (reg_dest), (reg_dest), (reg_src))
+
 #else
 
 #error unknown native emitter
@@ -544,11 +565,11 @@ STATIC void emit_native_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scop
 #if N_X64
     for (int i = 0; i < scope->num_pos_args; i++) {
         if (i == 0) {
-            asm_x64_mov_r64_to_r64(emit->as, REG_ARG_1, REG_LOCAL_1);
+            ASM_MOV_REG_REG(emit->as, REG_LOCAL_1, REG_ARG_1);
         } else if (i == 1) {
-            asm_x64_mov_r64_to_r64(emit->as, REG_ARG_2, REG_LOCAL_2);
+            ASM_MOV_REG_REG(emit->as, REG_LOCAL_2, REG_ARG_2);
         } else if (i == 2) {
-            asm_x64_mov_r64_to_r64(emit->as, REG_ARG_3, REG_LOCAL_3);
+            ASM_MOV_REG_REG(emit->as, REG_LOCAL_3, REG_ARG_3);
         } else if (i == 3) {
             asm_x64_mov_r64_to_local(emit->as, REG_ARG_4, i - REG_LOCAL_NUM);
         } else {
@@ -572,11 +593,11 @@ STATIC void emit_native_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scop
 #elif N_THUMB
     for (int i = 0; i < scope->num_pos_args; i++) {
         if (i == 0) {
-            asm_thumb_mov_reg_reg(emit->as, REG_LOCAL_1, REG_ARG_1);
+            ASM_MOV_REG_REG(emit->as, REG_LOCAL_1, REG_ARG_1);
         } else if (i == 1) {
-            asm_thumb_mov_reg_reg(emit->as, REG_LOCAL_2, REG_ARG_2);
+            ASM_MOV_REG_REG(emit->as, REG_LOCAL_2, REG_ARG_2);
         } else if (i == 2) {
-            asm_thumb_mov_reg_reg(emit->as, REG_LOCAL_3, REG_ARG_3);
+            ASM_MOV_REG_REG(emit->as, REG_LOCAL_3, REG_ARG_3);
         } else if (i == 3) {
             asm_thumb_mov_local_reg(emit->as, i - REG_LOCAL_NUM, REG_ARG_4);
         } else {
@@ -589,11 +610,11 @@ STATIC void emit_native_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scop
 #elif N_ARM
     for (int i = 0; i < scope->num_pos_args; i++) {
         if (i == 0) {
-            asm_arm_mov_reg_reg(emit->as, REG_LOCAL_1, REG_ARG_1);
+            ASM_MOV_REG_REG(emit->as, REG_LOCAL_1, REG_ARG_1);
         } else if (i == 1) {
-            asm_arm_mov_reg_reg(emit->as, REG_LOCAL_2, REG_ARG_2);
+            ASM_MOV_REG_REG(emit->as, REG_LOCAL_2, REG_ARG_2);
         } else if (i == 2) {
-            asm_arm_mov_reg_reg(emit->as, REG_LOCAL_3, REG_ARG_3);
+            ASM_MOV_REG_REG(emit->as, REG_LOCAL_3, REG_ARG_3);
         } else if (i == 3) {
             asm_arm_mov_local_reg(emit->as, i - REG_LOCAL_NUM, REG_ARG_4);
         } else {
@@ -698,8 +719,14 @@ STATIC void emit_native_pre(emit_t *emit) {
     */
 }
 
-STATIC vtype_kind_t peek_vtype(emit_t *emit) {
-    return emit->stack_info[emit->stack_size - 1].vtype;
+// depth==0 is top, depth==1 is before top, etc
+STATIC stack_info_t *peek_stack(emit_t *emit, mp_uint_t depth) {
+    return &emit->stack_info[emit->stack_size - 1 - depth];
+}
+
+// depth==0 is top, depth==1 is before top, etc
+STATIC vtype_kind_t peek_vtype(emit_t *emit, mp_uint_t depth) {
+    return peek_stack(emit, depth)->vtype;
 }
 
 // pos=1 is TOS, pos=2 is next, etc
@@ -759,7 +786,7 @@ STATIC void emit_access_stack(emit_t *emit, int pos, vtype_kind_t *vtype, int re
 
         case STACK_REG:
             if (si->u_reg != reg_dest) {
-                ASM_MOV_REG_TO_REG(emit->as, si->u_reg, reg_dest);
+                ASM_MOV_REG_REG(emit->as, reg_dest, si->u_reg);
             }
             break;
 
@@ -769,6 +796,21 @@ STATIC void emit_access_stack(emit_t *emit, int pos, vtype_kind_t *vtype, int re
     }
 }
 
+// If stacked value is in a register, then *reg_dest is set to that register.
+// Otherwise, the value is put in *reg_dest.
+STATIC void emit_pre_pop_reg_flexible(emit_t *emit, vtype_kind_t *vtype, int *reg_dest) {
+    emit->last_emit_was_return_value = false;
+    stack_info_t *si = peek_stack(emit, 0);
+    if (si->kind == STACK_REG) {
+        *vtype = si->vtype;
+        *reg_dest = si->u_reg;
+        need_reg_single(emit, *reg_dest, 1);
+    } else {
+        emit_access_stack(emit, 1, vtype, *reg_dest);
+    }
+    adjust_stack(emit, -1);
+}
+
 STATIC void emit_pre_pop_discard(emit_t *emit) {
     emit->last_emit_was_return_value = false;
     adjust_stack(emit, -1);
@@ -1250,13 +1292,13 @@ STATIC void emit_native_store_name(emit_t *emit, qstr qst) {
 }
 
 STATIC void emit_native_store_global(emit_t *emit, qstr qst) {
-    vtype_kind_t vtype = peek_vtype(emit);
+    vtype_kind_t vtype = peek_vtype(emit, 0);
     if (vtype == VTYPE_PYOBJ) {
         emit_pre_pop_reg(emit, &vtype, REG_ARG_2);
     } else {
         emit_pre_pop_reg(emit, &vtype, REG_ARG_1);
         emit_call_with_imm_arg(emit, MP_F_CONVERT_NATIVE_TO_OBJ, vtype, REG_ARG_2); // arg2 = type
-        ASM_MOV_REG_TO_REG(emit->as, REG_RET, REG_ARG_2);
+        ASM_MOV_REG_REG(emit->as, REG_ARG_2, REG_RET);
     }
     emit_call_with_imm_arg(emit, MP_F_STORE_GLOBAL, qst, REG_ARG_1); // arg1 = name
     emit_post(emit);
@@ -1364,7 +1406,7 @@ STATIC void emit_native_jump(emit_t *emit, mp_uint_t label) {
 }
 
 STATIC void emit_native_jump_helper(emit_t *emit, mp_uint_t label, bool pop) {
-    vtype_kind_t vtype = peek_vtype(emit);
+    vtype_kind_t vtype = peek_vtype(emit, 0);
     switch (vtype) {
         case VTYPE_PYOBJ:
             emit_pre_pop_reg(emit, &vtype, REG_ARG_1);
@@ -1507,7 +1549,7 @@ STATIC void emit_native_unary_op(emit_t *emit, mp_unary_op_t op) {
     if (op == MP_UNARY_OP_NOT) {
         // we need to synthesise this operation by converting to bool first
         emit_call_with_imm_arg(emit, MP_F_UNARY_OP, MP_UNARY_OP_BOOL, REG_ARG_1);
-        ASM_MOV_REG_TO_REG(emit->as, REG_RET, REG_ARG_2);
+        ASM_MOV_REG_REG(emit->as, REG_ARG_2, REG_RET);
     }
     emit_call_with_imm_arg(emit, MP_F_UNARY_OP, op, REG_ARG_1);
     emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET);
@@ -1515,47 +1557,108 @@ STATIC void emit_native_unary_op(emit_t *emit, mp_unary_op_t op) {
 
 STATIC void emit_native_binary_op(emit_t *emit, mp_binary_op_t op) {
     DEBUG_printf("binary_op(" UINT_FMT ")\n", op);
-    vtype_kind_t vtype_lhs, vtype_rhs;
-    emit_pre_pop_reg_reg(emit, &vtype_rhs, REG_ARG_3, &vtype_lhs, REG_ARG_2);
+    vtype_kind_t vtype_lhs = peek_vtype(emit, 1);
+    vtype_kind_t vtype_rhs = peek_vtype(emit, 0);
     if (vtype_lhs == VTYPE_INT && vtype_rhs == VTYPE_INT) {
-        if (op == MP_BINARY_OP_ADD || op == MP_BINARY_OP_INPLACE_ADD) {
-#if N_X64
-            asm_x64_add_r64_to_r64(emit->as, REG_ARG_3, REG_ARG_2);
-#elif N_X86
-            asm_x86_add_r32_to_r32(emit->as, REG_ARG_3, REG_ARG_2);
-#elif N_THUMB
-            asm_thumb_add_rlo_rlo_rlo(emit->as, REG_ARG_2, REG_ARG_2, REG_ARG_3);
-#elif N_ARM
-            asm_arm_add_reg(emit->as, REG_ARG_2, REG_ARG_2, REG_ARG_3);
-#else
-    #error not implemented
-#endif
+        #if N_X64 || N_X86
+        // special cases for x86 and shifting
+        if (op == MP_BINARY_OP_LSHIFT
+            || op == MP_BINARY_OP_INPLACE_LSHIFT
+            || op == MP_BINARY_OP_RSHIFT
+            || op == MP_BINARY_OP_INPLACE_RSHIFT) {
+            #if N_X64
+            emit_pre_pop_reg_reg(emit, &vtype_rhs, ASM_X64_REG_RCX, &vtype_lhs, REG_RET);
+            #else
+            emit_pre_pop_reg_reg(emit, &vtype_rhs, ASM_X86_REG_ECX, &vtype_lhs, REG_RET);
+            #endif
+            if (op == MP_BINARY_OP_LSHIFT || op == MP_BINARY_OP_INPLACE_LSHIFT) {
+                ASM_LSL_REG(emit->as, REG_RET);
+            } else {
+                ASM_ASR_REG(emit->as, REG_RET);
+            }
+            emit_post_push_reg(emit, VTYPE_INT, REG_RET);
+            return;
+        }
+        #endif
+        int reg_rhs = REG_ARG_3;
+        emit_pre_pop_reg_flexible(emit, &vtype_rhs, &reg_rhs);
+        emit_pre_pop_reg(emit, &vtype_lhs, REG_ARG_2);
+        if (0) {
+            // dummy
+        #if !(N_X64 || N_X86)
+        } else if (op == MP_BINARY_OP_LSHIFT || op == MP_BINARY_OP_INPLACE_LSHIFT) {
+            ASM_LSL_REG_REG(emit->as, REG_ARG_2, reg_rhs);
             emit_post_push_reg(emit, VTYPE_INT, REG_ARG_2);
-        } else if (op == MP_BINARY_OP_LESS) {
-#if N_X64
-            asm_x64_xor_r64_to_r64(emit->as, REG_RET, REG_RET);
-            asm_x64_cmp_r64_with_r64(emit->as, REG_ARG_3, REG_ARG_2);
-            asm_x64_setcc_r8(emit->as, ASM_X64_CC_JL, REG_RET);
-#elif N_X86
-            asm_x86_xor_r32_to_r32(emit->as, REG_RET, REG_RET);
-            asm_x86_cmp_r32_with_r32(emit->as, REG_ARG_3, REG_ARG_2);
-            asm_x86_setcc_r8(emit->as, ASM_X86_CC_JL, REG_RET);
-#elif N_THUMB
-            asm_thumb_cmp_rlo_rlo(emit->as, REG_ARG_2, REG_ARG_3);
-            asm_thumb_op16(emit->as, ASM_THUMB_OP_ITE_GE);
-            asm_thumb_mov_rlo_i8(emit->as, REG_RET, 0); // if r0 >= r1
-            asm_thumb_mov_rlo_i8(emit->as, REG_RET, 1); // if r0 < r1
-#elif N_ARM
-            asm_arm_less_op(emit->as, REG_RET, REG_ARG_2, REG_ARG_3);
-#else
-    #error not implemented
-#endif
+        } else if (op == MP_BINARY_OP_RSHIFT || op == MP_BINARY_OP_INPLACE_RSHIFT) {
+            ASM_ASR_REG_REG(emit->as, REG_ARG_2, reg_rhs);
+            emit_post_push_reg(emit, VTYPE_INT, REG_ARG_2);
+        #endif
+        } else if (op == MP_BINARY_OP_ADD || op == MP_BINARY_OP_INPLACE_ADD) {
+            ASM_ADD_REG_REG(emit->as, REG_ARG_2, reg_rhs);
+            emit_post_push_reg(emit, VTYPE_INT, REG_ARG_2);
+        } else if (op == MP_BINARY_OP_SUBTRACT || op == MP_BINARY_OP_INPLACE_SUBTRACT) {
+            ASM_SUB_REG_REG(emit->as, REG_ARG_2, reg_rhs);
+            emit_post_push_reg(emit, VTYPE_INT, REG_ARG_2);
+        } else if (MP_BINARY_OP_LESS <= op && op <= MP_BINARY_OP_NOT_EQUAL) {
+            // comparison ops are (in enum order):
+            //  MP_BINARY_OP_LESS
+            //  MP_BINARY_OP_MORE
+            //  MP_BINARY_OP_EQUAL
+            //  MP_BINARY_OP_LESS_EQUAL
+            //  MP_BINARY_OP_MORE_EQUAL
+            //  MP_BINARY_OP_NOT_EQUAL
+            #if N_X64
+            asm_x64_xor_r64_r64(emit->as, REG_RET, REG_RET);
+            asm_x64_cmp_r64_with_r64(emit->as, reg_rhs, REG_ARG_2);
+            static byte ops[6] = {
+                ASM_X64_CC_JL,
+                ASM_X64_CC_JG,
+                ASM_X64_CC_JE,
+                ASM_X64_CC_JLE,
+                ASM_X64_CC_JGE,
+                ASM_X64_CC_JNE,
+            };
+            asm_x64_setcc_r8(emit->as, ops[op - MP_BINARY_OP_LESS], REG_RET);
+            #elif N_X86
+            asm_x86_xor_r32_r32(emit->as, REG_RET, REG_RET);
+            asm_x86_cmp_r32_with_r32(emit->as, reg_rhs, REG_ARG_2);
+            static byte ops[6] = {
+                ASM_X86_CC_JL,
+                ASM_X86_CC_JG,
+                ASM_X86_CC_JE,
+                ASM_X86_CC_JLE,
+                ASM_X86_CC_JGE,
+                ASM_X86_CC_JNE,
+            };
+            asm_x86_setcc_r8(emit->as, ops[op - MP_BINARY_OP_LESS], REG_RET);
+            #elif N_THUMB
+            asm_thumb_cmp_rlo_rlo(emit->as, REG_ARG_2, reg_rhs);
+            static uint16_t ops[6] = {
+                ASM_THUMB_OP_ITE_GE,
+                ASM_THUMB_OP_ITE_GT,
+                ASM_THUMB_OP_ITE_EQ,
+                ASM_THUMB_OP_ITE_GT,
+                ASM_THUMB_OP_ITE_GE,
+                ASM_THUMB_OP_ITE_EQ,
+            };
+            static byte ret[6] = { 0, 1, 1, 0, 1, 0, };
+            asm_thumb_op16(emit->as, ops[op - MP_BINARY_OP_LESS]);
+            asm_thumb_mov_rlo_i8(emit->as, REG_RET, ret[op - MP_BINARY_OP_LESS]);
+            asm_thumb_mov_rlo_i8(emit->as, REG_RET, ret[op - MP_BINARY_OP_LESS] ^ 1);
+            #elif N_ARM
+                #error generic comparisons for ARM need implementing
+            //asm_arm_less_op(emit->as, REG_RET, REG_ARG_2, reg_rhs);
+            //asm_arm_more_op(emit->as, REG_RET, REG_ARG_2, reg_rhs);
+            #else
+                #error not implemented
+            #endif
             emit_post_push_reg(emit, VTYPE_BOOL, REG_RET);
         } else {
             // TODO other ops not yet implemented
             assert(0);
         }
     } else if (vtype_lhs == VTYPE_PYOBJ && vtype_rhs == VTYPE_PYOBJ) {
+        emit_pre_pop_reg_reg(emit, &vtype_rhs, REG_ARG_3, &vtype_lhs, REG_ARG_2);
         bool invert = false;
         if (op == MP_BINARY_OP_NOT_IN) {
             invert = true;
@@ -1566,7 +1669,7 @@ STATIC void emit_native_binary_op(emit_t *emit, mp_binary_op_t op) {
         }
         emit_call_with_imm_arg(emit, MP_F_BINARY_OP, op, REG_ARG_1);
         if (invert) {
-            ASM_MOV_REG_TO_REG(emit->as, REG_RET, REG_ARG_2);
+            ASM_MOV_REG_REG(emit->as, REG_ARG_2, REG_RET);
             emit_call_with_imm_arg(emit, MP_F_UNARY_OP, MP_UNARY_OP_NOT, REG_ARG_1);
         }
         emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET);
-- 
GitLab