diff --git a/py/asmarm.c b/py/asmarm.c
index 3610f838e607cf34e75d0fd512caf1aa5c815fcd..f2221f8a92642b8b4aa3861e04b7678b4929883c 100644
--- a/py/asmarm.c
+++ b/py/asmarm.c
@@ -197,7 +197,16 @@ void asm_arm_mov_reg_reg(asm_arm_t *as, uint reg_dest, uint reg_src) {
     emit_al(as, asm_arm_op_mov_reg(reg_dest, reg_src));
 }
 
-void asm_arm_mov_reg_i32(asm_arm_t *as, uint rd, int imm) {
+size_t asm_arm_mov_reg_i32(asm_arm_t *as, uint rd, int imm) {
+    // Insert immediate into code and jump over it
+    emit_al(as, 0x59f0000 | (rd << 12)); // ldr rd, [pc]
+    emit_al(as, 0xa000000); // b pc
+    size_t loc = mp_asm_base_get_code_pos(&as->base);
+    emit(as, imm);
+    return loc;
+}
+
+void asm_arm_mov_reg_i32_optimised(asm_arm_t *as, uint rd, int imm) {
     // TODO: There are more variants of immediate values
     if ((imm & 0xFF) == imm) {
         emit_al(as, asm_arm_op_mov_imm(rd, imm));
@@ -205,10 +214,7 @@ void asm_arm_mov_reg_i32(asm_arm_t *as, uint rd, int imm) {
         // mvn is "move not", not "move negative"
         emit_al(as, asm_arm_op_mvn_imm(rd, ~imm));
     } else {
-        //Insert immediate into code and jump over it
-        emit_al(as, 0x59f0000 | (rd << 12)); // ldr rd, [pc]
-        emit_al(as, 0xa000000); // b pc
-        emit(as, imm);
+        asm_arm_mov_reg_i32(as, rd, imm);
     }
 }
 
diff --git a/py/asmarm.h b/py/asmarm.h
index 58a13cc83e7d85a1e514fce9adf34711b24ed2c6..825fd884005ffdd9dd60f0a31821815f9f1790ed 100644
--- a/py/asmarm.h
+++ b/py/asmarm.h
@@ -81,7 +81,8 @@ void asm_arm_bkpt(asm_arm_t *as);
 
 // mov
 void asm_arm_mov_reg_reg(asm_arm_t *as, uint reg_dest, uint reg_src);
-void asm_arm_mov_reg_i32(asm_arm_t *as, uint rd, int imm);
+size_t asm_arm_mov_reg_i32(asm_arm_t *as, uint rd, int imm);
+void asm_arm_mov_reg_i32_optimised(asm_arm_t *as, uint rd, int imm);
 void asm_arm_mov_local_reg(asm_arm_t *as, int local_num, uint rd);
 void asm_arm_mov_reg_local(asm_arm_t *as, uint rd, int local_num);
 void asm_arm_setcc_reg(asm_arm_t *as, uint rd, uint cond);
@@ -177,7 +178,9 @@ void asm_arm_bx_reg(asm_arm_t *as, uint reg_src);
 #define ASM_CALL_IND(as, idx) asm_arm_bl_ind(as, idx, ASM_ARM_REG_R3)
 
 #define ASM_MOV_LOCAL_REG(as, local_num, reg_src) asm_arm_mov_local_reg((as), (local_num), (reg_src))
-#define ASM_MOV_REG_IMM(as, reg_dest, imm) asm_arm_mov_reg_i32((as), (reg_dest), (imm))
+#define ASM_MOV_REG_IMM(as, reg_dest, imm) asm_arm_mov_reg_i32_optimised((as), (reg_dest), (imm))
+#define ASM_MOV_REG_IMM_FIX_U16(as, reg_dest, imm) asm_arm_mov_reg_i32((as), (reg_dest), (imm))
+#define ASM_MOV_REG_IMM_FIX_WORD(as, reg_dest, imm) asm_arm_mov_reg_i32((as), (reg_dest), (imm))
 #define ASM_MOV_REG_LOCAL(as, reg_dest, local_num) asm_arm_mov_reg_local((as), (reg_dest), (local_num))
 #define ASM_MOV_REG_REG(as, reg_dest, reg_src) asm_arm_mov_reg_reg((as), (reg_dest), (reg_src))
 #define ASM_MOV_REG_LOCAL_ADDR(as, reg_dest, local_num) asm_arm_mov_reg_local_addr((as), (reg_dest), (local_num))
diff --git a/py/asmthumb.c b/py/asmthumb.c
index 46102395dc0f038b2097872f810fefde14bf9014..e6bba7ea60ebe50984f7ac84435543ef7fb29a13 100644
--- a/py/asmthumb.c
+++ b/py/asmthumb.c
@@ -225,10 +225,12 @@ void asm_thumb_mov_reg_reg(asm_thumb_t *as, uint reg_dest, uint reg_src) {
 }
 
 // if loading lo half with movw, the i16 value will be zero extended into the r32 register!
-void asm_thumb_mov_reg_i16(asm_thumb_t *as, uint mov_op, uint reg_dest, int i16_src) {
+size_t asm_thumb_mov_reg_i16(asm_thumb_t *as, uint mov_op, uint reg_dest, int i16_src) {
     assert(reg_dest < ASM_THUMB_REG_R15);
+    size_t loc = mp_asm_base_get_code_pos(&as->base);
     // mov[wt] reg_dest, #i16_src
     asm_thumb_op32(as, mov_op | ((i16_src >> 1) & 0x0400) | ((i16_src >> 12) & 0xf), ((i16_src << 4) & 0x7000) | (reg_dest << 8) | (i16_src & 0xff));
+    return loc;
 }
 
 #define OP_B_N(byte_offset) (0xe000 | (((byte_offset) >> 1) & 0x07ff))
@@ -271,12 +273,16 @@ bool asm_thumb_bl_label(asm_thumb_t *as, uint label) {
     return as->base.pass != MP_ASM_PASS_EMIT || SIGNED_FIT23(rel);
 }
 
-void asm_thumb_mov_reg_i32(asm_thumb_t *as, uint reg_dest, mp_uint_t i32) {
+size_t asm_thumb_mov_reg_i32(asm_thumb_t *as, uint reg_dest, mp_uint_t i32) {
     // movw, movt does it in 8 bytes
     // ldr [pc, #], dw does it in 6 bytes, but we might not reach to end of code for dw
 
+    size_t loc = mp_asm_base_get_code_pos(&as->base);
+
     asm_thumb_mov_reg_i16(as, ASM_THUMB_OP_MOVW, reg_dest, i32);
     asm_thumb_mov_reg_i16(as, ASM_THUMB_OP_MOVT, reg_dest, i32 >> 16);
+
+    return loc;
 }
 
 void asm_thumb_mov_reg_i32_optimised(asm_thumb_t *as, uint reg_dest, int i32) {
diff --git a/py/asmthumb.h b/py/asmthumb.h
index 9a44a78cae2b1393bbc32a81bd9a3caa92cffbcb..c21c23ff7979828190016cefea8c69422af320a7 100644
--- a/py/asmthumb.h
+++ b/py/asmthumb.h
@@ -241,14 +241,14 @@ static inline void asm_thumb_ldrh_rlo_rlo_i5(asm_thumb_t *as, uint rlo_dest, uin
 #define ASM_THUMB_OP_MOVT (0xf2c0)
 
 void asm_thumb_mov_reg_reg(asm_thumb_t *as, uint reg_dest, uint reg_src);
-void asm_thumb_mov_reg_i16(asm_thumb_t *as, uint mov_op, uint reg_dest, int i16_src);
+size_t asm_thumb_mov_reg_i16(asm_thumb_t *as, uint mov_op, uint reg_dest, int i16_src);
 
 // these return true if the destination is in range, false otherwise
 bool asm_thumb_b_n_label(asm_thumb_t *as, uint label);
 bool asm_thumb_bcc_nw_label(asm_thumb_t *as, int cond, uint label, bool wide);
 bool asm_thumb_bl_label(asm_thumb_t *as, uint label);
 
-void asm_thumb_mov_reg_i32(asm_thumb_t *as, uint reg_dest, mp_uint_t i32_src); // convenience
+size_t asm_thumb_mov_reg_i32(asm_thumb_t *as, uint reg_dest, mp_uint_t i32_src); // convenience
 void asm_thumb_mov_reg_i32_optimised(asm_thumb_t *as, uint reg_dest, int i32_src); // convenience
 void asm_thumb_mov_local_reg(asm_thumb_t *as, int local_num_dest, uint rlo_src); // convenience
 void asm_thumb_mov_reg_local(asm_thumb_t *as, uint rlo_dest, int local_num); // convenience
@@ -315,6 +315,8 @@ void asm_thumb_bl_ind(asm_thumb_t *as, uint fun_id, uint reg_temp); // convenien
 
 #define ASM_MOV_LOCAL_REG(as, local_num, reg) asm_thumb_mov_local_reg((as), (local_num), (reg))
 #define ASM_MOV_REG_IMM(as, reg_dest, imm) asm_thumb_mov_reg_i32_optimised((as), (reg_dest), (imm))
+#define ASM_MOV_REG_IMM_FIX_U16(as, reg_dest, imm) asm_thumb_mov_reg_i16((as), ASM_THUMB_OP_MOVW, (reg_dest), (imm))
+#define ASM_MOV_REG_IMM_FIX_WORD(as, reg_dest, imm) asm_thumb_mov_reg_i32((as), (reg_dest), (imm))
 #define ASM_MOV_REG_LOCAL(as, reg_dest, local_num) asm_thumb_mov_reg_local((as), (reg_dest), (local_num))
 #define ASM_MOV_REG_REG(as, reg_dest, reg_src) asm_thumb_mov_reg_reg((as), (reg_dest), (reg_src))
 #define ASM_MOV_REG_LOCAL_ADDR(as, reg_dest, local_num) asm_thumb_mov_reg_local_addr((as), (reg_dest), (local_num))
diff --git a/py/asmx64.c b/py/asmx64.c
index 3609f49d30706e065b47978d2fa5e69bfe73efa3..b18703a9c5abb56deb5dabca06abe60990b7d5ff 100644
--- a/py/asmx64.c
+++ b/py/asmx64.c
@@ -334,14 +334,16 @@ void asm_x64_mov_i8_to_r8(asm_x64_t *as, int src_i8, int dest_r64) {
 }
 */
 
-STATIC void asm_x64_mov_i32_to_r64(asm_x64_t *as, int src_i32, int dest_r64) {
+size_t asm_x64_mov_i32_to_r64(asm_x64_t *as, int src_i32, int dest_r64) {
     // cpu defaults to i32 to r64, with zero extension
     if (dest_r64 < 8) {
         asm_x64_write_byte_1(as, OPCODE_MOV_I64_TO_R64 | dest_r64);
     } else {
         asm_x64_write_byte_2(as, REX_PREFIX | REX_B, OPCODE_MOV_I64_TO_R64 | (dest_r64 & 7));
     }
+    size_t loc = mp_asm_base_get_code_pos(&as->base);
     asm_x64_write_word32(as, src_i32);
+    return loc;
 }
 
 void asm_x64_mov_i64_to_r64(asm_x64_t *as, int64_t src_i64, int dest_r64) {
diff --git a/py/asmx64.h b/py/asmx64.h
index 1c8755a84c503c47b02841a0feeca6210e1fdadc..d3761b78f33991a6aade364556f87790211979c3 100644
--- a/py/asmx64.h
+++ b/py/asmx64.h
@@ -83,6 +83,7 @@ void asm_x64_nop(asm_x64_t* as);
 void asm_x64_push_r64(asm_x64_t* as, int src_r64);
 void asm_x64_pop_r64(asm_x64_t* as, int dest_r64);
 void asm_x64_mov_r64_r64(asm_x64_t* as, int dest_r64, int src_r64);
+size_t asm_x64_mov_i32_to_r64(asm_x64_t *as, int src_i32, int dest_r64);
 void asm_x64_mov_i64_to_r64(asm_x64_t* as, int64_t src_i64, int dest_r64);
 void asm_x64_mov_i64_to_r64_optimised(asm_x64_t *as, int64_t src_i64, int dest_r64);
 void asm_x64_mov_r8_to_mem8(asm_x64_t *as, int src_r64, int dest_r64, int dest_disp);
@@ -181,6 +182,8 @@ void asm_x64_call_ind(asm_x64_t* as, size_t fun_id, int temp_r32);
 
 #define ASM_MOV_LOCAL_REG(as, local_num, reg_src) asm_x64_mov_r64_to_local((as), (reg_src), (local_num))
 #define ASM_MOV_REG_IMM(as, reg_dest, imm) asm_x64_mov_i64_to_r64_optimised((as), (imm), (reg_dest))
+#define ASM_MOV_REG_IMM_FIX_U16(as, reg_dest, imm) asm_x64_mov_i32_to_r64((as), (imm), (reg_dest))
+#define ASM_MOV_REG_IMM_FIX_WORD(as, reg_dest, imm) asm_x64_mov_i32_to_r64((as), (imm), (reg_dest))
 #define ASM_MOV_REG_LOCAL(as, reg_dest, local_num) asm_x64_mov_local_to_r64((as), (local_num), (reg_dest))
 #define ASM_MOV_REG_REG(as, reg_dest, reg_src) asm_x64_mov_r64_r64((as), (reg_dest), (reg_src))
 #define ASM_MOV_REG_LOCAL_ADDR(as, reg_dest, local_num) asm_x64_mov_local_addr_to_r64((as), (local_num), (reg_dest))
diff --git a/py/asmx86.c b/py/asmx86.c
index 8ce576ac89dfcfb4f7be04e3a5a0baf44537c615..23160c9c20026ba7ddb09c9f23f263766feba9f6 100644
--- a/py/asmx86.c
+++ b/py/asmx86.c
@@ -236,9 +236,11 @@ void asm_x86_mov_i8_to_r8(asm_x86_t *as, int src_i8, int dest_r32) {
 }
 #endif
 
-void asm_x86_mov_i32_to_r32(asm_x86_t *as, int32_t src_i32, int dest_r32) {
+size_t asm_x86_mov_i32_to_r32(asm_x86_t *as, int32_t src_i32, int dest_r32) {
     asm_x86_write_byte_1(as, OPCODE_MOV_I32_TO_R32 | dest_r32);
+    size_t loc = mp_asm_base_get_code_pos(&as->base);
     asm_x86_write_word32(as, src_i32);
+    return loc;
 }
 
 void asm_x86_and_r32_r32(asm_x86_t *as, int dest_r32, int src_r32) {
diff --git a/py/asmx86.h b/py/asmx86.h
index 82a8629ddfa174fd0a61a07c65559e86618e06bb..7ba677b2c2c3e14b9dd84b776feba5488eca3206 100644
--- a/py/asmx86.h
+++ b/py/asmx86.h
@@ -83,7 +83,7 @@ static inline void asm_x86_end_pass(asm_x86_t *as) {
 }
 
 void asm_x86_mov_r32_r32(asm_x86_t* as, int dest_r32, int src_r32);
-void asm_x86_mov_i32_to_r32(asm_x86_t *as, int32_t src_i32, int dest_r32);
+size_t asm_x86_mov_i32_to_r32(asm_x86_t *as, int32_t src_i32, int dest_r32);
 void asm_x86_mov_r8_to_mem8(asm_x86_t *as, int src_r32, int dest_r32, int dest_disp);
 void asm_x86_mov_r16_to_mem16(asm_x86_t *as, int src_r32, int dest_r32, int dest_disp);
 void asm_x86_mov_r32_to_mem32(asm_x86_t *as, int src_r32, int dest_r32, int dest_disp);
@@ -179,6 +179,8 @@ void asm_x86_call_ind(asm_x86_t* as, size_t fun_id, mp_uint_t n_args, int temp_r
 
 #define ASM_MOV_LOCAL_REG(as, local_num, reg_src) asm_x86_mov_r32_to_local((as), (reg_src), (local_num))
 #define ASM_MOV_REG_IMM(as, reg_dest, imm) asm_x86_mov_i32_to_r32((as), (imm), (reg_dest))
+#define ASM_MOV_REG_IMM_FIX_U16(as, reg_dest, imm) asm_x86_mov_i32_to_r32((as), (imm), (reg_dest))
+#define ASM_MOV_REG_IMM_FIX_WORD(as, reg_dest, imm) asm_x86_mov_i32_to_r32((as), (imm), (reg_dest))
 #define ASM_MOV_REG_LOCAL(as, reg_dest, local_num) asm_x86_mov_local_to_r32((as), (local_num), (reg_dest))
 #define ASM_MOV_REG_REG(as, reg_dest, reg_src) asm_x86_mov_r32_r32((as), (reg_dest), (reg_src))
 #define ASM_MOV_REG_LOCAL_ADDR(as, reg_dest, local_num) asm_x86_mov_local_addr_to_r32((as), (local_num), (reg_dest))
diff --git a/py/asmxtensa.c b/py/asmxtensa.c
index 8da56ffe30ef377c358f40cda25a8d2ea03e0459..a269e5e7fc0d1710d2702c27e63b38609b16bc2c 100644
--- a/py/asmxtensa.c
+++ b/py/asmxtensa.c
@@ -156,18 +156,24 @@ void asm_xtensa_setcc_reg_reg_reg(asm_xtensa_t *as, uint cond, uint reg_dest, ui
     asm_xtensa_op_movi_n(as, reg_dest, 0);
 }
 
-void asm_xtensa_mov_reg_i32(asm_xtensa_t *as, uint reg_dest, uint32_t i32) {
+size_t asm_xtensa_mov_reg_i32(asm_xtensa_t *as, uint reg_dest, uint32_t i32) {
+    // load the constant
+    uint32_t const_table_offset = (uint8_t*)as->const_table - as->base.code_base;
+    size_t loc = const_table_offset + as->cur_const * WORD_SIZE;
+    asm_xtensa_op_l32r(as, reg_dest, as->base.code_offset, loc);
+    // store the constant in the table
+    if (as->const_table != NULL) {
+        as->const_table[as->cur_const] = i32;
+    }
+    ++as->cur_const;
+    return loc;
+}
+
+void asm_xtensa_mov_reg_i32_optimised(asm_xtensa_t *as, uint reg_dest, uint32_t i32) {
     if (SIGNED_FIT12(i32)) {
         asm_xtensa_op_movi(as, reg_dest, i32);
     } else {
-        // load the constant
-        uint32_t const_table_offset = (uint8_t*)as->const_table - as->base.code_base;
-        asm_xtensa_op_l32r(as, reg_dest, as->base.code_offset, const_table_offset + as->cur_const * WORD_SIZE);
-        // store the constant in the table
-        if (as->const_table != NULL) {
-            as->const_table[as->cur_const] = i32;
-        }
-        ++as->cur_const;
+        asm_xtensa_mov_reg_i32(as, reg_dest, i32);
     }
 }
 
diff --git a/py/asmxtensa.h b/py/asmxtensa.h
index a595dc2b5af114c7f4a88c92a2879a9bb85a874c..d95af14a5dabb3fdfa5bcb1e8a514c2fc666d873 100644
--- a/py/asmxtensa.h
+++ b/py/asmxtensa.h
@@ -26,6 +26,7 @@
 #ifndef MICROPY_INCLUDED_PY_ASMXTENSA_H
 #define MICROPY_INCLUDED_PY_ASMXTENSA_H
 
+#include "py/misc.h"
 #include "py/asmbase.h"
 
 // calling conventions:
@@ -238,7 +239,8 @@ void asm_xtensa_j_label(asm_xtensa_t *as, uint label);
 void asm_xtensa_bccz_reg_label(asm_xtensa_t *as, uint cond, uint reg, uint label);
 void asm_xtensa_bcc_reg_reg_label(asm_xtensa_t *as, uint cond, uint reg1, uint reg2, uint label);
 void asm_xtensa_setcc_reg_reg_reg(asm_xtensa_t *as, uint cond, uint reg_dest, uint reg_src1, uint reg_src2);
-void asm_xtensa_mov_reg_i32(asm_xtensa_t *as, uint reg_dest, uint32_t i32);
+size_t asm_xtensa_mov_reg_i32(asm_xtensa_t *as, uint reg_dest, uint32_t i32);
+void asm_xtensa_mov_reg_i32_optimised(asm_xtensa_t *as, uint reg_dest, uint32_t i32);
 void asm_xtensa_mov_local_reg(asm_xtensa_t *as, int local_num, uint reg_src);
 void asm_xtensa_mov_reg_local(asm_xtensa_t *as, uint reg_dest, int local_num);
 void asm_xtensa_mov_reg_local_addr(asm_xtensa_t *as, uint reg_dest, int local_num);
@@ -289,7 +291,9 @@ void asm_xtensa_call_ind(asm_xtensa_t *as, uint idx);
 #define ASM_CALL_IND(as, idx) asm_xtensa_call_ind((as), (idx))
 
 #define ASM_MOV_LOCAL_REG(as, local_num, reg_src) asm_xtensa_mov_local_reg((as), (local_num), (reg_src))
-#define ASM_MOV_REG_IMM(as, reg_dest, imm) asm_xtensa_mov_reg_i32((as), (reg_dest), (imm))
+#define ASM_MOV_REG_IMM(as, reg_dest, imm) asm_xtensa_mov_reg_i32_optimised((as), (reg_dest), (imm))
+#define ASM_MOV_REG_IMM_FIX_U16(as, reg_dest, imm) asm_xtensa_mov_reg_i32((as), (reg_dest), (imm))
+#define ASM_MOV_REG_IMM_FIX_WORD(as, reg_dest, imm) asm_xtensa_mov_reg_i32((as), (reg_dest), (imm))
 #define ASM_MOV_REG_LOCAL(as, reg_dest, local_num) asm_xtensa_mov_reg_local((as), (reg_dest), (local_num))
 #define ASM_MOV_REG_REG(as, reg_dest, reg_src) asm_xtensa_op_mov_n((as), (reg_dest), (reg_src))
 #define ASM_MOV_REG_LOCAL_ADDR(as, reg_dest, local_num) asm_xtensa_mov_reg_local_addr((as), (reg_dest), (local_num))
diff --git a/py/compile.c b/py/compile.c
index 4609a50213631e151e88073b858e4f9f56c8d3ca..a38998fdb6668120c248daaf30e499cfe9098b1e 100644
--- a/py/compile.c
+++ b/py/compile.c
@@ -3266,7 +3266,11 @@ STATIC void compile_scope_inline_asm(compiler_t *comp, scope_t *scope, pass_kind
             void *f = mp_asm_base_get_code((mp_asm_base_t*)comp->emit_inline_asm);
             mp_emit_glue_assign_native(comp->scope_cur->raw_code, MP_CODE_NATIVE_ASM,
                 f, mp_asm_base_get_code_size((mp_asm_base_t*)comp->emit_inline_asm),
-                NULL, comp->scope_cur->num_pos_args, 0, type_sig);
+                NULL,
+                #if MICROPY_PERSISTENT_CODE_SAVE
+                0, 0, 0, 0, NULL,
+                #endif
+                comp->scope_cur->num_pos_args, 0, type_sig);
         }
     }
 
diff --git a/py/emitglue.c b/py/emitglue.c
index 996b79e1738665f7f6238f76b30ccc6f3de8b255..c073258f0167440fad575632f62c5b083f8295b3 100644
--- a/py/emitglue.c
+++ b/py/emitglue.c
@@ -89,8 +89,16 @@ void mp_emit_glue_assign_bytecode(mp_raw_code_t *rc, const byte *code,
 }
 
 #if MICROPY_EMIT_NATIVE || MICROPY_EMIT_INLINE_ASM
-void mp_emit_glue_assign_native(mp_raw_code_t *rc, mp_raw_code_kind_t kind, void *fun_data, mp_uint_t fun_len, const mp_uint_t *const_table, mp_uint_t n_pos_args, mp_uint_t scope_flags, mp_uint_t type_sig) {
+void mp_emit_glue_assign_native(mp_raw_code_t *rc, mp_raw_code_kind_t kind, void *fun_data, mp_uint_t fun_len, const mp_uint_t *const_table,
+    #if MICROPY_PERSISTENT_CODE_SAVE
+    uint16_t prelude_offset,
+    uint16_t n_obj, uint16_t n_raw_code,
+    uint16_t n_qstr, mp_qstr_link_entry_t *qstr_link,
+    #endif
+    mp_uint_t n_pos_args, mp_uint_t scope_flags, mp_uint_t type_sig) {
+
     assert(kind == MP_CODE_NATIVE_PY || kind == MP_CODE_NATIVE_VIPER || kind == MP_CODE_NATIVE_ASM);
+
     rc->kind = kind;
     rc->scope_flags = scope_flags;
     rc->n_pos_args = n_pos_args;
@@ -98,6 +106,15 @@ void mp_emit_glue_assign_native(mp_raw_code_t *rc, mp_raw_code_kind_t kind, void
     rc->const_table = const_table;
     rc->type_sig = type_sig;
 
+    #if MICROPY_PERSISTENT_CODE_SAVE
+    rc->fun_data_len = fun_len;
+    rc->prelude_offset = prelude_offset;
+    rc->n_obj = n_obj;
+    rc->n_raw_code = n_raw_code;
+    rc->n_qstr = n_qstr;
+    rc->qstr_link = qstr_link;
+    #endif
+
 #ifdef DEBUG_PRINT
     DEBUG_printf("assign native: kind=%d fun=%p len=" UINT_FMT " n_pos_args=" UINT_FMT " flags=%x\n", kind, fun_data, fun_len, n_pos_args, (uint)scope_flags);
     for (mp_uint_t i = 0; i < fun_len; i++) {
diff --git a/py/emitglue.h b/py/emitglue.h
index 53049b161e9adf5e95eb09d11a7d208c38b79c56..058f060186476beb8b2f391007ac23b14dadf1ef 100644
--- a/py/emitglue.h
+++ b/py/emitglue.h
@@ -48,6 +48,11 @@ typedef enum {
     MP_CODE_NATIVE_ASM,
 } mp_raw_code_kind_t;
 
+typedef struct _mp_qstr_link_entry_t {
+    uint16_t off;
+    uint16_t qst;
+} mp_qstr_link_entry_t;
+
 typedef struct _mp_raw_code_t {
     mp_uint_t kind : 3; // of type mp_raw_code_kind_t
     mp_uint_t scope_flags : 7;
@@ -58,6 +63,11 @@ typedef struct _mp_raw_code_t {
     size_t fun_data_len;
     uint16_t n_obj;
     uint16_t n_raw_code;
+    #if MICROPY_EMIT_NATIVE || MICROPY_EMIT_INLINE_ASM
+    uint16_t prelude_offset;
+    uint16_t n_qstr;
+    mp_qstr_link_entry_t *qstr_link;
+    #endif
     #endif
     #if MICROPY_EMIT_NATIVE || MICROPY_EMIT_INLINE_ASM
     mp_uint_t type_sig; // for viper, compressed as 2-bit types; ret is MSB, then arg0, arg1, etc
@@ -75,7 +85,15 @@ void mp_emit_glue_assign_bytecode(mp_raw_code_t *rc, const byte *code,
     uint16_t n_obj, uint16_t n_raw_code,
     #endif
     mp_uint_t scope_flags);
-void mp_emit_glue_assign_native(mp_raw_code_t *rc, mp_raw_code_kind_t kind, void *fun_data, mp_uint_t fun_len, const mp_uint_t *const_table, mp_uint_t n_pos_args, mp_uint_t scope_flags, mp_uint_t type_sig);
+
+void mp_emit_glue_assign_native(mp_raw_code_t *rc, mp_raw_code_kind_t kind, void *fun_data, mp_uint_t fun_len,
+    const mp_uint_t *const_table,
+    #if MICROPY_PERSISTENT_CODE_SAVE
+    uint16_t prelude_offset,
+    uint16_t n_obj, uint16_t n_raw_code,
+    uint16_t n_qstr, mp_qstr_link_entry_t *qstr_link,
+    #endif
+    mp_uint_t n_pos_args, mp_uint_t scope_flags, mp_uint_t type_sig);
 
 mp_obj_t mp_make_function_from_raw_code(const mp_raw_code_t *rc, mp_obj_t def_args, mp_obj_t def_kw_args);
 mp_obj_t mp_make_closure_from_raw_code(const mp_raw_code_t *rc, mp_uint_t n_closed_over, const mp_obj_t *args);
diff --git a/py/emitnative.c b/py/emitnative.c
index 8d7c93af1e42fc74dc696d86833d62982c5d8058..ffdebd643234543105204a5bc9ae7274b95567d4 100644
--- a/py/emitnative.c
+++ b/py/emitnative.c
@@ -214,6 +214,11 @@ struct _emit_t {
     uint16_t const_table_cur_raw_code;
     mp_uint_t *const_table;
 
+    #if MICROPY_PERSISTENT_CODE_SAVE
+    uint16_t qstr_link_cur;
+    mp_qstr_link_entry_t *qstr_link;
+    #endif
+
     bool last_emit_was_return_value;
 
     scope_t *scope;
@@ -225,6 +230,7 @@ STATIC const uint8_t reg_local_table[REG_LOCAL_NUM] = {REG_LOCAL_1, REG_LOCAL_2,
 
 STATIC void emit_native_global_exc_entry(emit_t *emit);
 STATIC void emit_native_global_exc_exit(emit_t *emit);
+STATIC void emit_native_load_const_obj(emit_t *emit, mp_obj_t obj);
 
 emit_t *EXPORT_FUN(new)(mp_obj_t *error_slot, uint *label_slot, mp_uint_t max_num_labels) {
     emit_t *emit = m_new0(emit_t, 1);
@@ -280,11 +286,29 @@ STATIC void emit_native_mov_reg_state_addr(emit_t *emit, int reg_dest, int local
 }
 
 STATIC void emit_native_mov_reg_qstr(emit_t *emit, int arg_reg, qstr qst) {
+    #if MICROPY_PERSISTENT_CODE_SAVE
+    size_t loc = ASM_MOV_REG_IMM_FIX_U16(emit->as, arg_reg, qst);
+    size_t link_idx = emit->qstr_link_cur++;
+    if (emit->pass == MP_PASS_EMIT) {
+        emit->qstr_link[link_idx].off = loc << 2 | 1;
+        emit->qstr_link[link_idx].qst = qst;
+    }
+    #else
     ASM_MOV_REG_IMM(emit->as, arg_reg, qst);
+    #endif
 }
 
 STATIC void emit_native_mov_reg_qstr_obj(emit_t *emit, int reg_dest, qstr qst) {
+    #if MICROPY_PERSISTENT_CODE_SAVE
+    size_t loc = ASM_MOV_REG_IMM_FIX_WORD(emit->as, reg_dest, (mp_uint_t)MP_OBJ_NEW_QSTR(qst));
+    size_t link_idx = emit->qstr_link_cur++;
+    if (emit->pass == MP_PASS_EMIT) {
+        emit->qstr_link[link_idx].off = loc << 2 | 2;
+        emit->qstr_link[link_idx].qst = qst;
+    }
+    #else
     ASM_MOV_REG_IMM(emit->as, reg_dest, (mp_uint_t)MP_OBJ_NEW_QSTR(qst));
+    #endif
 }
 
 #define emit_native_mov_state_imm_via(emit, local_num, imm, reg_temp) \
@@ -301,6 +325,9 @@ STATIC void emit_native_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scop
     emit->stack_size = 0;
     emit->const_table_cur_obj = 0;
     emit->const_table_cur_raw_code = 0;
+    #if MICROPY_PERSISTENT_CODE_SAVE
+    emit->qstr_link_cur = 0;
+    #endif
     emit->last_emit_was_return_value = false;
     emit->scope = scope;
 
@@ -598,6 +625,13 @@ STATIC void emit_native_end_pass(emit_t *emit) {
         emit->const_table = m_new(mp_uint_t, const_table_alloc);
         // Store mp_fun_table pointer just after qstrs
         emit->const_table[nqstr] = (mp_uint_t)(uintptr_t)mp_fun_table;
+
+        #if MICROPY_PERSISTENT_CODE_SAVE
+        size_t qstr_link_alloc = emit->qstr_link_cur;
+        if (qstr_link_alloc > 0) {
+            emit->qstr_link = m_new(mp_qstr_link_entry_t, qstr_link_alloc);
+        }
+        #endif
     }
 
     if (emit->pass == MP_PASS_EMIT) {
@@ -607,6 +641,11 @@ STATIC void emit_native_end_pass(emit_t *emit) {
         mp_emit_glue_assign_native(emit->scope->raw_code,
             emit->do_viper_types ? MP_CODE_NATIVE_VIPER : MP_CODE_NATIVE_PY,
             f, f_len, emit->const_table,
+            #if MICROPY_PERSISTENT_CODE_SAVE
+            emit->prelude_offset,
+            emit->const_table_cur_obj, emit->const_table_cur_raw_code,
+            emit->qstr_link_cur, emit->qstr_link,
+            #endif
             emit->scope->num_pos_args, emit->scope->scope_flags, 0);
     }
 }
@@ -1233,7 +1272,11 @@ STATIC void emit_native_import(emit_t *emit, qstr qst, int kind) {
 STATIC void emit_native_load_const_tok(emit_t *emit, mp_token_kind_t tok) {
     DEBUG_printf("load_const_tok(tok=%u)\n", tok);
     if (tok == MP_TOKEN_ELLIPSIS) {
+        #if MICROPY_PERSISTENT_CODE_SAVE
+        emit_native_load_const_obj(emit, MP_OBJ_FROM_PTR(&mp_const_ellipsis_obj));
+        #else
         emit_post_push_imm(emit, VTYPE_PYOBJ, (mp_uint_t)MP_OBJ_FROM_PTR(&mp_const_ellipsis_obj));
+        #endif
     } else {
         emit_native_pre(emit);
         if (tok == MP_TOKEN_KW_NONE) {
diff --git a/py/persistentcode.c b/py/persistentcode.c
index d4ca50e81fe39abcbf3443f4684f5666441f3c2c..78849fedff00835719130f137f7494b379a55084 100644
--- a/py/persistentcode.c
+++ b/py/persistentcode.c
@@ -47,6 +47,10 @@
 #define MPY_FEATURE_ENCODE_FLAGS(flags) (flags)
 #define MPY_FEATURE_DECODE_FLAGS(feat) ((feat) & 3)
 
+// Macros to encode/decode native architecture to/from the feature byte
+#define MPY_FEATURE_ENCODE_ARCH(arch) ((arch) << 2)
+#define MPY_FEATURE_DECODE_ARCH(feat) ((feat) >> 2)
+
 // The feature flag bits encode the compile-time config options that
 // affect the generate bytecode.
 #define MPY_FEATURE_FLAGS ( \
@@ -59,6 +63,21 @@
     | ((MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) << 1) \
     )
 
+// Define the host architecture
+#if MICROPY_EMIT_X86
+#define MPY_FEATURE_ARCH (MP_NATIVE_ARCH_X86)
+#elif MICROPY_EMIT_X64
+#define MPY_FEATURE_ARCH (MP_NATIVE_ARCH_X64)
+#elif MICROPY_EMIT_THUMB
+#define MPY_FEATURE_ARCH (MP_NATIVE_ARCH_ARMV7M)
+#elif MICROPY_EMIT_ARM
+#define MPY_FEATURE_ARCH (MP_NATIVE_ARCH_ARMV6)
+#elif MICROPY_EMIT_XTENSA
+#define MPY_FEATURE_ARCH (MP_NATIVE_ARCH_XTENSA)
+#else
+#define MPY_FEATURE_ARCH (MP_NATIVE_ARCH_NONE)
+#endif
+
 #if MICROPY_PERSISTENT_CODE_LOAD || (MICROPY_PERSISTENT_CODE_SAVE && !MICROPY_DYNAMIC_COMPILER)
 // The bytecode will depend on the number of bits in a small-int, and
 // this function computes that (could make it a fixed constant, but it
@@ -135,7 +154,7 @@ typedef struct _bytecode_prelude_t {
     uint code_info_size;
 } bytecode_prelude_t;
 
-#if MICROPY_PERSISTENT_CODE_SAVE
+#if MICROPY_PERSISTENT_CODE_SAVE || MICROPY_EMIT_NATIVE
 
 // ip will point to start of opcodes
 // ip2 will point to simple_name, source_file qstrs
@@ -161,6 +180,42 @@ STATIC void extract_prelude(const byte **ip, const byte **ip2, bytecode_prelude_
 
 #include "py/parsenum.h"
 
+#if MICROPY_EMIT_NATIVE
+
+#if MICROPY_EMIT_THUMB
+STATIC void asm_thumb_rewrite_mov(uint8_t *pc, uint16_t val) {
+    // high part
+    *(uint16_t*)pc = (*(uint16_t*)pc & 0xfbf0) | (val >> 1 & 0x0400) | (val >> 12);
+    // low part
+    *(uint16_t*)(pc + 2) = (*(uint16_t*)(pc + 2) & 0x0f00) | (val << 4 & 0x7000) | (val & 0x00ff);
+
+}
+#endif
+
+STATIC void arch_link_qstr(uint8_t *pc, bool is_obj, qstr qst) {
+    mp_uint_t val = qst;
+    if (is_obj) {
+        val = (mp_uint_t)MP_OBJ_NEW_QSTR(qst);
+    }
+    #if MICROPY_EMIT_X86 || MICROPY_EMIT_X64 || MICROPY_EMIT_ARM || MICROPY_EMIT_XTENSA
+    pc[0] = val & 0xff;
+    pc[1] = (val >> 8) & 0xff;
+    pc[2] = (val >> 16) & 0xff;
+    pc[3] = (val >> 24) & 0xff;
+    #elif MICROPY_EMIT_THUMB
+    if (is_obj) {
+        // qstr object, movw and movt
+        asm_thumb_rewrite_mov(pc, val); // movw
+        asm_thumb_rewrite_mov(pc + 4, val >> 16); // movt
+    } else {
+        // qstr number, movw instruction
+        asm_thumb_rewrite_mov(pc, val); // movw
+    }
+    #endif
+}
+
+#endif
+
 STATIC int read_byte(mp_reader_t *reader) {
     return reader->readbyte(reader->data);
 }
@@ -264,51 +319,155 @@ STATIC void load_bytecode(mp_reader_t *reader, qstr_window_t *qw, byte *ip, byte
 }
 
 STATIC mp_raw_code_t *load_raw_code(mp_reader_t *reader, qstr_window_t *qw) {
-    // get bytecode size and allocate memory for it
-    size_t bc_len = read_uint(reader, NULL);
-    byte *bytecode = m_new(byte, bc_len);
+    // Load function kind and data length
+    size_t kind_len = read_uint(reader, NULL);
+    int kind = (kind_len & 3) + MP_CODE_BYTECODE;
+    size_t fun_data_len = kind_len >> 2;
 
-    // load prelude
-    byte *ip = bytecode;
+    #if !MICROPY_EMIT_NATIVE
+    if (kind != MP_CODE_BYTECODE) {
+        mp_raise_ValueError("incompatible .mpy file");
+    }
+    #endif
+
+    uint8_t *fun_data = NULL;
     byte *ip2;
-    bytecode_prelude_t prelude;
-    load_prelude(reader, &ip, &ip2, &prelude);
-
-    // load bytecode
-    load_bytecode(reader, qw, ip, bytecode + bc_len);
-
-    // load qstrs and link global qstr ids into bytecode
-    qstr simple_name = load_qstr(reader, qw);
-    qstr source_file = load_qstr(reader, qw);
-    ((byte*)ip2)[0] = simple_name; ((byte*)ip2)[1] = simple_name >> 8;
-    ((byte*)ip2)[2] = source_file; ((byte*)ip2)[3] = source_file >> 8;
-
-    // load constant table
-    size_t n_obj = read_uint(reader, NULL);
-    size_t n_raw_code = read_uint(reader, NULL);
-    mp_uint_t *const_table = m_new(mp_uint_t, prelude.n_pos_args + prelude.n_kwonly_args + n_obj + n_raw_code);
-    mp_uint_t *ct = const_table;
-    for (size_t i = 0; i < prelude.n_pos_args + prelude.n_kwonly_args; ++i) {
-        *ct++ = (mp_uint_t)MP_OBJ_NEW_QSTR(load_qstr(reader, qw));
+    bytecode_prelude_t prelude = {0};
+    #if MICROPY_EMIT_NATIVE
+    size_t prelude_offset;
+    mp_uint_t type_sig = 0;
+    size_t n_qstr_link = 0;
+    #endif
+
+    if (kind == MP_CODE_BYTECODE) {
+        // Allocate memory for the bytecode
+        fun_data = m_new(uint8_t, fun_data_len);
+
+        // Load prelude
+        byte *ip = fun_data;
+        load_prelude(reader, &ip, &ip2, &prelude);
+
+        // Load bytecode
+        load_bytecode(reader, qw, ip, fun_data + fun_data_len);
+
+    #if MICROPY_EMIT_NATIVE
+    } else {
+        // Allocate memory for native data and load it
+        size_t fun_alloc;
+        MP_PLAT_ALLOC_EXEC(fun_data_len, (void**)&fun_data, &fun_alloc);
+        read_bytes(reader, fun_data, fun_data_len);
+
+        if (kind == MP_CODE_NATIVE_PY || kind == MP_CODE_NATIVE_VIPER) {
+            // Parse qstr link table and link native code
+            n_qstr_link = read_uint(reader, NULL);
+            for (size_t i = 0; i < n_qstr_link; ++i) {
+                size_t off = read_uint(reader, NULL);
+                qstr qst = load_qstr(reader, qw);
+                uint8_t *dest = fun_data + (off >> 2);
+                if ((off & 3) == 0) {
+                    // Generic 16-bit link
+                    dest[0] = qst & 0xff;
+                    dest[1] = (qst >> 8) & 0xff;
+                } else {
+                    // Architecture-specific link
+                    arch_link_qstr(dest, (off & 3) == 2, qst);
+                }
+            }
+        }
+
+        if (kind == MP_CODE_NATIVE_PY) {
+            // Extract prelude for later use
+            prelude_offset = read_uint(reader, NULL);
+            const byte *ip = fun_data + prelude_offset;
+            extract_prelude(&ip, (const byte**)&ip2, &prelude);
+        } else {
+            // Load basic scope info for viper and asm
+            prelude.scope_flags = read_uint(reader, NULL);
+            prelude.n_pos_args = 0;
+            prelude.n_kwonly_args = 0;
+            if (kind == MP_CODE_NATIVE_ASM) {
+                prelude.n_pos_args = read_uint(reader, NULL);
+                type_sig = read_uint(reader, NULL);
+            }
+        }
+    #endif
     }
-    for (size_t i = 0; i < n_obj; ++i) {
-        *ct++ = (mp_uint_t)load_obj(reader);
+
+    if (kind == MP_CODE_BYTECODE || kind == MP_CODE_NATIVE_PY) {
+        // Load qstrs in prelude
+        qstr simple_name = load_qstr(reader, qw);
+        qstr source_file = load_qstr(reader, qw);
+        ip2[0] = simple_name; ip2[1] = simple_name >> 8;
+        ip2[2] = source_file; ip2[3] = source_file >> 8;
     }
-    for (size_t i = 0; i < n_raw_code; ++i) {
-        *ct++ = (mp_uint_t)(uintptr_t)load_raw_code(reader, qw);
+
+    mp_uint_t *const_table = NULL;
+    if (kind != MP_CODE_NATIVE_ASM) {
+        // Load constant table for bytecode, native and viper
+
+        // Number of entries in constant table
+        size_t n_obj = read_uint(reader, NULL);
+        size_t n_raw_code = read_uint(reader, NULL);
+
+        // Allocate constant table
+        size_t n_alloc = prelude.n_pos_args + prelude.n_kwonly_args + n_obj + n_raw_code;
+        if (kind != MP_CODE_BYTECODE) {
+            ++n_alloc; // additional entry for mp_fun_table
+        }
+        const_table = m_new(mp_uint_t, n_alloc);
+        mp_uint_t *ct = const_table;
+
+        // Load function argument names (initial entries in const_table)
+        // (viper has n_pos_args=n_kwonly_args=0 so doesn't load any qstrs here)
+        for (size_t i = 0; i < prelude.n_pos_args + prelude.n_kwonly_args; ++i) {
+            *ct++ = (mp_uint_t)MP_OBJ_NEW_QSTR(load_qstr(reader, qw));
+        }
+
+        #if MICROPY_EMIT_NATIVE
+        if (kind != MP_CODE_BYTECODE) {
+            // Populate mp_fun_table entry
+            *ct++ = (mp_uint_t)(uintptr_t)mp_fun_table;
+        }
+        #endif
+
+        // Load constant objects and raw code children
+        for (size_t i = 0; i < n_obj; ++i) {
+            *ct++ = (mp_uint_t)load_obj(reader);
+        }
+        for (size_t i = 0; i < n_raw_code; ++i) {
+            *ct++ = (mp_uint_t)(uintptr_t)load_raw_code(reader, qw);
+        }
     }
 
-    // create raw_code and return it
+    // Create raw_code and return it
     mp_raw_code_t *rc = mp_emit_glue_new_raw_code();
-    mp_emit_glue_assign_bytecode(rc, bytecode,
-        #if MICROPY_PERSISTENT_CODE_SAVE || MICROPY_DEBUG_PRINTERS
-        bc_len,
-        #endif
-        const_table,
-        #if MICROPY_PERSISTENT_CODE_SAVE
-        n_obj, n_raw_code,
+    if (kind == MP_CODE_BYTECODE) {
+        mp_emit_glue_assign_bytecode(rc, fun_data,
+            #if MICROPY_PERSISTENT_CODE_SAVE || MICROPY_DEBUG_PRINTERS
+            fun_data_len,
+            #endif
+            const_table,
+            #if MICROPY_PERSISTENT_CODE_SAVE
+            n_obj, n_raw_code,
+            #endif
+            prelude.scope_flags);
+
+    #if MICROPY_EMIT_NATIVE
+    } else {
+        #if defined(MP_PLAT_COMMIT_EXEC)
+        fun_data = MP_PLAT_COMMIT_EXEC(fun_data, fun_data_len);
         #endif
-        prelude.scope_flags);
+
+        mp_emit_glue_assign_native(rc, kind,
+            fun_data, fun_data_len, const_table,
+            #if MICROPY_PERSISTENT_CODE_SAVE
+            prelude_offset,
+            n_obj, n_raw_code,
+            n_qstr_link, NULL,
+            #endif
+            prelude.n_pos_args, prelude.scope_flags, type_sig);
+    #endif
+    }
     return rc;
 }
 
@@ -322,6 +481,10 @@ mp_raw_code_t *mp_raw_code_load(mp_reader_t *reader) {
         || read_uint(reader, NULL) > QSTR_WINDOW_SIZE) {
         mp_raise_ValueError("incompatible .mpy file");
     }
+    if (MPY_FEATURE_DECODE_ARCH(header[2]) != MP_NATIVE_ARCH_NONE
+        && MPY_FEATURE_DECODE_ARCH(header[2]) != MPY_FEATURE_ARCH) {
+        mp_raise_ValueError("incompatible .mpy arch");
+    }
     qstr_window_t qw;
     qw.idx = 0;
     mp_raw_code_t *rc = load_raw_code(reader, &qw);
@@ -444,43 +607,110 @@ STATIC void save_bytecode(mp_print_t *print, qstr_window_t *qw, const byte *ip,
 }
 
 STATIC void save_raw_code(mp_print_t *print, mp_raw_code_t *rc, qstr_window_t *qstr_window) {
+    // Save function kind and data length
+    mp_print_uint(print, (rc->fun_data_len << 2) | (rc->kind - MP_CODE_BYTECODE));
+
+    const byte *ip2;
+    bytecode_prelude_t prelude;
+
+    if (rc->kind == MP_CODE_BYTECODE) {
+        // Save prelude
+        const byte *ip = rc->fun_data;
+        extract_prelude(&ip, &ip2, &prelude);
+        size_t prelude_len = ip - (const byte*)rc->fun_data;
+        const byte *ip_top = (const byte*)rc->fun_data + rc->fun_data_len;
+        mp_print_bytes(print, rc->fun_data, prelude_len);
+
+        // Save bytecode
+        save_bytecode(print, qstr_window, ip, ip_top);
+    } else {
+        // Save native code
+        mp_print_bytes(print, rc->fun_data, rc->fun_data_len);
+
+        if (rc->kind == MP_CODE_NATIVE_PY || rc->kind == MP_CODE_NATIVE_VIPER) {
+            // Save qstr link table for native code
+            mp_print_uint(print, rc->n_qstr);
+            for (size_t i = 0; i < rc->n_qstr; ++i) {
+                mp_print_uint(print, rc->qstr_link[i].off);
+                save_qstr(print, qstr_window, rc->qstr_link[i].qst);
+            }
+        }
+
+        if (rc->kind == MP_CODE_NATIVE_PY) {
+            // Save prelude size, and extract prelude for later use
+            mp_print_uint(print, rc->prelude_offset);
+            const byte *ip = (const byte*)rc->fun_data + rc->prelude_offset;
+            extract_prelude(&ip, &ip2, &prelude);
+        } else {
+            // Save basic scope info for viper and asm
+            mp_print_uint(print, rc->scope_flags);
+            prelude.n_pos_args = 0;
+            prelude.n_kwonly_args = 0;
+            if (rc->kind == MP_CODE_NATIVE_ASM) {
+                mp_print_uint(print, rc->n_pos_args);
+                mp_print_uint(print, rc->type_sig);
+            }
+        }
+    }
+
+    if (rc->kind == MP_CODE_BYTECODE || rc->kind == MP_CODE_NATIVE_PY) {
+        // Save qstrs in prelude
+        save_qstr(print, qstr_window, ip2[0] | (ip2[1] << 8)); // simple_name
+        save_qstr(print, qstr_window, ip2[2] | (ip2[3] << 8)); // source_file
+    }
+
+    if (rc->kind != MP_CODE_NATIVE_ASM) {
+        // Save constant table for bytecode, native and viper
+
+        // Number of entries in constant table
+        mp_print_uint(print, rc->n_obj);
+        mp_print_uint(print, rc->n_raw_code);
+
+        const mp_uint_t *const_table = rc->const_table;
+
+        // Save function argument names (initial entries in const_table)
+        // (viper has n_pos_args=n_kwonly_args=0 so doesn't save any qstrs here)
+        for (size_t i = 0; i < prelude.n_pos_args + prelude.n_kwonly_args; ++i) {
+            mp_obj_t o = (mp_obj_t)*const_table++;
+            save_qstr(print, qstr_window, MP_OBJ_QSTR_VALUE(o));
+        }
+
+        if (rc->kind != MP_CODE_BYTECODE) {
+            // Skip saving mp_fun_table entry
+            ++const_table;
+        }
+
+        // Save constant objects and raw code children
+        for (size_t i = 0; i < rc->n_obj; ++i) {
+            save_obj(print, (mp_obj_t)*const_table++);
+        }
+        for (size_t i = 0; i < rc->n_raw_code; ++i) {
+            save_raw_code(print, (mp_raw_code_t*)(uintptr_t)*const_table++, qstr_window);
+        }
+    }
+}
+
+STATIC bool mp_raw_code_has_native(mp_raw_code_t *rc) {
     if (rc->kind != MP_CODE_BYTECODE) {
-        mp_raise_ValueError("can only save bytecode");
+        return true;
     }
 
-    // extract prelude
     const byte *ip = rc->fun_data;
     const byte *ip2;
     bytecode_prelude_t prelude;
     extract_prelude(&ip, &ip2, &prelude);
 
-    // save prelude
-    size_t prelude_len = ip - rc->fun_data;
-    const byte *ip_top = rc->fun_data + rc->fun_data_len;
-    mp_print_uint(print, rc->fun_data_len);
-    mp_print_bytes(print, rc->fun_data, prelude_len);
-
-    // save bytecode
-    save_bytecode(print, qstr_window, ip, ip_top);
-
-    // save qstrs
-    save_qstr(print, qstr_window, ip2[0] | (ip2[1] << 8)); // simple_name
-    save_qstr(print, qstr_window, ip2[2] | (ip2[3] << 8)); // source_file
-
-    // save constant table
-    mp_print_uint(print, rc->n_obj);
-    mp_print_uint(print, rc->n_raw_code);
-    const mp_uint_t *const_table = rc->const_table;
-    for (uint i = 0; i < prelude.n_pos_args + prelude.n_kwonly_args; ++i) {
-        mp_obj_t o = (mp_obj_t)*const_table++;
-        save_qstr(print, qstr_window, MP_OBJ_QSTR_VALUE(o));
-    }
-    for (uint i = 0; i < rc->n_obj; ++i) {
-        save_obj(print, (mp_obj_t)*const_table++);
-    }
-    for (uint i = 0; i < rc->n_raw_code; ++i) {
-        save_raw_code(print, (mp_raw_code_t*)(uintptr_t)*const_table++, qstr_window);
+    const mp_uint_t *const_table = rc->const_table
+        + prelude.n_pos_args + prelude.n_kwonly_args
+        + rc->n_obj;
+
+    for (size_t i = 0; i < rc->n_raw_code; ++i) {
+        if (mp_raw_code_has_native((mp_raw_code_t*)(uintptr_t)*const_table++)) {
+            return true;
+        }
     }
+
+    return false;
 }
 
 void mp_raw_code_save(mp_raw_code_t *rc, mp_print_t *print) {
@@ -500,6 +730,9 @@ void mp_raw_code_save(mp_raw_code_t *rc, mp_print_t *print) {
         mp_small_int_bits(),
         #endif
     };
+    if (mp_raw_code_has_native(rc)) {
+        header[2] |= MPY_FEATURE_ENCODE_ARCH(MPY_FEATURE_ARCH);
+    }
     mp_print_bytes(print, header, sizeof(header));
     mp_print_uint(print, QSTR_WINDOW_SIZE);
 
diff --git a/py/persistentcode.h b/py/persistentcode.h
index d04e0b633089ffe5346173f85be5fc36069c89f6..b27c3de2f046abe20dcaefafd7346891574035b8 100644
--- a/py/persistentcode.h
+++ b/py/persistentcode.h
@@ -30,6 +30,19 @@
 #include "py/reader.h"
 #include "py/emitglue.h"
 
+enum {
+    MP_NATIVE_ARCH_NONE = 0,
+    MP_NATIVE_ARCH_X86,
+    MP_NATIVE_ARCH_X64,
+    MP_NATIVE_ARCH_ARMV6,
+    MP_NATIVE_ARCH_ARMV6M,
+    MP_NATIVE_ARCH_ARMV7M,
+    MP_NATIVE_ARCH_ARMV7EM,
+    MP_NATIVE_ARCH_ARMV7EMSP,
+    MP_NATIVE_ARCH_ARMV7EMDP,
+    MP_NATIVE_ARCH_XTENSA,
+};
+
 mp_raw_code_t *mp_raw_code_load(mp_reader_t *reader);
 mp_raw_code_t *mp_raw_code_load_mem(const byte *buf, size_t len);
 mp_raw_code_t *mp_raw_code_load_file(const char *filename);