diff --git a/py/bc.h b/py/bc.h
index 1a5bcd9b35862b23f89151599a231a43f921d9df..7ea2235f70928fe306ed6969df74743888b34d49 100644
--- a/py/bc.h
+++ b/py/bc.h
@@ -93,4 +93,5 @@
 #define PYBC_IMPORT_FROM (0xe1)
 #define PYBC_IMPORT_STAR (0xe2)
 
-py_obj_t py_execute_byte_code(const byte *code, uint len, const py_obj_t *args, uint n_args);
+py_obj_t py_execute_byte_code(const byte *code, const py_obj_t *args, uint n_args);
+bool py_execute_byte_code_2(const byte *code, const byte **ip_in_out, py_obj_t *fastn, py_obj_t **sp_in_out);
diff --git a/py/emitbc.c b/py/emitbc.c
index 8f28910c089afabf35242a41f65395a35e09927a..06c52a0dc1eee938be214a6a7d2cf996a58de203 100644
--- a/py/emitbc.c
+++ b/py/emitbc.c
@@ -76,7 +76,7 @@ static void emit_bc_end_pass(emit_t *emit) {
         printf("code_size: %u\n", emit->code_size);
 
     } else if (emit->pass == PASS_3) {
-        rt_assign_byte_code(emit->scope->unique_code_id, emit->code_base, emit->code_size, emit->scope->num_params);
+        rt_assign_byte_code(emit->scope->unique_code_id, emit->code_base, emit->code_size, emit->scope->num_params, emit->scope->num_locals, emit->scope->stack_size, (emit->scope->flags & SCOPE_FLAG_GENERATOR) != 0);
     }
 }
 
diff --git a/py/runtime.c b/py/runtime.c
index e80791df3388d1ab1cd19c8c51f87de469846586..0b76df8a5b60ae3089cccc96aa8976b21638da08 100644
--- a/py/runtime.c
+++ b/py/runtime.c
@@ -1,5 +1,6 @@
 // in principle, rt_xxx functions are called only by vm/native/viper and make assumptions about args
 // py_xxx functions are safer and can be called by anyone
+// note that the rt_assign_xxx functions are called only from emit*; maybe they should be renamed to reflect this
 
 #include <stdint.h>
 #include <stdlib.h>
@@ -50,6 +51,8 @@ typedef enum {
     O_FUN_N,
     O_FUN_BC,
     O_FUN_ASM,
+    O_GEN_WRAP,
+    O_GEN_INSTANCE,
     O_BOUND_METH,
     O_TUPLE,
     O_LIST,
@@ -123,6 +126,15 @@ struct _py_obj_base_t {
             int n_args;
             void *fun;
         } u_fun_asm;
+        struct { // for O_GEN_WRAP
+            int n_state;
+            py_obj_base_t *fun;
+        } u_gen_wrap;
+        struct { // for O_GEN_INSTANCE
+            py_obj_t *state;
+            const byte *ip;
+            py_obj_t *sp;
+        } u_gen_instance;
         struct { // for O_BOUND_METH
             py_obj_t meth;
             py_obj_t self;
@@ -367,10 +379,20 @@ py_obj_t rt_list_append(py_obj_t self_in, py_obj_t arg) {
     return arg;
 }
 
+py_obj_t rt_gen_instance_next(py_obj_t self_in) { // __next__ for generator instances (bound in rt_load_method)
+    py_obj_t ret = rt_iternext(self_in); // resume the generator; yields a value or the end-of-iteration sentinel
+    if (ret == py_const_stop_iteration) {
+        nlr_jump(py_obj_new_exception_0(qstr_from_str_static("StopIteration"))); // exhausted: raise StopIteration instead of returning the sentinel
+    } else {
+        return ret; // value produced by rt_iternext for this step
+    }
+}
+
 static qstr q_append;
 static qstr q_print;
 static qstr q_len;
 static qstr q___build_class__;
+static qstr q___next__;
 static qstr q_AttributeError;
 static qstr q_IndexError;
 static qstr q_NameError;
@@ -386,6 +408,9 @@ typedef enum {
 typedef struct _py_code_t {
     py_code_kind_t kind;
     int n_args;
+    int n_locals;
+    int n_stack;
+    bool is_generator;
     union {
         struct {
             byte *code;
@@ -404,6 +429,7 @@ static int next_unique_code_id;
 static py_code_t *unique_codes;
 
 py_obj_t fun_list_append;
+py_obj_t fun_gen_instance_next;
 
 py_obj_t py_builtin_print(py_obj_t o) {
     if (IS_O(o, O_STR)) {
@@ -463,6 +489,7 @@ void rt_init() {
     q_print = qstr_from_str_static("print");
     q_len = qstr_from_str_static("len");
     q___build_class__ = qstr_from_str_static("__build_class__");
+    q___next__ = qstr_from_str_static("__next__");
     q_AttributeError = qstr_from_str_static("AttributeError");
     q_IndexError = qstr_from_str_static("IndexError");
     q_NameError = qstr_from_str_static("NameError");
@@ -487,6 +514,7 @@ void rt_init() {
     unique_codes = NULL;
 
     fun_list_append = rt_make_function_2(rt_list_append);
+    fun_gen_instance_next = rt_make_function_1(rt_gen_instance_next);
 
 #ifdef WRITE_NATIVE
     fp_native = fopen("out-native", "wb");
@@ -514,12 +542,15 @@ static void alloc_unique_codes() {
     }
 }
 
-void rt_assign_byte_code(int unique_code_id, byte *code, uint len, int n_args) {
+void rt_assign_byte_code(int unique_code_id, byte *code, uint len, int n_args, int n_locals, int n_stack, bool is_generator) {
     alloc_unique_codes();
 
     assert(unique_code_id < next_unique_code_id);
     unique_codes[unique_code_id].kind = PY_CODE_BYTE;
     unique_codes[unique_code_id].n_args = n_args;
+    unique_codes[unique_code_id].n_locals = n_locals;
+    unique_codes[unique_code_id].n_stack = n_stack;
+    unique_codes[unique_code_id].is_generator = is_generator;
     unique_codes[unique_code_id].u_byte.code = code;
     unique_codes[unique_code_id].u_byte.len = len;
 
@@ -532,6 +563,9 @@ void rt_assign_native_code(int unique_code_id, py_fun_t fun, uint len, int n_arg
     assert(1 <= unique_code_id && unique_code_id < next_unique_code_id);
     unique_codes[unique_code_id].kind = PY_CODE_NATIVE;
     unique_codes[unique_code_id].n_args = n_args;
+    unique_codes[unique_code_id].n_locals = 0;
+    unique_codes[unique_code_id].n_stack = 0;
+    unique_codes[unique_code_id].is_generator = false;
     unique_codes[unique_code_id].u_native.fun = fun;
 
 #ifdef DEBUG_PRINT
@@ -560,6 +594,9 @@ void rt_assign_inline_asm_code(int unique_code_id, py_fun_t fun, uint len, int n
     assert(1 <= unique_code_id && unique_code_id < next_unique_code_id);
     unique_codes[unique_code_id].kind = PY_CODE_INLINE_ASM;
     unique_codes[unique_code_id].n_args = n_args;
+    unique_codes[unique_code_id].n_locals = 0;
+    unique_codes[unique_code_id].n_stack = 0;
+    unique_codes[unique_code_id].is_generator = false;
     unique_codes[unique_code_id].u_inline_asm.fun = fun;
 
 #ifdef DEBUG_PRINT
@@ -625,6 +662,8 @@ const char *py_obj_get_type_str(py_obj_t o_in) {
             case O_FUN_N:
             case O_FUN_BC:
                 return "function";
+            case O_GEN_INSTANCE:
+                return "generator";
             case O_TUPLE:
                 return "tuple";
             case O_LIST:
@@ -669,10 +708,16 @@ void py_obj_print(py_obj_t o_in) {
                 printf("%f", o->u_flt);
                 break;
 #endif
+            case O_EXCEPTION_0:
+                printf("%s", qstr_str(o->u_exc0.id));
+                break;
             case O_EXCEPTION_2:
                 printf("%s: ", qstr_str(o->u_exc2.id));
                 printf(o->u_exc2.fmt, o->u_exc2.s1, o->u_exc2.s2);
                 break;
+            case O_GEN_INSTANCE:
+                printf("<generator object 'fun-name' at %p>", o);
+                break;
             case O_TUPLE:
                 printf("(");
                 for (int i = 0; i < o->u_tuple_list.len; i++) {
@@ -861,7 +906,8 @@ py_obj_t rt_binary_op(int op, py_obj_t lhs, py_obj_t rhs) {
         switch (op) {
             case RT_BINARY_OP_ADD:
             case RT_BINARY_OP_INPLACE_ADD: val = FROM_SMALL_INT(lhs) + FROM_SMALL_INT(rhs); break;
-            case RT_BINARY_OP_SUBTRACT: val = FROM_SMALL_INT(lhs) - FROM_SMALL_INT(rhs); break;
+            case RT_BINARY_OP_SUBTRACT:
+            case RT_BINARY_OP_INPLACE_SUBTRACT: val = FROM_SMALL_INT(lhs) - FROM_SMALL_INT(rhs); break;
             case RT_BINARY_OP_MULTIPLY: val = FROM_SMALL_INT(lhs) * FROM_SMALL_INT(rhs); break;
             case RT_BINARY_OP_FLOOR_DIVIDE: val = FROM_SMALL_INT(lhs) / FROM_SMALL_INT(rhs); break;
 #if MICROPY_ENABLE_FLOAT
@@ -938,6 +984,17 @@ py_obj_t rt_make_function_from_id(int unique_code_id) {
         default:
             assert(0);
     }
+
+    // check for generator functions and if so wrap in generator object
+    if (c->is_generator) {
+        py_obj_base_t *o2 = m_new(py_obj_base_t, 1);
+        o2->kind = O_GEN_WRAP;
+        // we have at least 3 locals so the bc can write back fast[0,1,2] safely; should improve how this is done
+        o2->u_gen_wrap.n_state = (c->n_locals < 3 ? 3 : c->n_locals) + c->n_stack;
+        o2->u_gen_wrap.fun = o;
+        o = o2;
+    }
+
     return o;
 }
 
@@ -1071,7 +1128,7 @@ py_obj_t rt_call_function_n(py_obj_t fun, int n_args, const py_obj_t *args) {
             goto bad_n_args;
         }
         DEBUG_OP_printf("calling byte code %p(n_args=%d)\n", o->u_fun_bc.code, n_args);
-        return py_execute_byte_code(o->u_fun_bc.code, o->u_fun_bc.len, args, n_args);
+        return py_execute_byte_code(o->u_fun_bc.code, args, n_args);
 
     } else if (IS_O(fun, O_FUN_ASM)) {
         py_obj_base_t *o = fun;
@@ -1095,6 +1152,28 @@ py_obj_t rt_call_function_n(py_obj_t fun, int n_args, const py_obj_t *args) {
         }
         return rt_convert_val_from_inline_asm(ret);
 
+    } else if (IS_O(fun, O_GEN_WRAP)) {
+        py_obj_base_t *o = fun;
+        py_obj_base_t *o_fun = o->u_gen_wrap.fun;
+        assert(o_fun->kind == O_FUN_BC); // TODO
+        if (n_args != o_fun->u_fun_bc.n_args) {
+            n_args_fun = o_fun->u_fun_bc.n_args;
+            goto bad_n_args;
+        }
+        py_obj_t *state = m_new(py_obj_t, 1 + o->u_gen_wrap.n_state);
+        // put function object at first slot in state (to keep u_gen_instance small)
+        state[0] = o_fun;
+        // init args
+        for (int i = 0; i < n_args; i++) {
+            state[1 + i] = args[n_args - 1 - i];
+        }
+        py_obj_base_t *o2 = m_new(py_obj_base_t, 1);
+        o2->kind = O_GEN_INSTANCE;
+        o2->u_gen_instance.state = state;
+        o2->u_gen_instance.ip = o_fun->u_fun_bc.code;
+        o2->u_gen_instance.sp = state + o->u_gen_wrap.n_state;
+        return o2;
+
     } else if (IS_O(fun, O_BOUND_METH)) {
         py_obj_base_t *o = fun;
         DEBUG_OP_printf("calling bound method %p(self=%p, n_args=%d)\n", o->u_bound_meth.meth, o->u_bound_meth.self, n_args);
@@ -1132,9 +1211,7 @@ py_obj_t rt_call_function_n(py_obj_t fun, int n_args, const py_obj_t *args) {
     }
 
 bad_n_args:
-    printf("TypeError: function takes %d positional arguments but %d were given\n", n_args_fun, n_args);
-    assert(0);
-    return py_const_none;
+    nlr_jump(py_obj_new_exception_2(q_TypeError, "function takes %d positional arguments but %d were given", (const char*)(machine_int_t)n_args_fun, (const char*)(machine_int_t)n_args));
 }
 
 // args contains: arg(n_args-1)  arg(n_args-2)  ...  arg(0)  self/NULL  fun
@@ -1287,7 +1364,11 @@ no_attr:
 
 void rt_load_method(py_obj_t base, qstr attr, py_obj_t *dest) {
     DEBUG_OP_printf("load method %s\n", qstr_str(attr));
-    if (IS_O(base, O_LIST) && attr == q_append) {
+    if (IS_O(base, O_GEN_INSTANCE) && attr == q___next__) {
+        dest[1] = fun_gen_instance_next;
+        dest[0] = base;
+        return;
+    } else if (IS_O(base, O_LIST) && attr == q_append) {
         dest[1] = fun_list_append;
         dest[0] = base;
         return;
@@ -1354,7 +1435,9 @@ void rt_store_subscr(py_obj_t base, py_obj_t index, py_obj_t value) {
 }
 
 py_obj_t rt_getiter(py_obj_t o_in) {
-    if (IS_O(o_in, O_RANGE)) {
+    if (IS_O(o_in, O_GEN_INSTANCE)) {
+        return o_in;
+    } else if (IS_O(o_in, O_RANGE)) {
         py_obj_base_t *o = o_in;
         return py_obj_new_range_iterator(o->u_range.start, o->u_range.stop, o->u_range.step);
     } else if (IS_O(o_in, O_TUPLE)) {
@@ -1367,7 +1450,23 @@ py_obj_t rt_getiter(py_obj_t o_in) {
 }
 
 py_obj_t rt_iternext(py_obj_t o_in) {
-    if (IS_O(o_in, O_RANGE_IT)) {
+    if (IS_O(o_in, O_GEN_INSTANCE)) {
+        py_obj_base_t *self = o_in;
+        py_obj_base_t *fun = self->u_gen_instance.state[0];
+        assert(fun->kind == O_FUN_BC);
+        bool yield = py_execute_byte_code_2(fun->u_fun_bc.code, &self->u_gen_instance.ip, &self->u_gen_instance.state[1], &self->u_gen_instance.sp);
+        if (yield) {
+            return *self->u_gen_instance.sp;
+        } else {
+            if (*self->u_gen_instance.sp == py_const_none) {
+                return py_const_stop_iteration;
+            } else {
+                // TODO return StopIteration with value *self->u_gen_instance.sp
+                return py_const_stop_iteration;
+            }
+        }
+
+    } else if (IS_O(o_in, O_RANGE_IT)) {
         py_obj_base_t *o = o_in;
         if ((o->u_range_it.step > 0 && o->u_range_it.cur < o->u_range_it.stop) || (o->u_range_it.step < 0 && o->u_range_it.cur > o->u_range_it.stop)) {
             py_obj_t o_out = TO_SMALL_INT(o->u_range_it.cur);
@@ -1376,6 +1475,7 @@ py_obj_t rt_iternext(py_obj_t o_in) {
         } else {
             return py_const_stop_iteration;
         }
+
     } else if (IS_O(o_in, O_TUPLE_IT) || IS_O(o_in, O_LIST_IT)) {
         py_obj_base_t *o = o_in;
         if (o->u_tuple_list_it.cur < o->u_tuple_list_it.obj->u_tuple_list.len) {
@@ -1385,6 +1485,7 @@ py_obj_t rt_iternext(py_obj_t o_in) {
         } else {
             return py_const_stop_iteration;
         }
+
     } else {
         nlr_jump(py_obj_new_exception_2(q_TypeError, "? '%s' object is not iterable", py_obj_get_type_str(o_in), NULL));
     }
diff --git a/py/runtime.h b/py/runtime.h
index e9adbe1f0e0a8c4d661d3c370a546b89040f4ad6..7a806eb55a6faa28e11d0217e627da817092a94c 100644
--- a/py/runtime.h
+++ b/py/runtime.h
@@ -87,7 +87,7 @@ extern py_obj_t py_const_stop_iteration; // special object indicating end of ite
 void rt_init();
 void rt_deinit();
 int rt_get_new_unique_code_id();
-void rt_assign_byte_code(int unique_code_id, byte *code, uint len, int n_args);
+void rt_assign_byte_code(int unique_code_id, byte *code, uint len, int n_args, int n_locals, int n_stack, bool is_generator);
 void rt_assign_native_code(int unique_code_id, py_fun_t f, uint len, int n_args);
 void rt_assign_inline_asm_code(int unique_code_id, py_fun_t f, uint len, int n_args);
 py_fun_t rt_get_code(qstr id);
diff --git a/py/vm.c b/py/vm.c
index 2821d40470dd198042447003e237d60ea91d36f2..a4fbf2f16c610b75f452b9acb51e9e31423deabe 100644
--- a/py/vm.c
+++ b/py/vm.c
@@ -10,43 +10,50 @@
 #include "runtime.h"
 #include "bc.h"
 
+// the value stack grows down (to be compatible with native code when passing pointers to the stack); sp points to its top element
+// the exception stack grows up; exc_sp points to its top element
+
 #define DECODE_UINT do { unum = *ip++; if (unum > 127) { unum = ((unum & 0x3f) << 8) | (*ip++); } } while (0)
 #define DECODE_QSTR do { qstr = *ip++; if (qstr > 127) { qstr = ((qstr & 0x3f) << 8) | (*ip++); } } while (0)
 #define PUSH(val) *--sp = (val)
 #define POP() (*sp++)
 
 // args are in reverse order in array
-py_obj_t py_execute_byte_code(const byte *code, uint len, const py_obj_t *args, uint n_args) {
+py_obj_t py_execute_byte_code(const byte *code, const py_obj_t *args, uint n_args) { // run a plain (non-generator) function to completion and return its result
+    py_obj_t state[18] = {NULL}; // NULL-filled: the VM loads fastn[0..2] on entry even when fewer than 3 args were passed; TODO allocate properly
+    // init args: args arrive in reverse order, locals are stored in normal order
+    for (uint i = 0; i < n_args; i++) {
+        assert(i < 8);
+        state[i] = args[n_args - 1 - i];
+    }
+    py_obj_t *sp = &state[18]; // empty value stack: it grows down from one past the end of state
+    const byte *ip = code;
+    if (py_execute_byte_code_2(code, &ip, &state[0], &sp)) {
+        // a true result means the code yielded, which a plain function call must never do
+        assert(0);
+    }
+    assert(sp == &state[17]); // exactly one value, the return value, is left on the stack
+    return *sp;
+}
+
+// fastn has items in normal order
+// sp points to top of stack which grows down
+bool py_execute_byte_code_2(const byte *code, const byte **ip_in_out, py_obj_t *fastn, py_obj_t **sp_in_out) {
     // careful: be sure to declare volatile any variables read in the exception handler (written is ok, I think)
 
-    const byte *ip = code;
-    py_obj_t stack[10];
-    py_obj_t *sp = &stack[10]; // stack grows down, sp points to top of stack
+    const byte *ip = *ip_in_out;
+    py_obj_t *sp = *sp_in_out;
     machine_uint_t unum;
     machine_int_t snum;
     qstr qstr;
     py_obj_t obj1, obj2;
-    py_obj_t fast0 = NULL, fast1 = NULL, fast2 = NULL, fastn[4] = {NULL, NULL, NULL, NULL};
+    py_obj_t fast0 = fastn[0], fast1 = fastn[1], fast2 = fastn[2];
     nlr_buf_t nlr;
 
     // on the exception stack we store (ip, sp) for each block
     machine_uint_t exc_stack[8];
     machine_uint_t *volatile exc_sp = &exc_stack[-1]; // stack grows up, exc_sp points to top of stack
 
-    // init args
-    for (int i = 0; i < n_args; i++) {
-        if (i == 0) {
-            fast0 = args[n_args - 1];
-        } else if (i == 1) {
-            fast1 = args[n_args - 2];
-        } else if (i == 2) {
-            fast2 = args[n_args - 3];
-        } else {
-            assert(i - 3 < 4);
-            fastn[i - 3] = args[n_args - 1 - i];
-        }
-    }
-
     // outer exception handling loop
     for (;;) {
         if (nlr_push(&nlr) == 0) {
@@ -99,7 +106,7 @@ py_obj_t py_execute_byte_code(const byte *code, uint len, const py_obj_t *args,
 
                     case PYBC_LOAD_FAST_N:
                         DECODE_UINT;
-                        PUSH(fastn[unum - 3]);
+                        PUSH(fastn[unum]);
                         break;
 
                     case PYBC_LOAD_NAME:
@@ -141,7 +148,7 @@ py_obj_t py_execute_byte_code(const byte *code, uint len, const py_obj_t *args,
 
                     case PYBC_STORE_FAST_N:
                         DECODE_UINT;
-                        fastn[unum - 3] = POP();
+                        fastn[unum] = POP();
                         break;
 
                     case PYBC_STORE_NAME:
@@ -251,7 +258,6 @@ py_obj_t py_execute_byte_code(const byte *code, uint len, const py_obj_t *args,
                         //exc_sp--; // discard ip
                         exc_sp -= 2;
                         //sp += 3; // pop 3 exception values
-                        assert(sp <= &stack[10]);
                         break;
 
                     case PYBC_BINARY_OP:
@@ -330,15 +336,24 @@ py_obj_t py_execute_byte_code(const byte *code, uint len, const py_obj_t *args,
 
                     case PYBC_RETURN_VALUE:
                         nlr_pop();
-                        assert(sp == &stack[9]);
+                        *sp_in_out = sp;
                         assert(exc_sp == &exc_stack[-1]);
-                        return *sp;
+                        return false;
+
+                    case PYBC_YIELD_VALUE:
+                        nlr_pop();
+                        *ip_in_out = ip;
+                        fastn[0] = fast0;
+                        fastn[1] = fast1;
+                        fastn[2] = fast2;
+                        *sp_in_out = sp;
+                        return true;
 
                     default:
                         printf("code %p, offset %u, byte code 0x%02x not implemented\n", code, (uint)(ip - code), op);
                         assert(0);
                         nlr_pop();
-                        return py_const_none;
+                        return false;
                 }
             }
 
@@ -355,6 +370,7 @@ py_obj_t py_execute_byte_code(const byte *code, uint len, const py_obj_t *args,
                 PUSH(py_const_none);
             } else {
                 // re-raise exception
+                // TODO what to do if this is a generator??
                 nlr_jump(nlr.ret_val);
             }
         }