diff --git a/stm/Makefile b/stm/Makefile
index 6199b89d048d59ecb30a0f28bf50c0b55bff2549..ece07af5301e22e2b964af9ab5d6d79127866960 100644
--- a/stm/Makefile
+++ b/stm/Makefile
@@ -29,6 +29,7 @@ SRC_C = \
 
 SRC_S = \
 	startup_stm32f40xx.s \
+	gchelper.s \
 
 PY_O = \
 	nlrthumb.o \
@@ -128,6 +129,10 @@ $(BUILD)/%.o: $(PYSRC)/%.c mpyconfig.h
 $(BUILD)/emitnthumb.o: $(PYSRC)/emitnative.c $(PYSRC)/emit.h
 	$(CC) $(CFLAGS) -DN_THUMB -c -o $@ $<
 
+# optimising gc for speed by building gc.c with -O3; 5ms down to 4ms
+$(BUILD)/gc.o: $(PYSRC)/gc.c
+	$(CC) $(CFLAGS) -O3 -c -o $@ $<
+
 # optimising vm for speed, adds only a small amount to code size but makes a huge difference to speed (20% faster)
 $(BUILD)/vm.o: $(PYSRC)/vm.c
 	$(CC) $(CFLAGS) -O3 -c -o $@ $<
diff --git a/stm/lexerstm.c b/stm/lexerstm.c
index 9757da09b57b2c1d164912cd2ef6bf534eb3e641..06ea04d1b977aa09126973de4527af271393e7ac 100644
--- a/stm/lexerstm.c
+++ b/stm/lexerstm.c
@@ -36,6 +36,9 @@ unichar file_buf_next_char(py_lexer_file_buf_t *fb) {
         } else {
             UINT n;
             f_read(&fb->fp, fb->buf, sizeof(fb->buf), &n);
+            if (n == 0) {
+                return PY_LEXER_CHAR_EOF;
+            }
             fb->len = n;
             fb->pos = 0;
         }
diff --git a/stm/lib/usb_regs.h b/stm/lib/usb_regs.h
index 323e870560aa334f60a425300422b5a59c2df2a1..d3ea8d821a77225f7b42e867035f6a9e5c3cc1f7 100644
--- a/stm/lib/usb_regs.h
+++ b/stm/lib/usb_regs.h
@@ -64,7 +64,7 @@
 #define USB_OTG_DATA_FIFO_SIZE               0x1000
 
 
-#define USB_OTG_MAX_TX_FIFOS                 15
+#define USB_OTG_MAX_TX_FIFOS                 4 // XXX check we can make it this small!
 
 #define USB_OTG_HS_MAX_PACKET_SIZE           512
 #define USB_OTG_FS_MAX_PACKET_SIZE           64
diff --git a/stm/lib/usbd_conf.h b/stm/lib/usbd_conf.h
index 60243d2f6af14e5c1b5decce7c48efd42654ea3b..ad4cd73c85f2808958c4e0f4160c8be38fa7c9e9 100644
--- a/stm/lib/usbd_conf.h
+++ b/stm/lib/usbd_conf.h
@@ -18,6 +18,6 @@
 #define MSC_IN_EP                   0x83
 #define MSC_OUT_EP                  0x03
 #define MSC_MAX_PACKET              64
-#define MSC_MEDIA_PACKET            4096
+#define MSC_MEDIA_PACKET            2048 /* XXX was 4096; how small can we make it? */
 
 #endif //__USBD_CONF__H__
diff --git a/stm/main.c b/stm/main.c
index d49c2398b53a7a6475905f7a22efe66bdcd02e2c..054f29b5f42eb35ed9938cc35819bd4804bf830c 100644
--- a/stm/main.c
+++ b/stm/main.c
@@ -217,6 +217,16 @@ char *readline(const char *prompt) {
     return NULL;
 }
 
+/*
+void gc_print_info() {
+    gc_info_t info;
+    gc_info(&info);
+    printf("! %lu total\n", info.total);
+    printf("! %lu : %lu\n", info.used, info.free);
+    printf("! 1=%lu 2=%lu m=%lu\n", info.num_1block, info.num_2block, info.max_block);
+}
+*/
+
 void do_repl() {
     usb_vcp_send_str("Micro Python\r\n");
 
@@ -266,12 +276,32 @@ void do_repl() {
     }
 }
 
+#define RAM_START (0x20000000) // lowest address scanned for GC roots; fixed for chip
+#define HEAP_END  (0x2001c000) // end of the GC heap handed to gc_init, and start of the upper region scanned for roots; tunable
+#define RAM_END   (0x20020000) // end of the upper region scanned for roots; fixed for chip
+
+void gc_helper_get_regs_and_clean_stack(machine_uint_t *regs, machine_uint_t heap_end); // defined elsewhere; presumably in gchelper.s -- TODO confirm
+
 void gc_collect() {
+    uint32_t start = sys_tick_counter; // tick count at entry, used below to time the collection
     gc_collect_start();
-    gc_collect_root((void**)0x20000000, (((uint32_t)&_heap_start) - 0x20000000) / 4);
-    gc_collect_root((void**)(0x20000000 + 0x18000), (0x20000 - 0x18000) / 4);
-    // TODO registers
+    gc_collect_root((void**)RAM_START, (((uint32_t)&_heap_start) - RAM_START) / 4); // roots in RAM_START.._heap_start; length is the byte span divided by 4
+    machine_uint_t regs[10]; // local buffer handed to the helper; presumably receives register values -- confirm against the helper
+    gc_helper_get_regs_and_clean_stack(regs, HEAP_END);
+    gc_collect_root((void**)HEAP_END, (RAM_END - HEAP_END) / 4); // will trace regs since they now live in this function on the stack
     gc_collect_end();
+    uint32_t ticks = sys_tick_counter - start; // TODO implement a function that does this properly
+    gc_info_t info;
+    gc_info(&info); // fetch heap statistics for the report printed below
+    printf("GC@%lu %lums\n", start, ticks); // start tick and elapsed ticks (printed with an "ms" suffix)
+    printf(" %lu total\n", info.total);
+    printf(" %lu : %lu\n", info.used, info.free); // used : free
+    printf(" 1=%lu 2=%lu m=%lu\n", info.num_1block, info.num_2block, info.max_block);
+}
+
+py_obj_t py_gc_collect() { // zero-argument wrapper around gc_collect; stored under the name "gc" in main()
+    gc_collect();
+    return py_const_none; // always returns py_const_none
 }
 
 int main() {
@@ -296,14 +326,14 @@ int main() {
     storage_init();
 
     // GC init
-    gc_init(&_heap_start, (void*)(0x20000000 + 0x18000));
-    sys_tick_delay_ms(2000);
+    gc_init(&_heap_start, (void*)HEAP_END);
 
     // Python init
     qstr_init();
     rt_init();
 
     // add some functions to the python namespace
+    rt_store_name(qstr_from_str_static("gc"), rt_make_function_0(py_gc_collect));
     rt_store_name(qstr_from_str_static("pyb_delay"), rt_make_function_1(pyb_delay));
     rt_store_name(qstr_from_str_static("pyb_led"), rt_make_function_1(pyb_led));
     rt_store_name(qstr_from_str_static("pyb_sw"), rt_make_function_0(pyb_sw));
@@ -373,14 +403,28 @@ int main() {
 
     // run /boot.py
     if (0) {
-        FIL fp;
-        f_open(&fp, "0:/boot.py", FA_READ);
-        UINT n;
-        char buf[20];
-        f_read(&fp, buf, 18, &n);
-        buf[n + 1] = 0;
-        printf("read %d\n%s", n, buf);
-        f_close(&fp);
+        py_lexer_file_buf_t fb;
+        py_lexer_t *lex = py_lexer_new_from_file("0:/boot.py", &fb);
+        py_parse_node_t pn = py_parse(lex, PY_PARSE_FILE_INPUT);
+        py_lexer_free(lex);
+
+        if (pn != PY_PARSE_NODE_NULL) {
+            bool comp_ok = py_compile(pn, true);
+            if (comp_ok) {
+                py_obj_t module_fun = rt_make_function_from_id(1);
+                if (module_fun != py_const_none) {
+                    nlr_buf_t nlr;
+                    if (nlr_push(&nlr) == 0) {
+                        rt_call_function_0(module_fun);
+                        nlr_pop();
+                    } else {
+                        // uncaught exception
+                        py_obj_print((py_obj_t)nlr.ret_val);
+                        printf("\n");
+                    }
+                }
+            }
+        }
     }
 
     // turn boot-up LED off
@@ -425,6 +469,7 @@ int main() {
             "        x = x + 1\n"
             "f()\n";
             */
+            /*
             "print('in python!')\n"
             "x = 0\n"
             "while x < 4:\n"
@@ -436,6 +481,7 @@ int main() {
             "print('press me!')\n"
             "while True:\n"
             "    pyb_led(pyb_sw())\n";
+            */
             /*
             // impl16.py
             "@micropython.asm_thumb\n"
@@ -472,7 +518,6 @@ int main() {
             "except:\n"
             "    print(x)\n";
             */
-            /*
             // impl19.py
             "# for loop\n"
             "def f():\n"
@@ -481,7 +526,6 @@ int main() {
             "            for z in range(400):\n"
             "                pass\n"
             "f()\n";
-            */
 
         py_lexer_str_buf_t py_lexer_str_buf;
         py_lexer_t *lex = py_lexer_new_from_str_len("<stdin>", pysrc, strlen(pysrc), false, &py_lexer_str_buf);
diff --git a/stm/malloc0.c b/stm/malloc0.c
index 686dfbf4be6917ea63a83be7cd626afbe7ab31cf..8827e82a8c1df97aab3f062e7d143f5c7cc9908e 100644
--- a/stm/malloc0.c
+++ b/stm/malloc0.c
@@ -42,6 +42,7 @@ void *malloc(size_t n) {
 }
 
 void free(void *ptr) {
+    gc_free(ptr); // forwards to gc_free (free() had an empty body before this change); NOTE(review): confirm gc_free tolerates NULL, since free(NULL) must be a no-op
 }
 
 void *realloc(void *ptr, size_t n) {
diff --git a/stm/printf.c b/stm/printf.c
index 821b790b4b8de0c1b22fa41a543b920c427142f5..3ccdd7084ba813694e8cd284063162fe244ddb41 100644
--- a/stm/printf.c
+++ b/stm/printf.c
@@ -214,7 +214,7 @@ int pfenv_printf(const pfenv_t *pfenv, const char *fmt, va_list args) {
 void stdout_print_strn(void *data, const char *str, unsigned int len) {
     // send stdout to LCD and USB CDC VCP
     if (usb_vcp_is_enabled()) {
-        usb_vcp_send_strn(str, len);
+        usb_vcp_send_strn_cooked(str, len);
     } else {
         lcd_print_strn(str, len);
     }
diff --git a/stm/usb.c b/stm/usb.c
index 4e8b454f190c8f196fe32f35226888ee50330288..b4fadf26daf77035df498bcaaa26dc2211c43085 100644
--- a/stm/usb.c
+++ b/stm/usb.c
@@ -75,3 +75,25 @@ void usb_vcp_send_strn(const char *str, int len) {
         VCP_fops.pIf_DataTx((const uint8_t*)str, len);
     }
 }
+
+#include "lib/usbd_conf.h"
+
+/* These are external variables imported from CDC core to be used for IN 
+   transfer management. */
+extern uint8_t  APP_Rx_Buffer []; /* Write CDC received data in this buffer.
+                                     These data will be sent over USB IN endpoint
+                                     in the CDC core functions. */
+extern uint32_t APP_Rx_ptr_in;    /* Increment this pointer or roll it back to
+                                     start address when writing received data
+                                     in the buffer APP_Rx_Buffer. */
+
+void usb_vcp_send_strn_cooked(const char *str, int len) { // copies len bytes of str into APP_Rx_Buffer, inserting '\r' before every '\n'
+    for (const char *top = str + len; str < top; str++) { // top is one past the last byte to copy
+        if (*str == '\n') {
+            APP_Rx_Buffer[APP_Rx_ptr_in] = '\r'; // emit CR ahead of the LF written below
+            APP_Rx_ptr_in = (APP_Rx_ptr_in + 1) & (APP_RX_DATA_SIZE - 1); // wrap by masking; only correct if APP_RX_DATA_SIZE is a power of two -- confirm
+        }
+        APP_Rx_Buffer[APP_Rx_ptr_in] = *str; // NOTE(review): nothing here checks whether this slot still holds unsent data
+        APP_Rx_ptr_in = (APP_Rx_ptr_in + 1) & (APP_RX_DATA_SIZE - 1);
+    }
+}
diff --git a/stm/usb.h b/stm/usb.h
index 75b7bb3464fbd1c879922573db699d440f8b632d..da9d94306426a9abb2fcbed2a505e00574c36b88 100644
--- a/stm/usb.h
+++ b/stm/usb.h
@@ -4,3 +4,4 @@ int usb_vcp_rx_any();
 char usb_vcp_rx_get();
 void usb_vcp_send_str(const char* str);
 void usb_vcp_send_strn(const char* str, int len);
+void usb_vcp_send_strn_cooked(const char *str, int len);