diff --git a/py/gc.c b/py/gc.c
new file mode 100644
index 0000000000000000000000000000000000000000..b686822c4e147e76ccd4f4230a2c8ee90f7164e3
--- /dev/null
+++ b/py/gc.c
@@ -0,0 +1,332 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "mpyconfig.h"
+#include "gc.h"
+
+// a machine word is big enough to hold a pointer
+/*
+#define BYTES_PER_WORD (8)
+typedef unsigned long machine_uint_t;
+*/
+typedef unsigned char byte;
+
+#define BITS_PER_BYTE (8)
+#define BITS_PER_WORD (BITS_PER_BYTE * BYTES_PER_WORD) // BYTES_PER_WORD is not defined in this file; presumably it comes from mpyconfig.h
+#define WORDS_PER_BLOCK (4) // a block is the unit of allocation
+#define BYTES_PER_BLOCK (WORDS_PER_BLOCK * BYTES_PER_WORD)
+#define STACK_SIZE (64) // number of entries in the gc mark stack; tunable; minimum is 1
+
+static byte *gc_alloc_table_start; // first allocation table byte (ATB)
+static byte *gc_alloc_table_end; // one past the last ATB
+static machine_uint_t gc_alloc_table_byte_len; // number of ATBs; each one describes BLOCKS_PER_ATB blocks
+static machine_uint_t *gc_pool_start; // first word of the block pool
+static machine_uint_t *gc_pool_end; // one past the last word of the block pool
+
+static int gc_stack_overflow; // set when a newly marked block could not be pushed on gc_stack
+static machine_uint_t gc_stack[STACK_SIZE]; // block numbers that are marked but not yet traced
+static machine_uint_t *gc_sp; // next free slot in gc_stack
+
+// TODO waste less memory; currently requires that all entries in alloc_table have a corresponding block in pool
+void gc_init(void *start, void *end) {
+    // Lays out [start, end): alloc table (2 bits per block) at start, block pool ending at end.
+    // align end pointer on block boundary
+    end = (void*)((machine_uint_t)end & (~(BYTES_PER_BLOCK - 1)));
+    machine_uint_t total_word_len = (machine_uint_t*)end - (machine_uint_t*)start;
+    gc_alloc_table_byte_len = total_word_len * BYTES_PER_WORD / (1 + BITS_PER_BYTE / 2 * BYTES_PER_BLOCK);
+    gc_alloc_table_start = (byte*)start;
+    gc_alloc_table_end = gc_alloc_table_start + gc_alloc_table_byte_len;
+    machine_uint_t gc_pool_block_len = gc_alloc_table_byte_len * BITS_PER_BYTE / 2;
+    machine_uint_t gc_pool_word_len = gc_pool_block_len * WORDS_PER_BLOCK;
+    gc_pool_start = (machine_uint_t*)end - gc_pool_word_len;
+    gc_pool_end = end;
+
+    // every block must start out FREE, and the memory handed in is not guaranteed to be
+    // zeroed: clear the table so stale bits are never read as HEAD/TAIL/MARK entries
+    memset(gc_alloc_table_start, 0, gc_alloc_table_byte_len);
+    // machine_uint_t may be wider than unsigned int, so cast to match the %u conversion
+    printf("GC: %u bytes\n", (unsigned int)(gc_pool_word_len * BYTES_PER_WORD));
+}
+
+// ATB = allocation table byte
+// 0b00 = FREE -- free block
+// 0b01 = HEAD -- head of a chain of blocks
+// 0b10 = TAIL -- in the tail of a chain of blocks
+// 0b11 = MARK -- marked head block
+
+#define AT_FREE (0)
+#define AT_HEAD (1)
+#define AT_TAIL (2)
+#define AT_MARK (3)
+// one ATB holds the 2-bit kinds of BLOCKS_PER_ATB consecutive blocks, lowest block in the lowest bits
+#define BLOCKS_PER_ATB (4)
+#define ATB_MASK_0 (0x03)
+#define ATB_MASK_1 (0x0c)
+#define ATB_MASK_2 (0x30)
+#define ATB_MASK_3 (0xc0)
+// true when slot 0..3 of the ATB value a is AT_FREE
+#define ATB_0_IS_FREE(a) (((a) & ATB_MASK_0) == 0)
+#define ATB_1_IS_FREE(a) (((a) & ATB_MASK_1) == 0)
+#define ATB_2_IS_FREE(a) (((a) & ATB_MASK_2) == 0)
+#define ATB_3_IS_FREE(a) (((a) & ATB_MASK_3) == 0)
+// bit offset of a block's entry, its reader, and in-place transitions (X_TO_Y is only correct while the entry holds X)
+#define BLOCK_SHIFT(block) (2 * ((block) & (BLOCKS_PER_ATB - 1)))
+#define ATB_GET_KIND(block) ((gc_alloc_table_start[(block) / BLOCKS_PER_ATB] >> BLOCK_SHIFT(block)) & 3)
+#define ATB_ANY_TO_FREE(block) do { gc_alloc_table_start[(block) / BLOCKS_PER_ATB] &= (~(AT_MARK << BLOCK_SHIFT(block))); } while (0)
+#define ATB_FREE_TO_HEAD(block) do { gc_alloc_table_start[(block) / BLOCKS_PER_ATB] |= (AT_HEAD << BLOCK_SHIFT(block)); } while (0)
+#define ATB_FREE_TO_TAIL(block) do { gc_alloc_table_start[(block) / BLOCKS_PER_ATB] |= (AT_TAIL << BLOCK_SHIFT(block)); } while (0)
+#define ATB_HEAD_TO_MARK(block) do { gc_alloc_table_start[(block) / BLOCKS_PER_ATB] |= (AT_MARK << BLOCK_SHIFT(block)); } while (0)
+#define ATB_MARK_TO_HEAD(block) do { gc_alloc_table_start[(block) / BLOCKS_PER_ATB] &= (~(AT_TAIL << BLOCK_SHIFT(block))); } while (0)
+
+void gc_dump_at() {
+    // Debug aid: prints "block <index> <KIND>" for every allocation table entry.
+    for (machine_uint_t bl = 0; bl < gc_alloc_table_byte_len * BLOCKS_PER_ATB; bl++) {
+        printf("block %6u ", (unsigned int)bl); // no ' ' flag (undefined with %u); cast to match %u
+        switch (ATB_GET_KIND(bl)) {
+            case AT_FREE: printf("FREE\n"); break;
+            case AT_HEAD: printf("HEAD\n"); break;
+            case AT_TAIL: printf("TAIL\n"); break;
+            default: printf("MARK\n"); break;
+        }
+    }
+}
+
+#define BLOCK_FROM_PTR(ptr) (((ptr) - (machine_uint_t)gc_pool_start) / BYTES_PER_BLOCK) // pool address (as an integer) -> block number
+#define PTR_FROM_BLOCK(block) (((block) * BYTES_PER_BLOCK + (machine_uint_t)gc_pool_start)) // block number -> address of its first byte (as an integer)
+#define ATB_FROM_BLOCK(bl) ((bl) / BLOCKS_PER_ATB) // block number -> index of the ATB holding its entry
+
+#define VERIFY_MARK_AND_PUSH(ptr) \
+    do { /* if ptr addresses an unmarked head, mark it and queue it for tracing; ptr is evaluated several times */ \
+        if ( \
+            ((ptr) & (BYTES_PER_BLOCK - 1)) == 0        /* must be aligned on a block */ \
+            && (ptr) >= (machine_uint_t)gc_pool_start   /* must be above start of pool */ \
+            && (ptr) < (machine_uint_t)gc_pool_end      /* must be below end of pool */ \
+           ) { \
+            machine_uint_t _block = BLOCK_FROM_PTR(ptr); \
+            if (ATB_GET_KIND(_block) == AT_HEAD) { \
+                /* an unmarked head, mark it, and push it on gc stack */ \
+                ATB_HEAD_TO_MARK(_block); \
+                if (gc_sp < &gc_stack[STACK_SIZE]) { \
+                    *gc_sp++ = _block; \
+                } else { \
+                    gc_stack_overflow = 1; /* stack full: the block stays marked and is traced by a later rescan */ \
+                } \
+            } \
+        } \
+    } while (0)
+
+static void gc_drain_stack() {
+    // Pops and traces blocks until the gc stack is empty; tracing may push further blocks.
+    const machine_uint_t max_block = gc_alloc_table_byte_len * BLOCKS_PER_ATB;
+    while (gc_sp > gc_stack) {
+        machine_uint_t block = *--gc_sp; // pop the next block off the stack
+        // count the head and its tails; the bound keeps a chain that ends on the last
+        // block of the pool from reading the byte just past the alloc table
+        machine_uint_t n_blocks = 1;
+        while (block + n_blocks < max_block && ATB_GET_KIND(block + n_blocks) == AT_TAIL) {
+            n_blocks += 1;
+        }
+        // check this block's children: every word in the chain is a candidate pointer
+        machine_uint_t *scan = (machine_uint_t*)PTR_FROM_BLOCK(block);
+        for (machine_uint_t i = n_blocks * WORDS_PER_BLOCK; i > 0; i--, scan++) {
+            machine_uint_t ptr2 = *scan;
+            VERIFY_MARK_AND_PUSH(ptr2);
+        }
+    }
+}
+
+static void gc_deal_with_stack_overflow() {
+    // Re-trace every MARKed block until a whole pass completes without a new stack overflow.
+    const machine_uint_t n_total = gc_alloc_table_byte_len * BLOCKS_PER_ATB;
+    while (gc_stack_overflow != 0) {
+        gc_sp = &gc_stack[0];
+        gc_stack_overflow = 0;
+        for (machine_uint_t b = 0; b != n_total; ++b) {
+            if (ATB_GET_KIND(b) != AT_MARK) {
+                continue;
+            }
+            *gc_sp++ = b; // the stack is empty at this point, so the push always fits
+            gc_drain_stack();
+        }
+    }
+}
+
+static void gc_sweep() {
+    // One pass over the table: unmarked heads and the tails that follow them become
+    // FREE, while marked heads revert to HEAD and keep their tails.
+    const machine_uint_t n_total = gc_alloc_table_byte_len * BLOCKS_PER_ATB;
+    int freeing = 0; // non-zero while walking a chain whose head has just been freed
+    for (machine_uint_t b = 0; b < n_total; ++b) {
+        const int kind = ATB_GET_KIND(b);
+        if (kind == AT_MARK) {
+            // reachable chain: drop the mark and leave its tails alone
+            ATB_MARK_TO_HEAD(b);
+            freeing = 0;
+        } else if (kind == AT_HEAD) {
+            // unmarked head: release it and every tail up to the next head
+            freeing = 1;
+            ATB_ANY_TO_FREE(b);
+        } else if (kind == AT_TAIL && freeing) {
+            // tail of a chain that is being released
+            ATB_ANY_TO_FREE(b);
+        }
+        // AT_FREE entries, and tails of surviving chains, are left untouched
+    }
+}
+
+void gc_collect_start() {
+    gc_sp = &gc_stack[0]; // start the collection with an empty mark stack
+    gc_stack_overflow = 0; // no push has been dropped yet
+}
+
+void gc_collect_root(void **ptrs, machine_uint_t len) { // marks what is reachable from the len words at ptrs
+    for (machine_uint_t remaining = len; remaining > 0; remaining--, ptrs++) {
+        machine_uint_t candidate = (machine_uint_t)*ptrs; // any word may be a heap pointer
+        VERIFY_MARK_AND_PUSH(candidate); // ignored unless it addresses an unmarked head
+        gc_drain_stack(); // trace from this root before moving on to the next one
+    }
+}
+
+void gc_collect_end() {
+    // Finishes a collection: re-traces after any mark-stack overflow, sweeps, then prints usage.
+    gc_deal_with_stack_overflow();
+    gc_sweep();
+
+    machine_uint_t n_free = 0;
+    machine_uint_t n_used = 0;
+    for (machine_uint_t block = 0; block < gc_alloc_table_byte_len * BLOCKS_PER_ATB; block++) {
+        switch (ATB_GET_KIND(block)) {
+            case AT_FREE:
+                n_free += 1;
+                break;
+            case AT_HEAD:
+            case AT_TAIL:
+                n_used += 1;
+                break;
+            default: // AT_MARK is not counted; gc_sweep has turned every mark back into a head
+                break;
+        }
+    }
+
+    // machine_uint_t may be wider than unsigned int; cast so the arguments match %u
+    printf("GC %u/%u\n", (unsigned int)(n_used * BYTES_PER_BLOCK), (unsigned int)((n_free + n_used) * BYTES_PER_BLOCK));
+}
+
+void *gc_alloc(machine_uint_t n_bytes) {
+    // Returns the first free run of whole blocks covering n_bytes (contents not cleared); NULL for size 0 or if none is found even after gc_collect().
+    machine_uint_t n_blocks = ((n_bytes + BYTES_PER_BLOCK - 1) & (~(BYTES_PER_BLOCK - 1))) / BYTES_PER_BLOCK;
+
+    // check for 0 allocation
+    if (n_blocks == 0) {
+        return NULL;
+    }
+
+    machine_uint_t i;
+    machine_uint_t end_block;
+    machine_uint_t start_block;
+    machine_uint_t n_free = 0;
+    int collected = 0;
+    for (;;) {
+        n_free = 0; // a run must be contiguous: never carry trailing free blocks of the previous pass into this one
+        // look for a run of n_blocks available blocks
+        for (i = 0; i < gc_alloc_table_byte_len; i++) {
+            byte a = gc_alloc_table_start[i];
+            if (ATB_0_IS_FREE(a)) { if (++n_free >= n_blocks) { i = i * BLOCKS_PER_ATB + 0; goto found; } } else { n_free = 0; }
+            if (ATB_1_IS_FREE(a)) { if (++n_free >= n_blocks) { i = i * BLOCKS_PER_ATB + 1; goto found; } } else { n_free = 0; }
+            if (ATB_2_IS_FREE(a)) { if (++n_free >= n_blocks) { i = i * BLOCKS_PER_ATB + 2; goto found; } } else { n_free = 0; }
+            if (ATB_3_IS_FREE(a)) { if (++n_free >= n_blocks) { i = i * BLOCKS_PER_ATB + 3; goto found; } } else { n_free = 0; }
+        }
+
+        // nothing found! collect once and rescan; give up if that was already tried
+        if (collected) {
+            return NULL;
+        }
+        gc_collect();
+        collected = 1;
+    }
+
+    // found, ending at block i inclusive
+found:
+    // get starting and end blocks, both inclusive
+    end_block = i;
+    start_block = i - n_free + 1;
+
+    // mark first block as used head
+    ATB_FREE_TO_HEAD(start_block);
+
+    // mark rest of blocks as used tail
+    // TODO for a run of many blocks can make this more efficient
+    for (machine_uint_t bl = start_block + 1; bl <= end_block; bl++) {
+        ATB_FREE_TO_TAIL(bl);
+    }
+
+    // return pointer to first block
+    return (void*)(gc_pool_start + start_block * WORDS_PER_BLOCK);
+}
+
+machine_uint_t gc_nbytes(void *ptr_in) {
+    // Returns the size in bytes (a multiple of BYTES_PER_BLOCK) of the chain whose head
+    // block starts at ptr_in, or 0 if ptr_in is not an unmarked head inside the pool.
+    machine_uint_t ptr = (machine_uint_t)ptr_in;
+    const machine_uint_t max_block = gc_alloc_table_byte_len * BLOCKS_PER_ATB;
+    if (
+        (ptr & (BYTES_PER_BLOCK - 1)) == 0          // must be aligned on a block
+        && ptr >= (machine_uint_t)gc_pool_start     // must be above start of pool
+        && ptr < (machine_uint_t)gc_pool_end        // must be below end of pool
+       ) {
+        machine_uint_t block = BLOCK_FROM_PTR(ptr);
+        if (ATB_GET_KIND(block) == AT_HEAD) {
+            // count the head plus its tails, never reading past the last table entry
+            machine_uint_t n_blocks = 1;
+            while (block + n_blocks < max_block && ATB_GET_KIND(block + n_blocks) == AT_TAIL) {
+                n_blocks += 1;
+            }
+            return n_blocks * BYTES_PER_BLOCK;
+        }
+    }
+    return 0; // invalid pointer
+}
+
+void *gc_realloc(void *ptr, machine_uint_t n_bytes) {
+    // Returns ptr if its chain already holds n_bytes, else a new copy; NULL (ptr left intact) if allocation fails.
+    machine_uint_t n_existing = gc_nbytes(ptr); // 0 when ptr is not an unmarked head in the pool
+    if (n_bytes <= n_existing) {
+        return ptr;
+    }
+    void *ptr2 = gc_alloc(n_bytes);
+    if (ptr2 != NULL && n_existing > 0) { memcpy(ptr2, ptr, n_existing); } // never copy to or from NULL
+    return ptr2;
+}
+
+/*
+int main() {
+    machine_uint_t len = 1000;
+    machine_uint_t *heap = malloc(len);
+    gc_init(heap, heap + len / sizeof(machine_uint_t));
+    void *ptrs[100];
+    {
+        machine_uint_t *p = gc_alloc(16);
+        p[0] = gc_alloc(64);
+        p[1] = gc_alloc(1);
+        p[2] = gc_alloc(1);
+        p[3] = gc_alloc(1);
+        machine_uint_t *p2 = gc_alloc(16);
+        p2[0] = p;
+        p2[1] = p;
+        ptrs[0] = p2;
+    }
+    for (int i = 0; i < 50; i+=2) {
+        machine_uint_t *p = gc_alloc(i);
+        printf("p=%p\n", p);
+        if (i & 3) {
+            //ptrs[i] = p;
+        }
+    }
+
+    gc_dump_at();
+    gc_collect(ptrs, sizeof(ptrs) / sizeof(void*));
+    gc_dump_at();
+}
+*/
diff --git a/py/gc.h b/py/gc.h
new file mode 100644
index 0000000000000000000000000000000000000000..e8bb807808db1f90998845356ef051a2bdaafa22
--- /dev/null
+++ b/py/gc.h
@@ -0,0 +1,8 @@
+void gc_init(void *start, void *end); // set up the alloc table and block pool inside [start, end)
+void gc_collect_start(); // begin a collection: empties the mark stack and clears the overflow flag
+void gc_collect_root(void **ptrs, machine_uint_t len); // mark everything reachable from len candidate pointers
+void gc_collect_end(); // finish marking, free unmarked chains, print usage
+void gc_collect(); // not defined in gc.c; gc_alloc calls it once when no free run is found
+void *gc_alloc(machine_uint_t n_bytes); // NULL if n_bytes is 0 or no run of free blocks is found
+machine_uint_t gc_nbytes(void *ptr_in); // bytes in the chain headed at ptr_in; 0 if it is not an unmarked head
+void *gc_realloc(void *ptr, machine_uint_t n_bytes); // returns ptr unchanged when its chain already holds n_bytes