diff --git a/minimal/Makefile b/minimal/Makefile
index f6e0ef14c0e24ffc6ab6e6896c03c5d838db654e..3206a21214cfd6ce209f3d97dcaca37a9107a11d 100644
--- a/minimal/Makefile
+++ b/minimal/Makefile
@@ -19,6 +19,8 @@ INC += -I../stmhal
 INC += -I$(BUILD)
 
 ifeq ($(CROSS), 1)
+DFU = ../tools/dfu.py
+PYDFU = ../tools/pydfu.py
 CFLAGS_CORTEX_M4 = -mthumb -mtune=cortex-m4 -mabi=aapcs-linux -mcpu=cortex-m4 -mfpu=fpv4-sp-d16 -mfloat-abi=hard -fsingle-precision-constant -Wdouble-promotion
 CFLAGS = $(INC) -Wall -Werror -ansi -std=gnu99 -nostdlib $(CFLAGS_CORTEX_M4) $(COPT)
 else
@@ -49,20 +51,28 @@ SRC_C = \
 	lib/libc/string0.c \
 	lib/mp-readline/readline.c \
 
-SRC_S = \
-
-#	startup_stm32f40xx.s \
-#	gchelper.s \
-
-OBJ = $(PY_O) $(addprefix $(BUILD)/, $(SRC_C:.c=.o) $(SRC_S:.s=.o))
+OBJ = $(PY_O) $(addprefix $(BUILD)/, $(SRC_C:.c=.o))
 
+ifeq ($(CROSS), 1)
+all: $(BUILD)/firmware.dfu
+else
 all: $(BUILD)/firmware.elf
+endif
 
 $(BUILD)/firmware.elf: $(OBJ)
 	$(ECHO) "LINK $@"
 	$(Q)$(LD) $(LDFLAGS) -o $@ $^ $(LIBS)
 	$(Q)$(SIZE) $@
 
+$(BUILD)/firmware.dfu: $(BUILD)/firmware.elf
+	$(ECHO) "Create $@"
+	$(Q)$(OBJCOPY) -O binary -j .isr_vector -j .text -j .data $^ $(BUILD)/firmware.bin
+	$(Q)$(PYTHON) $(DFU) -b 0x08000000:$(BUILD)/firmware.bin $@
+
+deploy: $(BUILD)/firmware.dfu
+	$(ECHO) "Writing $< to the board"
+	$(Q)$(PYTHON) $(PYDFU) -u $<
+
 # Run emulation build on a POSIX system with suitable terminal settings
 run:
 	stty raw opost -echo
diff --git a/minimal/README.md b/minimal/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7c42e082efbbfa667c5ad45d2c12d76a6f10d34c
--- /dev/null
+++ b/minimal/README.md
@@ -0,0 +1,35 @@
+# The minimal port
+
+This port is intended to be a minimal MicroPython port that actually runs.
+It can run under Linux (or similar) and on any STM32F4xx MCU (eg the pyboard).
+
+## Building and running Linux version
+
+By default the port will be built for the host machine:
+
+    $ make
+
+To run a small test script do:
+
+    $ make run
+
+## Building for an STM32 MCU
+
+The Makefile has the ability to build for a Cortex-M CPU, and by default
+includes some start-up code for an STM32F4xx MCU and also enables a UART
+for communication.  To build:
+
+    $ make CROSS=1
+
+If you previously built the Linux version, you will need to first run
+`make clean` to get rid of incompatible object files.
+
+Building will produce the build/firmware.dfu file which can be programmed
+to an MCU using:
+
+    $ make CROSS=1 deploy
+
+This version of the build will work out-of-the-box on a pyboard (and
+anything similar), and will give you a MicroPython REPL on UART1 at 9600
+baud.  Pin PA13 will also be driven high, and this turns on the red LED on
+the pyboard.
diff --git a/minimal/main.c b/minimal/main.c
index cb3b6250855e57e456b86ec288bd41ee1cba950d..5e104e7e83296205b70e57f5703a9473cd9bbf42 100644
--- a/minimal/main.c
+++ b/minimal/main.c
@@ -94,6 +94,161 @@ void MP_WEAK __assert_func(const char *file, int line, const char *func, const c
 }
 #endif
 
-#if !MICROPY_MIN_USE_STDOUT
-void _start(void) {main(0, NULL);}
+#if MICROPY_MIN_USE_CORTEX_CPU
+
+// this is a minimal IRQ and reset framework for any Cortex-M CPU
+
+extern uint32_t _estack, _sidata, _sdata, _edata, _sbss, _ebss;
+
+void Reset_Handler(void) __attribute__((naked));
+void Reset_Handler(void) {
+    // set stack pointer
+    asm volatile ("ldr sp, =_estack");
+    // copy .data section from flash to RAM
+    for (uint32_t *src = &_sidata, *dest = &_sdata; dest < &_edata;) {
+        *dest++ = *src++;
+    }
+    // zero out .bss section
+    for (uint32_t *dest = &_sbss; dest < &_ebss;) {
+        *dest++ = 0;
+    }
+    // jump to board initialisation
+    void _start(void);
+    _start();
+}
+
+void Default_Handler(void) {
+    for (;;) {
+    }
+}
+
+uint32_t isr_vector[] __attribute__((section(".isr_vector"))) = {
+    (uint32_t)&_estack,
+    (uint32_t)&Reset_Handler,
+    (uint32_t)&Default_Handler, // NMI_Handler
+    (uint32_t)&Default_Handler, // HardFault_Handler
+    (uint32_t)&Default_Handler, // MemManage_Handler
+    (uint32_t)&Default_Handler, // BusFault_Handler
+    (uint32_t)&Default_Handler, // UsageFault_Handler
+    0,
+    0,
+    0,
+    0,
+    (uint32_t)&Default_Handler, // SVC_Handler
+    (uint32_t)&Default_Handler, // DebugMon_Handler
+    0,
+    (uint32_t)&Default_Handler, // PendSV_Handler
+    (uint32_t)&Default_Handler, // SysTick_Handler
+};
+
+void _start(void) {
+    // when we get here: stack is initialised, bss is clear, data is copied
+
+    // SCB->CCR: enable 8-byte stack alignment for IRQ handlers, in accord with EABI
+    *((volatile uint32_t*)0xe000ed14) |= 1 << 9;
+
+    // initialise the cpu and peripherals
+    #if MICROPY_MIN_USE_STM32_MCU
+    void stm32_init(void);
+    stm32_init();
+    #endif
+
+    // now that we have a basic system up and running we can call main
+    main(0, NULL);
+
+    // we must not return
+    for (;;) {
+    }
+}
+
+#endif
+
+#if MICROPY_MIN_USE_STM32_MCU
+
+// this is minimal set-up code for an STM32 MCU
+
+typedef struct {
+    volatile uint32_t CR;
+    volatile uint32_t PLLCFGR;
+    volatile uint32_t CFGR;
+    volatile uint32_t CIR;
+    uint32_t _1[8];
+    volatile uint32_t AHB1ENR;
+    volatile uint32_t AHB2ENR;
+    volatile uint32_t AHB3ENR;
+    uint32_t _2;
+    volatile uint32_t APB1ENR;
+    volatile uint32_t APB2ENR;
+} periph_rcc_t;
+
+typedef struct {
+    volatile uint32_t MODER;
+    volatile uint32_t OTYPER;
+    volatile uint32_t OSPEEDR;
+    volatile uint32_t PUPDR;
+    volatile uint32_t IDR;
+    volatile uint32_t ODR;
+    volatile uint16_t BSRRL;
+    volatile uint16_t BSRRH;
+    volatile uint32_t LCKR;
+    volatile uint32_t AFR[2];
+} periph_gpio_t;
+
+typedef struct {
+    volatile uint32_t SR;
+    volatile uint32_t DR;
+    volatile uint32_t BRR;
+    volatile uint32_t CR1;
+} periph_uart_t;
+
+#define USART1 ((periph_uart_t*) 0x40011000)
+#define GPIOA  ((periph_gpio_t*) 0x40020000)
+#define GPIOB  ((periph_gpio_t*) 0x40020400)
+#define RCC    ((periph_rcc_t*)  0x40023800)
+
+// simple GPIO interface
+#define GPIO_MODE_IN (0)
+#define GPIO_MODE_OUT (1)
+#define GPIO_MODE_ALT (2)
+#define GPIO_PULL_NONE (0)
+#define GPIO_PULL_UP (0)
+#define GPIO_PULL_DOWN (1)
+void gpio_init(periph_gpio_t *gpio, int pin, int mode, int pull, int alt) {
+    gpio->MODER = (gpio->MODER & ~(3 << (2 * pin))) | (mode << (2 * pin));
+    // OTYPER is left as default push-pull
+    // OSPEEDR is left as default low speed
+    gpio->PUPDR = (gpio->PUPDR & ~(3 << (2 * pin))) | (pull << (2 * pin));
+    gpio->AFR[pin >> 3] = (gpio->AFR[pin >> 3] & ~(15 << (4 * (pin & 7)))) | (alt << (4 * (pin & 7)));
+}
+#define gpio_get(gpio, pin) ((gpio->IDR >> (pin)) & 1)
+#define gpio_set(gpio, pin, value) do { gpio->ODR = (gpio->ODR & ~(1 << (pin))) | (value << pin); } while (0)
+#define gpio_low(gpio, pin) do { gpio->BSRRH = (1 << (pin)); } while (0)
+#define gpio_high(gpio, pin) do { gpio->BSRRL = (1 << (pin)); } while (0)
+
+void stm32_init(void) {
+    // basic MCU config
+    RCC->CR |= (uint32_t)0x00000001; // set HSION
+    RCC->CFGR = 0x00000000; // reset all
+    RCC->CR &= (uint32_t)0xfef6ffff; // reset HSEON, CSSON, PLLON
+    RCC->PLLCFGR = 0x24003010; // reset PLLCFGR
+    RCC->CR &= (uint32_t)0xfffbffff; // reset HSEBYP
+    RCC->CIR = 0x00000000; // disable IRQs
+
+    // leave the clock as-is (internal 16MHz)
+
+    // enable GPIO clocks
+    RCC->AHB1ENR |= 0x00000003; // GPIOAEN, GPIOBEN
+
+    // turn on an LED! (on pyboard it's the red one)
+    gpio_init(GPIOA, 13, GPIO_MODE_OUT, GPIO_PULL_NONE, 0);
+    gpio_high(GPIOA, 13);
+
+    // enable UART1 at 9600 baud (TX=B6, RX=B7)
+    gpio_init(GPIOB, 6, GPIO_MODE_ALT, GPIO_PULL_NONE, 7);
+    gpio_init(GPIOB, 7, GPIO_MODE_ALT, GPIO_PULL_NONE, 7);
+    RCC->APB2ENR |= 0x00000010; // USART1EN
+    USART1->BRR = (104 << 4) | 3; // 16MHz/(16*104.1875) = 9598 baud
+    USART1->CR1 = 0x0000200c; // USART enable, tx enable, rx enable
+}
+
 #endif
diff --git a/minimal/mpconfigport.h b/minimal/mpconfigport.h
index f24b5142627f7350f5871d9f6907a271a54cc67f..e055f3c8016a324f55a071a67e35e46235eea4ee 100644
--- a/minimal/mpconfigport.h
+++ b/minimal/mpconfigport.h
@@ -83,6 +83,11 @@ extern const struct _mp_obj_fun_builtin_t mp_builtin_open_obj;
 #define MICROPY_MIN_USE_STDOUT (1)
 #endif
 
+#ifdef __thumb__
+#define MICROPY_MIN_USE_CORTEX_CPU (1)
+#define MICROPY_MIN_USE_STM32_MCU (1)
+#endif
+
 #define MP_STATE_PORT MP_STATE_VM
 
 #define MICROPY_PORT_ROOT_POINTERS \
diff --git a/minimal/stm32f405.ld b/minimal/stm32f405.ld
index 345a92d3c1b7f4056a5e19563259aab246a9dd4e..b4aeda744e3885faab4a715a0657839353b413b0 100644
--- a/minimal/stm32f405.ld
+++ b/minimal/stm32f405.ld
@@ -6,8 +6,6 @@
 MEMORY
 {
     FLASH (rx)      : ORIGIN = 0x08000000, LENGTH = 0x100000 /* entire flash, 1 MiB */
-    FLASH_ISR (rx)  : ORIGIN = 0x08000000, LENGTH = 0x004000 /* sector 0, 16 KiB */
-    FLASH_TEXT (rx) : ORIGIN = 0x08020000, LENGTH = 0x080000 /* sectors 5,6,7,8, 4*128KiB = 512 KiB (could increase it more) */
     CCMRAM (xrw)    : ORIGIN = 0x10000000, LENGTH = 0x010000 /* 64 KiB */
     RAM (xrw)       : ORIGIN = 0x20000000, LENGTH = 0x020000 /* 128 KiB */
 }
@@ -15,52 +13,24 @@ MEMORY
 /* top end of the stack */
 _estack = ORIGIN(RAM) + LENGTH(RAM);
 
-/* RAM extents for the garbage collector */
-_ram_end = ORIGIN(RAM) + LENGTH(RAM);
-_heap_end = 0x2001c000; /* tunable */
-
 /* define output sections */
 SECTIONS
 {
-    /* The startup code goes first into FLASH */
-    .isr_vector :
-    {
-        . = ALIGN(4);
-        KEEP(*(.isr_vector)) /* Startup code */
-
-        . = ALIGN(4);
-    } >FLASH_ISR
-   
     /* The program code and other data goes into FLASH */
     .text :
     {
         . = ALIGN(4);
+        KEEP(*(.isr_vector)) /* isr vector table */
         *(.text)           /* .text sections (code) */
         *(.text*)          /* .text* sections (code) */
         *(.rodata)         /* .rodata sections (constants, strings, etc.) */
         *(.rodata*)        /* .rodata* sections (constants, strings, etc.) */
-    /*  *(.glue_7)   */    /* glue arm to thumb code */
-    /*  *(.glue_7t)  */    /* glue thumb to arm code */
 
         . = ALIGN(4);
         _etext = .;        /* define a global symbol at end of code */
         _sidata = _etext;  /* This is used by the startup in order to initialize the .data secion */
-    } >FLASH_TEXT
-     
-    /*
-    .ARM.extab :
-    {
-        *(.ARM.extab* .gnu.linkonce.armextab.*)
     } >FLASH
 
-    .ARM :
-    {
-        __exidx_start = .;
-        *(.ARM.exidx*)
-        __exidx_end = .;
-    } >FLASH
-    */
-        
     /* This is the initialized data section
     The program executes knowing that the data is in the RAM
     but the loader puts the initial values in the FLASH (inidata).
@@ -69,7 +39,6 @@ SECTIONS
     {
         . = ALIGN(4);
         _sdata = .;        /* create a global symbol at data start; used by startup code in order to initialise the .data section in RAM */
-        _ram_start = .;    /* create a global symbol at ram start for garbage collector */
         *(.data)           /* .data sections */
         *(.data*)          /* .data* sections */
 
@@ -90,28 +59,5 @@ SECTIONS
         _ebss = .;         /* define a global symbol at bss end; used by startup code */
     } >RAM
 
-    /* this is to define the start of the heap, and make sure we have a minimum size */
-    .heap :
-    {
-        . = ALIGN(4);
-        _heap_start = .;    /* define a global symbol at heap start */
-    } >RAM
-
-    /* this just checks there is enough RAM for the stack */
-    .stack :
-    {
-        . = ALIGN(4);
-    } >RAM
-
-    /* Remove information from the standard libraries */
-    /*
-    /DISCARD/ :
-    {
-        libc.a ( * )
-        libm.a ( * )
-        libgcc.a ( * )
-    }
-    */
-
     .ARM.attributes 0 : { *(.ARM.attributes) }
 }
diff --git a/minimal/uart_core.c b/minimal/uart_core.c
index 48cc27d6de07f3438cbaee446c0e7a878dc71f51..d2d17b4d141802b130eeb8d88b1ccd14580d2714 100644
--- a/minimal/uart_core.c
+++ b/minimal/uart_core.c
@@ -5,12 +5,25 @@
  * Core UART functions to implement for a port
  */
 
+#if MICROPY_MIN_USE_STM32_MCU
+typedef struct {
+    volatile uint32_t SR;
+    volatile uint32_t DR;
+} periph_uart_t;
+#define USART1 ((periph_uart_t*)0x40011000)
+#endif
+
 // Receive single character
 int mp_hal_stdin_rx_chr(void) {
     unsigned char c = 0;
 #if MICROPY_MIN_USE_STDOUT
     int r = read(0, &c, 1);
     (void)r;
+#elif MICROPY_MIN_USE_STM32_MCU
+    // wait for RXNE
+    while ((USART1->SR & (1 << 5)) == 0) {
+    }
+    c = USART1->DR;
 #endif
     return c;
 }
@@ -20,5 +33,12 @@ void mp_hal_stdout_tx_strn(const char *str, mp_uint_t len) {
 #if MICROPY_MIN_USE_STDOUT
     int r = write(1, str, len);
     (void)r;
+#elif MICROPY_MIN_USE_STM32_MCU
+    while (len--) {
+        // wait for TXE
+        while ((USART1->SR & (1 << 7)) == 0) {
+        }
+        USART1->DR = *str++;
+    }
 #endif
 }