diff --git a/components/ctx/ctx.c b/components/ctx/ctx.c
index 8738b29030019035b6f58e47260d56073ea10639..f21e2e7d512af9f113cdfe3e8d8681745fd49684 100644
--- a/components/ctx/ctx.c
+++ b/components/ctx/ctx.c
@@ -10,55 +10,3 @@
 
 #define CTX_IMPLEMENTATION
 #include "ctx.h"
-
-#define FB_WIDTH  240
-#define FB_HEIGHT 240
-
-void st3m_ctx_merge_osd(uint16_t *restrict fb,
-                        const uint8_t *osd, int ostride,
-                        uint16_t *restrict osd_backup,
-                        int x0, int y0, int w, int h)
-{
-  uint8_t rgba[4]={0,0,0,255};
-  uint32_t *rgba_32 = (uint32_t*)&rgba[0];
-  for (int scanline = y0; scanline < y0 + h; scanline++)
-  {
-     uint16_t *fb_p = &fb[scanline * 240 + x0];
-     uint32_t *osd_p = (uint32_t*)&osd[(scanline-y0) * ostride];
-     uint16_t *backup_p = &osd_backup[(scanline-y0) * w];
-     uint32_t *ddst = (uint32_t*)&rgba[0];
-     
-     for (int x = 0; x < w; x++)
-     {
-       *backup_p = *fb_p;
-       *rgba_32 = ctx_565_unpack_32 (*fb_p, 1);
-       uint32_t si_ga = ((*osd_p) & 0xff00ff00) >> 8;
-       uint32_t si_rb = (*osd_p) & 0x00ff00ff;
-       uint32_t si_a  = si_ga >> 16;
-       uint32_t racov = si_a^255;
-      *(ddst) =
-     (((si_rb*255+0xff00ff+(((*ddst)&0x00ff00ff)*racov))>>8)&0x00ff00ff)|
-     ((si_ga*255+0xff00ff+((((*ddst)&0xff00ff00)>>8)*racov))&0xff00ff00);
-       *fb_p = ctx_565_pack(rgba[0], rgba[1], rgba[2], 1);
-       fb_p++;
-       osd_p++;
-       backup_p++;
-     }
-  }
-}
-
-void st3m_ctx_unmerge_osd(uint16_t *restrict fb, const uint16_t *osd_backup,
-                          int x0, int y0, int w, int h)
-{
-  for (int scanline = y0; scanline < y0 + h; scanline++)
-  {
-     uint16_t *fb_p = &fb[scanline * 240 + x0];
-     const uint16_t *backup_p = &osd_backup[(scanline-y0) * w];
-     for (int x = 0; x < w; x++)
-     {
-       *fb_p = *backup_p;
-       fb_p++;
-       backup_p++;
-     }
-  }
-}
diff --git a/components/ctx/ctx_config.h b/components/ctx/ctx_config.h
index bdb597052d783377dd64f7680840b27100e81ff3..2da210588b2300b20109b059ee869544542da622 100644
--- a/components/ctx/ctx_config.h
+++ b/components/ctx/ctx_config.h
@@ -50,7 +50,7 @@
 #define CTX_SHAPE_CACHE         0
 #define CTX_SHAPE_CACHE_DEFAULT 0
 #define CTX_RASTERIZER_MAX_CIRCLE_SEGMENTS 64
-#define CTX_NATIVE_GRAYA8       0
+#define CTX_NATIVE_GRAYA8       0 // set this to 1 for faster 8bpp mode - with some glitching
 #define CTX_ENABLE_SHADOW_BLUR  0
 #define CTX_FONTS_FROM_FILE     0
 #define CTX_MAX_KEYDB          16
diff --git a/components/flow3r_bsp/flow3r_bsp.h b/components/flow3r_bsp/flow3r_bsp.h
index a90b2f27bd4fe9884892b26e816094b601fa0265..e5e84f6c072fec73ed97b946d46e31cc9c499342 100644
--- a/components/flow3r_bsp/flow3r_bsp.h
+++ b/components/flow3r_bsp/flow3r_bsp.h
@@ -25,6 +25,10 @@ void flow3r_bsp_display_init(void);
 // performed simultaneously.
 void flow3r_bsp_display_send_fb(void *fb_data, int bits);
 
+void flow3r_bsp_display_send_fb_osd(void *fb_data, int bits, void *osd_data,
+                                    int osd_x0, int osd_y0, int osd_x1,
+                                    int osd_y1);
+
 // Set display backlight, as integer percent value (from 0 to 100, clamped).
 // No-op if display hasn't been succesfully initialized.
 void flow3r_bsp_display_set_backlight(uint8_t percent);
diff --git a/components/flow3r_bsp/flow3r_bsp_display.c b/components/flow3r_bsp/flow3r_bsp_display.c
index 84d22fabd0ec5c122246f7858633fdc96a6cf9ba..38983e1189ea86571f5481edc6c3cd4d99872ea0 100644
--- a/components/flow3r_bsp/flow3r_bsp_display.c
+++ b/components/flow3r_bsp/flow3r_bsp_display.c
@@ -36,13 +36,16 @@ void flow3r_bsp_display_init(void) {
     }
 }
 
-void flow3r_bsp_display_send_fb(void *fb_data, int bits) {
+void flow3r_bsp_display_send_fb_osd(void *fb_data, int bits, void *osd_data,
+                                    int osd_x0, int osd_y0, int osd_x1,
+                                    int osd_y1) {
     if (!gc9a01_initialized) {
         return;
     }
     static bool had_error = false;
 
-    esp_err_t ret = flow3r_bsp_gc9a01_blit_full(&gc9a01, fb_data, bits);
+    esp_err_t ret = flow3r_bsp_gc9a01_blit_osd(&gc9a01, fb_data, bits, osd_data,
+                                               osd_x0, osd_y0, osd_x1, osd_y1);
     if (ret != ESP_OK) {
         if (!had_error) {
             ESP_LOGE(TAG, "display blit failed: %s", esp_err_to_name(ret));
@@ -56,6 +59,10 @@ void flow3r_bsp_display_send_fb(void *fb_data, int bits) {
     }
 }
 
+void flow3r_bsp_display_send_fb(void *fb_data, int bits) {
+    flow3r_bsp_display_send_fb_osd(fb_data, bits, NULL, 0, 0, 0, 0);
+}
+
 void flow3r_bsp_display_set_backlight(uint8_t percent) {
     if (!gc9a01_initialized) {
         return;
diff --git a/components/flow3r_bsp/flow3r_bsp_gc9a01.c b/components/flow3r_bsp/flow3r_bsp_gc9a01.c
index c411526bda624c290d9d3eb3dd446812664922cb..1ad9ed215f72d9cc2ca166ae7ede4e8ce66abc23 100644
--- a/components/flow3r_bsp/flow3r_bsp_gc9a01.c
+++ b/components/flow3r_bsp/flow3r_bsp_gc9a01.c
@@ -99,6 +99,11 @@ typedef struct {
 typedef struct {
     flow3r_bsp_gc9a01_t *gc9a01;
     const uint8_t *fb;
+    const uint8_t *osd_fb;
+    int osd_x0;
+    int osd_y0;
+    int osd_x1;
+    int osd_y1;
     uint16_t *pal_16;
     int bits;
     size_t left;
@@ -570,6 +575,27 @@ static inline uint8_t ctx_sadd8(uint8_t a, uint8_t b) {
 }
 
 static EXT_RAM_BSS_ATTR uint16_t temp_blit[SPI_MAX_DMA_LEN / 2];
+
+// Unpack an RGB565 pixel (byte-swapped first if requested) to 0xffBBGGRR.
+static inline uint32_t ctx_565_unpack_32(const uint16_t pixel,
+                                         const int byteswap) {
+    uint16_t byteswapped;
+    if (byteswap) {
+        byteswapped = (pixel >> 8) | (pixel << 8);
+    } else {
+        byteswapped = pixel;
+    }
+    uint32_t b = (byteswapped & 31) << 3;
+    uint32_t g = ((byteswapped >> 5) & 63) << 2;
+    uint32_t r = ((byteswapped >> 11) & 31) << 3;
+#if 0
+  b = (b > 248) * 255 + (b <= 248) * b;
+  g = (g > 248) * 255 + (g <= 248) * g;
+  r = (r > 248) * 255 + (r <= 248) * r;
+#endif
+    return r + (g << 8) + (b << 16) + (0xffu << 24);  // u: no int overflow
+}
+
 static inline uint16_t ctx_565_pack(uint8_t red, uint8_t green, uint8_t blue,
                                     const int byteswap) {
 #if 0
@@ -590,17 +616,117 @@ static inline uint16_t ctx_565_pack(uint8_t red, uint8_t green, uint8_t blue,
     return c;
 }
 
+#define U8_LERP(a, b, t) ((a) + ((((b) - (a)) * (t) + 256) >> 8))  // t: 0..255
+
 static void flow3r_bsp_prep_blit(flow3r_bsp_gc9a01_blit_t *blit,
                                  int pix_count) {
-    // TODO: pixel-doubling
-    //       overlay-compositing
+    int scale = 1;
     const uint8_t *fb = blit->fb;
+    const uint8_t *osd_fb = blit->osd_fb;
     unsigned int start_off = blit->off;
     unsigned int end_off = start_off + pix_count;
     unsigned int o = 0;
-    switch (blit->bits) {
-        case 16:
-            break;
+
+    if (scale > 1) {
+        /* XXX : this code has room for optimization */
+        if (osd_fb && (start_off < blit->osd_y1 * 240)) switch (blit->bits) {
+                case 8:
+                    for (unsigned int i = start_off; i < end_off; i++) {
+                        int x = i % 240;
+                        int y = i / 240;
+                        int j = ((y / scale) * 240) + x / scale;
+                        temp_blit[o++] = blit->pal_16[U8_LERP(
+                            fb[j], osd_fb[i * 4 + 1], osd_fb[i * 4 + 3])];
+                    }
+                    break;
+                case 24:
+                    for (unsigned int i = start_off; i < end_off; i++) {
+                        uint8_t Y = osd_fb[i * 2];
+                        uint8_t ya = osd_fb[i * 2 + 1];
+                        int x = i % 240;
+                        int y = i / 240;
+                        int j = ((y / scale) * 240) + x / scale;
+                        uint8_t r = U8_LERP(fb[j * 3 + 0], Y, ya);
+                        uint8_t g = U8_LERP(fb[j * 3 + 1], Y, ya);
+                        uint8_t b = U8_LERP(fb[j * 3 + 2], Y, ya);
+                        temp_blit[o++] = ctx_565_pack(r, g, b, 1);
+                    }
+                    break;
+                case 32:
+                    for (unsigned int i = start_off; i < end_off; i++) {
+                        uint8_t Y = osd_fb[i * 2];
+                        uint8_t Ya = osd_fb[i * 2 + 1];
+                        int x = i % 240;
+                        int y = i / 240;
+                        int j = ((y / scale) * 240) + x / scale;
+                        uint8_t r = U8_LERP(fb[j * 4 + 0], Y, Ya);
+                        uint8_t g = U8_LERP(fb[j * 4 + 1], Y, Ya);
+                        uint8_t b = U8_LERP(fb[j * 4 + 2], Y, Ya);
+                        temp_blit[o++] = ctx_565_pack(r, g, b, 1);
+                    }
+                    break;
+                case 16:
+                    for (unsigned int i = start_off; i < end_off; i++) {
+                        int x = i % 240;
+                        int y = i / 240;
+                        int j = ((y / scale) * 240) + x / scale;
+                        uint32_t col =
+                            ctx_565_unpack_32(((uint16_t *)fb)[j], 1);
+                        uint8_t *rgba = (uint8_t *)&col;
+                        uint8_t sr = osd_fb[i * 4];
+                        uint8_t sg = osd_fb[i * 4 + 1];
+                        uint8_t sb = osd_fb[i * 4 + 2];
+                        uint8_t sa = osd_fb[i * 4 + 3];
+                        uint8_t dr = U8_LERP(rgba[0], sr, sa);
+                        uint8_t dg = U8_LERP(rgba[1], sg, sa);
+                        uint8_t db = U8_LERP(rgba[2], sb, sa);
+                        temp_blit[o++] = ctx_565_pack(dr, dg, db, 1);
+                    }
+                    break;
+            }
+        else {
+            switch (blit->bits) {
+                case 16:
+                    for (unsigned int i = start_off; i < end_off; i++) {
+                        int x = i % 240;
+                        int y = i / 240;
+                        int j = ((y / scale) * 240) + x / scale;
+                        temp_blit[o++] = ((uint16_t *)fb)[j];
+                    }
+                    break;
+                case 8:
+                    for (unsigned int i = start_off; i < end_off; i++) {
+                        int x = i % 240;
+                        int y = i / 240;
+                        int j = ((y / scale) * 240) + x / scale;
+                        temp_blit[o++] = blit->pal_16[fb[j]];
+                    }
+                    break;
+                case 24:
+                    for (unsigned int i = start_off; i < end_off; i++) {
+                        int x = i % 240;
+                        int y = i / 240;
+                        int j = ((y / scale) * 240) + x / scale;
+                        temp_blit[o++] = ctx_565_pack(
+                            fb[j * 3 + 0], fb[j * 3 + 1], fb[j * 3 + 2], 1);
+                    }
+                    break;
+                case 32:
+                    for (unsigned int i = start_off; i < end_off; i++) {
+                        int x = i % 240;
+                        int y = i / 240;
+                        int j = ((y / scale) * 240) + x / scale;
+                        temp_blit[o++] = ctx_565_pack(
+                            fb[j * 4 + 0], fb[j * 4 + 1], fb[j * 4 + 2], 1);
+                    }
+                    break;
+            }
+            // blit->osd_fb = NULL;
+            // blit->fb += ((end_off*blit->bits)/8);
+        }
+    } else {
+        if (osd_fb && (start_off < blit->osd_y1 * 240)) switch (blit->bits) {
+#if 0
         case 1:
             for (unsigned int i = 0; i < pix_count; i++)
                 temp_blit[o++] = blit->pal_16[(fb[i / 8] >> ((i & 7))) & 0x1];
@@ -610,26 +736,97 @@ static void flow3r_bsp_prep_blit(flow3r_bsp_gc9a01_blit_t *blit,
                 temp_blit[o++] =
                     blit->pal_16[(fb[i / 4] >> ((i & 3) * 2)) & 0x3];
             break;
-        case 4:
-            for (unsigned int i = start_off; i < end_off; i++) {
-                temp_blit[o++] =
-                    blit->pal_16[(fb[i / 2] >> ((i & 1) * 4)) & 0xf];
+#endif
+                case 4:
+                    for (unsigned int i = start_off; i < end_off; i++) {
+                        temp_blit[o++] =
+                            blit->pal_16[(fb[i / 2] >> ((i & 1) * 4)) & 0xf];
+                    }
+                    break;
+                case 8:
+                    for (unsigned int i = start_off; i < end_off; i++)
+                        temp_blit[o++] = blit->pal_16[U8_LERP(
+                            fb[i], osd_fb[i * 4 + 1], osd_fb[i * 4 + 3])];
+                    break;
+                case 24:
+                    for (unsigned int i = start_off; i < end_off; i++) {
+                        uint8_t y = osd_fb[i * 2];
+                        uint8_t ya = osd_fb[i * 2 + 1];
+                        uint8_t r = U8_LERP(fb[i * 3 + 0], y, ya);
+                        uint8_t g = U8_LERP(fb[i * 3 + 1], y, ya);
+                        uint8_t b = U8_LERP(fb[i * 3 + 2], y, ya);
+                        temp_blit[o++] = ctx_565_pack(r, g, b, 1);
+                    }
+                    break;
+                case 32:
+                    for (unsigned int i = start_off; i < end_off; i++) {
+                        uint8_t y = osd_fb[i * 2];
+                        uint8_t ya = osd_fb[i * 2 + 1];
+                        uint8_t r = U8_LERP(fb[i * 4 + 0], y, ya);
+                        uint8_t g = U8_LERP(fb[i * 4 + 1], y, ya);
+                        uint8_t b = U8_LERP(fb[i * 4 + 2], y, ya);
+                        temp_blit[o++] = ctx_565_pack(r, g, b, 1);
+                    }
+                    break;
+                case 16:
+                    for (unsigned int i = start_off; i < end_off; i++) {
+                        uint32_t col =
+                            ctx_565_unpack_32(((uint16_t *)fb)[i], 1);
+                        uint8_t *rgba = (uint8_t *)&col;
+                        uint8_t sr = osd_fb[i * 4];
+                        uint8_t sg = osd_fb[i * 4 + 1];
+                        uint8_t sb = osd_fb[i * 4 + 2];
+                        uint8_t sa = osd_fb[i * 4 + 3];
+                        uint8_t dr = U8_LERP(rgba[0], sr, sa);
+                        uint8_t dg = U8_LERP(rgba[1], sg, sa);
+                        uint8_t db = U8_LERP(rgba[2], sb, sa);
+                        temp_blit[o++] = ctx_565_pack(dr, dg, db, 1);
+                    }
+                    break;
             }
-            break;
-        case 8:
-            for (unsigned int i = start_off; i < end_off; i++)
-                temp_blit[o++] = blit->pal_16[fb[i]];
-            break;
-        case 24:
-            for (unsigned int i = start_off; i < end_off; i++)
-                temp_blit[o++] = ctx_565_pack(fb[i * 3 + 0], fb[i * 3 + 1],
-                                              fb[i * 3 + 2], 1);
-            break;
-        case 32:
-            for (unsigned int i = start_off; i < end_off; i++)
-                temp_blit[o++] = ctx_565_pack(fb[i * 4 + 0], fb[i * 4 + 1],
-                                              fb[i * 4 + 2], 1);
-            break;
+        else {
+            switch (blit->bits) {
+                case 16:
+                    for (unsigned int i = start_off; i < end_off; i++) {
+                        temp_blit[o++] = ((uint16_t *)fb)[i];
+                    }
+                    blit->osd_fb = NULL;
+                    blit->fb += (end_off * 2);
+                    break;
+                case 1:
+                    for (unsigned int i = 0; i < pix_count; i++)
+                        temp_blit[o++] =
+                            blit->pal_16[(fb[i / 8] >> ((i & 7))) & 0x1];
+                    break;
+                case 2:
+                    for (unsigned int i = 0; i < pix_count; i++)
+                        temp_blit[o++] =
+                            blit->pal_16[(fb[i / 4] >> ((i & 3) * 2)) & 0x3];
+                    break;
+                case 4:
+                    for (unsigned int i = start_off; i < end_off; i++) {
+                        temp_blit[o++] =
+                            blit->pal_16[(fb[i / 2] >> ((i & 1) * 4)) & 0xf];
+                    }
+                    break;
+                case 8:
+                    for (unsigned int i = start_off; i < end_off; i++)
+                        temp_blit[o++] = blit->pal_16[fb[i]];
+                    break;
+                case 24:
+                    for (unsigned int i = start_off; i < end_off; i++)
+                        temp_blit[o++] = ctx_565_pack(
+                            fb[i * 3 + 0], fb[i * 3 + 1], fb[i * 3 + 2], 1);
+                    break;
+                case 32:
+                    for (unsigned int i = start_off; i < end_off; i++)
+                        temp_blit[o++] = ctx_565_pack(
+                            fb[i * 4 + 0], fb[i * 4 + 1], fb[i * 4 + 2], 1);
+                    break;
+            }
+            // blit->osd_fb = NULL;
+            // blit->fb += ((end_off*blit->bits)/8);
+        }
     }
     blit->off += pix_count;
 }
@@ -650,7 +847,7 @@ static esp_err_t flow3r_bsp_gc9a01_blit_next(flow3r_bsp_gc9a01_blit_t *blit) {
     memset(&blit->spi_tx, 0, sizeof(spi_transaction_t));
     blit->spi_tx.length = pix_count * 16;
 
-    if (blit->bits == 16) {
+    if ((blit->bits == 16) && (!blit->osd_fb)) {
         blit->spi_tx.tx_buffer = blit->fb;
         blit->fb += osize;
     } else {
@@ -677,15 +874,23 @@ static esp_err_t flow3r_bsp_gc9a01_blit_next(flow3r_bsp_gc9a01_blit_t *blit) {
 
 static esp_err_t flow3r_bsp_gc9a01_blit_start(flow3r_bsp_gc9a01_t *gc9a01,
                                               flow3r_bsp_gc9a01_blit_t *blit,
-                                              const uint16_t *fb, int bits) {
+                                              const uint16_t *fb, int bits,
+                                              const void *osd_fb, int osd_x0,
+                                              int osd_y0, int osd_x1,
+                                              int osd_y1) {
     memset(blit, 0, sizeof(flow3r_bsp_gc9a01_blit_t));
 
     blit->gc9a01 = gc9a01;
     blit->fb = (const uint8_t *)fb;
     blit->bits = bits;
+    blit->osd_fb = (const uint8_t *)osd_fb;
+    blit->osd_x0 = osd_x0;
+    blit->osd_x1 = osd_x1;
+    blit->osd_y0 = osd_y0;
+    blit->osd_y1 = osd_y1;
     blit->left = 2 * 240 * 240;  // left in native bytes (16bpp)
     if (bits < 16) {
-        uint8_t *pal_24 = ((uint8_t *)fb) + 240 * 240 * 4 - 3 * 256;
+        uint8_t *pal_24 = ((uint8_t *)fb) + 240 * 240 * 2 - 3 * 256;
         blit->pal_16 = (uint16_t *)(pal_24 - 256 * 2);
         for (int i = 0; i < 256; i++)
             blit->pal_16[i] = ctx_565_pack(pal_24[i * 3 + 0], pal_24[i * 3 + 1],
@@ -709,10 +914,13 @@ static esp_err_t flow3r_bsp_gc9a01_blit_wait_done(
     return ret;
 }
 
-esp_err_t flow3r_bsp_gc9a01_blit_full(flow3r_bsp_gc9a01_t *gc9a01,
-                                      const void *fb, int bits) {
+esp_err_t flow3r_bsp_gc9a01_blit_osd(flow3r_bsp_gc9a01_t *gc9a01,
+                                     const void *fb, int bits,
+                                     const void *osd_fb, int osd_x0, int osd_y0,
+                                     int osd_x1, int osd_y1) {
     flow3r_bsp_gc9a01_blit_t blit;
-    esp_err_t res = flow3r_bsp_gc9a01_blit_start(gc9a01, &blit, fb, bits);
+    esp_err_t res = flow3r_bsp_gc9a01_blit_start(
+        gc9a01, &blit, fb, bits, osd_fb, osd_x0, osd_y0, osd_x1, osd_y1);
     if (res != ESP_OK) {
         return res;
     }
@@ -729,3 +937,8 @@ esp_err_t flow3r_bsp_gc9a01_blit_full(flow3r_bsp_gc9a01_t *gc9a01,
     }
     return ESP_OK;
 }
+
+esp_err_t flow3r_bsp_gc9a01_blit_full(flow3r_bsp_gc9a01_t *gc9a01,
+                                      const void *fb, int bits) {
+    return flow3r_bsp_gc9a01_blit_osd(gc9a01, fb, bits, NULL, 0, 0, 0, 0);
+}
diff --git a/components/flow3r_bsp/flow3r_bsp_gc9a01.h b/components/flow3r_bsp/flow3r_bsp_gc9a01.h
index 981e17d4103cbb6f949ad09a1b5c4feab192280e..6ad1d631648f144b8a043ad9820eb0b6eef61e50 100644
--- a/components/flow3r_bsp/flow3r_bsp_gc9a01.h
+++ b/components/flow3r_bsp/flow3r_bsp_gc9a01.h
@@ -64,8 +64,15 @@ esp_err_t flow3r_bsp_gc9a01_init(flow3r_bsp_gc9a01_t *gc9a01,
 // This must not be called if another blit is being performed. The user code
 // should sequence access and make sure not more than one blit is performed
 // simultaneously.
+//
+// If an overlay (osd_fb) is provided, it is composited over fb. The overlay's
+// pixel format depends on bits; it is presumed to be the same size as fb.
 esp_err_t flow3r_bsp_gc9a01_blit_full(flow3r_bsp_gc9a01_t *gc9a01,
                                       const void *fb, int bits);
+esp_err_t flow3r_bsp_gc9a01_blit_osd(flow3r_bsp_gc9a01_t *gc9a01,
+                                     const void *fb, int bits,
+                                     const void *osd_fb, int osd_x0, int osd_y0,
+                                     int osd_x1, int osd_y1);
 
 // Set backlight for display, using integer percent value (0-100, clamped).
 esp_err_t flow3r_bsp_gc9a01_backlight_set(flow3r_bsp_gc9a01_t *gc9a01,
diff --git a/components/micropython/usermodule/mp_sys_display.c b/components/micropython/usermodule/mp_sys_display.c
index a27e68ed2ad707b2d206b34550afa0a40b50b0dc..61f981a9e5b24350b44d4e96a8448cbd368766f3 100644
--- a/components/micropython/usermodule/mp_sys_display.c
+++ b/components/micropython/usermodule/mp_sys_display.c
@@ -42,6 +42,12 @@ STATIC mp_obj_t mp_set_mode(mp_obj_t mode) {
 }
 STATIC MP_DEFINE_CONST_FUN_OBJ_1(mp_set_mode_obj, mp_set_mode);
 
+STATIC mp_obj_t mp_set_default_mode(mp_obj_t mode) {
+    st3m_gfx_set_default_mode(mp_obj_get_int(mode));
+    return mp_const_none;
+}
+STATIC MP_DEFINE_CONST_FUN_OBJ_1(mp_set_default_mode_obj, mp_set_default_mode);
+
 STATIC mp_obj_t mp_get_mode(void) {
     return mp_obj_new_int(st3m_gfx_get_mode());
 }
@@ -65,31 +71,13 @@ STATIC mp_obj_t mp_set_palette(mp_obj_t pal_in) {
 STATIC MP_DEFINE_CONST_FUN_OBJ_1(mp_set_palette_obj, mp_set_palette);
 
 STATIC mp_obj_t mp_ctx(mp_obj_t mode_in) {
-    return mp_ctx_from_ctx(st3m_ctx(mp_obj_get_int(mode_in)));
+    return mp_ctx_from_ctx(st3m_gfx_ctx(mp_obj_get_int(mode_in)));
 }
 STATIC MP_DEFINE_CONST_FUN_OBJ_1(mp_ctx_obj, mp_ctx);
 
 STATIC mp_obj_t mp_fb(mp_obj_t mode_in) {
     int mode = mp_obj_get_int(mode_in);
-    int size = 240 * 240;
-    switch (mode) {
-        case st3m_gfx_default:
-            size *= 2;
-            mode = 16;
-            break;
-        case st3m_gfx_16bpp:
-        case st3m_gfx_16bpp_osd:
-            size *= 2;
-            break;
-        case st3m_gfx_24bpp:
-            size *= 3;
-            break;
-        case st3m_gfx_osd:
-        case st3m_gfx_32bpp:
-        case st3m_gfx_32bpp_osd:
-            size *= 4;
-            break;
-    }
+    int size = 240 * 240 * st3m_gfx_bpp(mode) / 8;
     return mp_obj_new_bytearray_by_ref(size, st3m_gfx_fb(mode));
 }
 STATIC MP_DEFINE_CONST_FUN_OBJ_1(mp_fb_obj, mp_fb);
@@ -100,13 +88,13 @@ STATIC mp_obj_t mp_update(mp_obj_t ctx_in) {
         mp_raise_ValueError("not a ctx");
         return mp_const_none;
     }
-    st3m_ctx_end_frame(self->ctx);
+    st3m_gfx_end_frame(self->ctx);
     return mp_const_none;
 }
 STATIC MP_DEFINE_CONST_FUN_OBJ_1(mp_update_obj, mp_update);
 
 STATIC mp_obj_t mp_pipe_full(void) {
-    if (st3m_gfx_drawctx_pipe_full()) {
+    if (st3m_gfx_pipe_full()) {
         return mp_const_true;
     }
     return mp_const_false;
@@ -129,6 +117,8 @@ STATIC const mp_rom_map_elem_t mp_module_sys_display_globals_table[] = {
     { MP_ROM_QSTR(MP_QSTR_fps), MP_ROM_PTR(&mp_fps_obj) },
     { MP_ROM_QSTR(MP_QSTR_set_mode), MP_ROM_PTR(&mp_set_mode_obj) },
     { MP_ROM_QSTR(MP_QSTR_get_mode), MP_ROM_PTR(&mp_get_mode_obj) },
+    { MP_ROM_QSTR(MP_QSTR_set_default_mode),
+      MP_ROM_PTR(&mp_set_default_mode_obj) },
     { MP_ROM_QSTR(MP_QSTR_set_palette), MP_ROM_PTR(&mp_set_palette_obj) },
     { MP_ROM_QSTR(MP_QSTR_set_backlight), MP_ROM_PTR(&mp_set_backlight_obj) },
     { MP_ROM_QSTR(MP_QSTR_overlay_clip), MP_ROM_PTR(&mp_overlay_clip_obj) },
diff --git a/components/st3m/st3m_gfx.c b/components/st3m/st3m_gfx.c
index 96a6b01d2274d3cbbd926c962b721c5135246804..23bcfe09d7eba09f993b327a1ca3d097fa7caa41 100644
--- a/components/st3m/st3m_gfx.c
+++ b/components/st3m/st3m_gfx.c
@@ -20,9 +20,13 @@
 #include "st3m_counter.h"
 #include "st3m_version.h"
 
-#define ST3M_GFX_DEFAULT_MODE st3m_gfx_16bpp_osd
+#define ST3M_GFX_DEFAULT_MODE (16 | st3m_gfx_osd)
 
-static EXT_RAM_BSS_ATTR uint16_t fb[240 * 240];
+static st3m_gfx_mode default_mode = ST3M_GFX_DEFAULT_MODE;
+
+// Removing EXT_RAM_BSS_ATTR makes the 8 bit and 16 bit modes faster, but
+// then it is not possible to enable wifi.
+EXT_RAM_BSS_ATTR static uint16_t st3m_fb[240 * 240];
 
 // Get a free drawlist ctx to draw into.
 //
@@ -32,7 +36,7 @@ static EXT_RAM_BSS_ATTR uint16_t fb[240 * 240];
 static Ctx *st3m_gfx_drawctx_free_get(TickType_t ticks_to_wait);
 
 // Submit a filled ctx descriptor to the rasterization pipeline.
-static void st3m_gfx_drawctx_pipe_put(void);
+static void st3m_gfx_pipe_put(void);
 
 static const char *TAG = "st3m-gfx";
 
@@ -49,22 +53,22 @@ static st3m_gfx_mode _st3m_gfx_mode = st3m_gfx_default + 1;
 
 EXT_RAM_BSS_ATTR static uint8_t
     st3m_osd_fb[ST3M_OSD_WIDTH * ST3M_OSD_HEIGHT * 4];
-EXT_RAM_BSS_ATTR uint16_t st3m_osd_backup[ST3M_OSD_WIDTH * ST3M_OSD_HEIGHT];
 
 // each frame buffer has an associated rasterizer context
 static Ctx *fb_GRAY8_ctx = NULL;
 static Ctx *fb_GRAYA8_ctx = NULL;
 static Ctx *fb_RGB565_BS_ctx = NULL;
 static Ctx *fb_RGBA8_ctx = NULL;
+static Ctx *fb_RGB8_ctx = NULL;
 
 // corner pixel coordinates for osd clip
 
-static pthread_mutex_t osd_mutex = PTHREAD_MUTEX_INITIALIZER;
+pthread_mutex_t st3m_osd_mutex = PTHREAD_MUTEX_INITIALIZER;
 
-static int _st3m_osd_y1;
-static int _st3m_osd_x1;
-static int _st3m_osd_y0;
-static int _st3m_osd_x0;
+static int _st3m_osd_y1 = 0;
+static int _st3m_osd_x1 = 0;
+static int _st3m_osd_y0 = 0;
+static int _st3m_osd_x0 = 0;
 
 typedef struct {
     Ctx *user_ctx;
@@ -88,46 +92,54 @@ static int _st3m_gfx_low_latency = 0;
 
 ///////////////////////////////////////////////////////
 
-static Ctx *st3m_ctx_int(st3m_gfx_mode mode) {
+// Bits per pixel for a given mode. For st3m_gfx_osd this is the overlay's
+// depth: 32 when the current mode is 8 or 16 bpp, 16 otherwise.
+int st3m_gfx_bpp(st3m_gfx_mode mode) {
+    st3m_gfx_mode set_mode = _st3m_gfx_mode ? _st3m_gfx_mode : default_mode;
+    if (mode == st3m_gfx_default) {
+        mode = set_mode;
+    } else if (mode == st3m_gfx_osd) {
+        // bits, not bytes: buffer sizes are computed as 240 * 240 * bpp / 8
+        int main_bpp = st3m_gfx_bpp(set_mode);
+        return ((main_bpp == 16) || (main_bpp == 8)) ? 32 : 16;
+    }
+    return (mode & (4 | 8 | 16 | 24 | 32));
+}
+
+static Ctx *st3m_gfx_ctx_int(st3m_gfx_mode mode) {
+    st3m_gfx_mode set_mode = _st3m_gfx_mode ? _st3m_gfx_mode : default_mode;
     if (mode == st3m_gfx_osd) {
-        st3m_gfx_mode set_mode =
-            _st3m_gfx_mode ? _st3m_gfx_mode : ST3M_GFX_DEFAULT_MODE;
-        switch (set_mode) {
-            case st3m_gfx_default:  // overriden
-            case st3m_gfx_low_latency:
-            case st3m_gfx_16bpp:
-            case st3m_gfx_16bpp_osd:
-            case st3m_gfx_16bpp_low_latency:
-                return fb_RGBA8_ctx;
-            case st3m_gfx_32bpp:
-            case st3m_gfx_32bpp_osd:
-            case st3m_gfx_32bpp_low_latency:
-            case st3m_gfx_24bpp:
-            case st3m_gfx_24bpp_low_latency:
-            case st3m_gfx_8bpp:
-            case st3m_gfx_8bpp_low_latency:
-            case st3m_gfx_8bpp_osd:
-            case st3m_gfx_osd:
-            case st3m_gfx_4bpp:
-                return fb_GRAYA8_ctx;
-        }
+        if ((st3m_gfx_bpp(set_mode) == 16) || (st3m_gfx_bpp(set_mode) == 8))
+            return fb_RGBA8_ctx;
+        return fb_GRAYA8_ctx;
     }
     Ctx *ctx = st3m_gfx_drawctx_free_get(1000);
 
+    if (set_mode & st3m_gfx_direct_ctx) switch (st3m_gfx_bpp(set_mode)) {
+            case 16:
+                return fb_RGB565_BS_ctx;
+            case 8:
+                return fb_GRAY8_ctx;
+            case 32:
+                return fb_RGBA8_ctx;
+            case 24:
+                return fb_RGB8_ctx;
+        }
+
     if (!ctx) {
         return NULL;
     }
     return ctx;
 }
-Ctx *st3m_ctx(st3m_gfx_mode mode) {
-    Ctx *ctx = st3m_ctx_int(mode);
+Ctx *st3m_gfx_ctx(st3m_gfx_mode mode) {
+    Ctx *ctx = st3m_gfx_ctx_int(mode);
     if (mode == st3m_gfx_osd) {
-        pthread_mutex_lock(&osd_mutex);
+        pthread_mutex_lock(&st3m_osd_mutex);
     }
     return ctx;
 }
 
-void st3m_ctx_viewport_transform(Ctx *ctx) {
+void st3m_gfx_viewport_transform(Ctx *ctx) {
     int32_t offset_x = FLOW3R_BSP_DISPLAY_WIDTH / 2;
     int32_t offset_y = FLOW3R_BSP_DISPLAY_HEIGHT / 2;
     ctx_identity(ctx);  // this might break/need revisiting with tiled rendering
@@ -150,17 +162,12 @@ static void xQueueReceiveNotifyStarved(QueueHandle_t q, void *dst,
     }
 }
 
-void st3m_ctx_merge_osd(uint16_t *fb, uint8_t *osd, int ostride,
-                        uint16_t *osd_backup, int x0, int y0, int w, int h);
-void st3m_ctx_unmerge_osd(uint16_t *fb, uint16_t *osd_backup, int x0, int y0,
-                          int w, int h);
-
 float st3m_gfx_fps(void) { return smoothed_fps; }
 
 void st3m_gfx_set_palette(uint8_t *pal_in, int count) {
     if (count > 256) count = 256;
     if (count < 0) count = 0;
-    uint8_t *pal = ((uint8_t *)st3m_osd_fb) + sizeof(st3m_osd_fb) - 256 * 3;
+    uint8_t *pal = ((uint8_t *)st3m_fb) + sizeof(st3m_fb) - 256 * 3;
     for (int i = 0; i < count * 3; i++) pal[i] = pal_in[i];
 }
 
@@ -178,116 +185,115 @@ static void ega_palette(void) {
     st3m_gfx_set_palette(pal, 16);
 }
 
-static void fire_palette(void) {
+static void gray_palette(void) {
     uint8_t pal[256 * 3];
     for (int i = 0; i < 256; i++) {
         pal[i * 3 + 0] = i;
+        pal[i * 3 + 1] = i;
+        pal[i * 3 + 2] = i;
+    }
+    st3m_gfx_set_palette(pal, 256);
+}
+
+static void sepia_palette(void) {
+    uint8_t pal[256 * 3];
+    for (int i = 0; i < 256; i++) {
+        pal[i * 3 + 0] = i;
+        pal[i * 3 + 1] = (i / 255.0) * (i / 255.0) * 255;
+        pal[i * 3 + 2] = (i / 255.0) * (i / 255.0) * (i / 255.0) * 255;
+    }
+    st3m_gfx_set_palette(pal, 256);
+}
+
+static void cool_palette(void) {
+    uint8_t pal[256 * 3];
+    for (int i = 0; i < 256; i++) {
+        pal[i * 3 + 0] = (i / 255.0) * (i / 255.0) * (i / 255.0) * 255;
+        pal[i * 3 + 1] = (i / 255.0) * (i / 255.0) * 255;
+        pal[i * 3 + 2] = i;
+    }
+    st3m_gfx_set_palette(pal, 256);
+}
+
+static void red_palette(void) {
+    uint8_t pal[256 * 3];
+    for (int i = 0; i < 256; i++) {
+        pal[i * 3 + 0] = i < 64 ? (i * 4) : 255;
         pal[i * 3 + 1] = (i / 255.0) * (i / 255.0) * 255;
         pal[i * 3 + 2] = (i / 255.0) * (i / 255.0) * (i / 255.0) * 255;
     }
     st3m_gfx_set_palette(pal, 256);
 }
 
+void st3m_gfx_set_default_mode(st3m_gfx_mode mode) {
+    default_mode = mode;
+    _st3m_gfx_mode = default_mode + 1;
+    st3m_gfx_set_mode(st3m_gfx_default);
+}
+
 void st3m_gfx_set_mode(st3m_gfx_mode mode) {
     if (mode == _st3m_gfx_mode) return;
 
-    memset(fb, 0, sizeof(fb));
+    memset(st3m_fb, 0, sizeof(st3m_fb));
     memset(st3m_osd_fb, 0, sizeof(st3m_osd_fb));
 
     if (mode == st3m_gfx_default)
-        mode = ST3M_GFX_DEFAULT_MODE;
+        mode = default_mode;
     else if (mode == st3m_gfx_low_latency)
-        mode = ST3M_GFX_DEFAULT_MODE + st3m_gfx_low_latency;
+        mode = default_mode | st3m_gfx_low_latency;
+    else if (mode == st3m_gfx_osd)
+        mode = default_mode | st3m_gfx_osd;
 
-    switch (((int)mode) & ~3) {
-        case st3m_gfx_4bpp:
+    switch (mode & 0xf) {
+        case 4:
             ega_palette();
             break;
-        case st3m_gfx_8bpp:
-        case st3m_gfx_8bpp_osd:
-        case st3m_gfx_8bpp_low_latency:
-            fire_palette();
+        case 8:
+            sepia_palette();
+            break;
+        case 9:
+            red_palette();
+            break;
+        case 10:
+            gray_palette();
+            break;
+        case 11:
+            cool_palette();
             break;
     }
 
-    if (mode == ST3M_GFX_DEFAULT_MODE)
+    if (mode == default_mode)
         _st3m_gfx_mode = st3m_gfx_default;
-    else if (mode == (ST3M_GFX_DEFAULT_MODE + st3m_gfx_low_latency))
+    else if (mode == (default_mode + st3m_gfx_low_latency))
         _st3m_gfx_mode = st3m_gfx_low_latency;
     else
         _st3m_gfx_mode = mode;
 
     _st3m_gfx_low_latency = ((mode & st3m_gfx_low_latency) != 0);
+    switch ((int)mode) {
+        case st3m_gfx_16bpp_direct_ctx:
+        case st3m_gfx_16bpp_direct_ctx_osd:
+        case st3m_gfx_8bpp_direct_ctx:
+        case st3m_gfx_32bpp_direct_ctx:
+        case st3m_gfx_24bpp_direct_ctx:
+            _st3m_gfx_low_latency = 1;
+    }
 }
 
 st3m_gfx_mode st3m_gfx_get_mode(void) { return _st3m_gfx_mode; }
 
 uint8_t *st3m_gfx_fb(st3m_gfx_mode mode) {
-    st3m_gfx_mode set_mode =
-        _st3m_gfx_mode ? _st3m_gfx_mode : ST3M_GFX_DEFAULT_MODE;
+    st3m_gfx_mode set_mode = _st3m_gfx_mode ? _st3m_gfx_mode : default_mode;
     if (mode == st3m_gfx_default) {
-        switch (set_mode) {
-            case st3m_gfx_default:
-            case st3m_gfx_low_latency:
-            case st3m_gfx_16bpp:
-            case st3m_gfx_16bpp_osd:
-            case st3m_gfx_16bpp_low_latency:
-                return (uint8_t *)fb;
-            case st3m_gfx_32bpp:
-            case st3m_gfx_32bpp_osd:
-            case st3m_gfx_32bpp_low_latency:
-            case st3m_gfx_24bpp:
-            case st3m_gfx_24bpp_low_latency:
-            case st3m_gfx_8bpp:
-            case st3m_gfx_8bpp_osd:
-            case st3m_gfx_8bpp_low_latency:
-            case st3m_gfx_osd:
-            case st3m_gfx_4bpp:
-                return st3m_osd_fb;
-        }
-    }
-    if (mode == st3m_gfx_osd) {
-        switch (set_mode) {
-            case st3m_gfx_default:
-            case st3m_gfx_low_latency:
-            case st3m_gfx_16bpp:
-            case st3m_gfx_16bpp_osd:
-            case st3m_gfx_16bpp_low_latency:
-            case st3m_gfx_osd:
-                return st3m_osd_fb;
-            case st3m_gfx_32bpp:
-            case st3m_gfx_32bpp_osd:
-            case st3m_gfx_32bpp_low_latency:
-            case st3m_gfx_24bpp:
-            case st3m_gfx_24bpp_low_latency:
-            case st3m_gfx_8bpp:
-            case st3m_gfx_8bpp_osd:
-            case st3m_gfx_8bpp_low_latency:
-            case st3m_gfx_4bpp:
-                return (uint8_t *)fb;
-        }
+        if (st3m_gfx_bpp(set_mode) <= 16) return (uint8_t *)st3m_fb;
+        return st3m_osd_fb;
+    } else if (mode == st3m_gfx_osd) {
+        if (st3m_gfx_bpp(set_mode) <= 16) return st3m_osd_fb;
+        return (uint8_t *)st3m_fb;
     }
 
-    switch (set_mode) {
-        case st3m_gfx_default:
-        case st3m_gfx_low_latency:
-        case st3m_gfx_16bpp:
-        case st3m_gfx_16bpp_osd:
-        case st3m_gfx_16bpp_low_latency:
-            return (uint8_t *)fb;
-        case st3m_gfx_4bpp:
-        case st3m_gfx_32bpp:
-        case st3m_gfx_32bpp_osd:
-        case st3m_gfx_32bpp_low_latency:
-        case st3m_gfx_24bpp:
-        case st3m_gfx_24bpp_low_latency:
-        case st3m_gfx_8bpp:
-        case st3m_gfx_8bpp_osd:
-        case st3m_gfx_8bpp_low_latency:
-        case st3m_gfx_osd:
-            return st3m_osd_fb;
-    }
-    return (uint8_t *)fb;
+    if (st3m_gfx_bpp(set_mode) <= 16) return (uint8_t *)st3m_fb;
+    return st3m_osd_fb;
 }
 
 static void st3m_gfx_task(void *_arg) {
@@ -304,70 +310,50 @@ static void st3m_gfx_task(void *_arg) {
         ctx_set_textureclock(fb_RGB565_BS_ctx,
                              ctx_textureclock(fb_RGB565_BS_ctx) + 1);
         ctx_set_textureclock(fb_RGBA8_ctx, ctx_textureclock(fb_RGB565_BS_ctx));
+        ctx_set_textureclock(fb_RGB8_ctx, ctx_textureclock(fb_RGB565_BS_ctx));
         ctx_set_textureclock(fb_GRAY8_ctx, ctx_textureclock(fb_RGB565_BS_ctx));
         ctx_set_textureclock(fb_GRAYA8_ctx, ctx_textureclock(fb_RGB565_BS_ctx));
 
-        st3m_gfx_mode set_mode =
-            _st3m_gfx_mode ? _st3m_gfx_mode : ST3M_GFX_DEFAULT_MODE;
+        st3m_gfx_mode set_mode = _st3m_gfx_mode ? _st3m_gfx_mode : default_mode;
 
-        switch (set_mode) {
-            case st3m_gfx_4bpp:
-                flow3r_bsp_display_send_fb(st3m_osd_fb, 4);
-                break;
-            case st3m_gfx_8bpp:
-            case st3m_gfx_8bpp_osd:
-            case st3m_gfx_8bpp_low_latency:
-                ctx_render_ctx(drawlist->user_ctx, fb_GRAY8_ctx);
-                flow3r_bsp_display_send_fb(st3m_osd_fb, 8);
-                break;
-            case st3m_gfx_24bpp:
-            case st3m_gfx_24bpp_low_latency:
-                flow3r_bsp_display_send_fb(st3m_osd_fb, 24);
-                break;
-            case st3m_gfx_32bpp:
-            case st3m_gfx_32bpp_low_latency:
-                ctx_render_ctx(drawlist->user_ctx, fb_RGBA8_ctx);
-                flow3r_bsp_display_send_fb(st3m_osd_fb, 32);
+        Ctx *user_target = fb_RGB565_BS_ctx;
+        void *user_fb = st3m_fb;
+        void *osd_fb = st3m_osd_fb;
+        int bits = st3m_gfx_bpp(set_mode);
+        switch (bits) {
+            case 4:
                 break;
-            case st3m_gfx_32bpp_osd:
-                ctx_render_ctx(drawlist->user_ctx, fb_RGBA8_ctx);
-                flow3r_bsp_display_send_fb(st3m_osd_fb, 32);
+            case 8:
+                user_target = fb_GRAY8_ctx;
                 break;
-            case st3m_gfx_osd:
-                flow3r_bsp_display_send_fb(st3m_osd_fb, 32);
+            case 24:
+                user_target = fb_RGB8_ctx;
+                user_fb = st3m_osd_fb;
+                osd_fb = st3m_fb;
                 break;
-            case st3m_gfx_16bpp:
-            case st3m_gfx_16bpp_low_latency:
-            case st3m_gfx_low_latency:
-                ctx_render_ctx(drawlist->user_ctx, fb_RGB565_BS_ctx);
-                flow3r_bsp_display_send_fb(fb, 16);
-                break;
-            case st3m_gfx_default:  // not neccesarily taken- overrriden above
-            case st3m_gfx_16bpp_osd:
-                ctx_render_ctx(drawlist->user_ctx, fb_RGB565_BS_ctx);
-                if (drawlist->osd_y0 != drawlist->osd_y1) {
-                    pthread_mutex_lock(&osd_mutex);
-                    st3m_ctx_merge_osd(
-                        fb,
-                        st3m_osd_fb + 4 * ((drawlist->osd_y0 - ST3M_OSD_Y) *
-                                               ST3M_OSD_WIDTH +
-                                           (drawlist->osd_x0 - ST3M_OSD_X)),
-                        ST3M_OSD_WIDTH * 4, st3m_osd_backup, drawlist->osd_x0,
-                        drawlist->osd_y0,
-                        drawlist->osd_x1 - drawlist->osd_x0 + 1,
-                        drawlist->osd_y1 - drawlist->osd_y0 + 1);
-                    pthread_mutex_unlock(&osd_mutex);
-                    flow3r_bsp_display_send_fb(fb, 16);
-                    st3m_ctx_unmerge_osd(
-                        fb, st3m_osd_backup, drawlist->osd_x0, drawlist->osd_y0,
-                        drawlist->osd_x1 - drawlist->osd_x0 + 1,
-                        drawlist->osd_y1 - drawlist->osd_y0 + 1);
-                } else
-                    flow3r_bsp_display_send_fb(fb, 16);
+            case 32:
+                user_target = fb_RGBA8_ctx;
+                user_fb = st3m_osd_fb;
+                osd_fb = st3m_fb;
                 break;
         }
+
+        if ((set_mode & st3m_gfx_direct_ctx) == 0)
+            ctx_render_ctx(drawlist->user_ctx, user_target);
+
+        if ((set_mode & st3m_gfx_osd) &&
+            (drawlist->osd_y0 != drawlist->osd_y1)) {
+            pthread_mutex_lock(&st3m_osd_mutex);
+            flow3r_bsp_display_send_fb_osd(user_fb, bits, osd_fb,
+                                           drawlist->osd_x0, drawlist->osd_y0,
+                                           drawlist->osd_x1, drawlist->osd_y1);
+            pthread_mutex_unlock(&st3m_osd_mutex);
+        } else {
+            flow3r_bsp_display_send_fb(user_fb, bits);
+        }
+
         ctx_drawlist_clear(drawlist->user_ctx);
-        st3m_ctx_viewport_transform(drawlist->user_ctx);
+        st3m_gfx_viewport_transform(drawlist->user_ctx);
 
         xQueueSend(user_ctx_freeq, &desc_no, portMAX_DELAY);
         st3m_counter_rate_sample(&rast_rate);
@@ -612,7 +598,7 @@ void st3m_gfx_show_textview(st3m_gfx_textview_t *tv) {
 
     ctx_restore(ctx);
 
-    st3m_gfx_drawctx_pipe_put();
+    st3m_gfx_pipe_put();
 }
 
 void st3m_gfx_init(void) {
@@ -630,29 +616,38 @@ void st3m_gfx_init(void) {
 
     // Setup rasterizers for frame buffer formats
     fb_GRAY8_ctx = ctx_new_for_framebuffer(
-        st3m_osd_fb, FLOW3R_BSP_DISPLAY_WIDTH, FLOW3R_BSP_DISPLAY_HEIGHT,
+        st3m_fb, FLOW3R_BSP_DISPLAY_WIDTH, FLOW3R_BSP_DISPLAY_HEIGHT,
         FLOW3R_BSP_DISPLAY_WIDTH, CTX_FORMAT_GRAY8);
     fb_GRAYA8_ctx = ctx_new_for_framebuffer(
-        st3m_osd_fb, FLOW3R_BSP_DISPLAY_WIDTH, FLOW3R_BSP_DISPLAY_HEIGHT,
+        st3m_fb, FLOW3R_BSP_DISPLAY_WIDTH, FLOW3R_BSP_DISPLAY_HEIGHT,
         FLOW3R_BSP_DISPLAY_WIDTH * 2, CTX_FORMAT_GRAYA8);
     fb_RGB565_BS_ctx = ctx_new_for_framebuffer(
-        fb, FLOW3R_BSP_DISPLAY_WIDTH, FLOW3R_BSP_DISPLAY_HEIGHT,
+        st3m_fb, FLOW3R_BSP_DISPLAY_WIDTH, FLOW3R_BSP_DISPLAY_HEIGHT,
         FLOW3R_BSP_DISPLAY_WIDTH * 2, CTX_FORMAT_RGB565_BYTESWAPPED);
     fb_RGBA8_ctx = ctx_new_for_framebuffer(
         st3m_osd_fb, FLOW3R_BSP_DISPLAY_WIDTH, FLOW3R_BSP_DISPLAY_HEIGHT,
         FLOW3R_BSP_DISPLAY_WIDTH * 4, CTX_FORMAT_RGBA8);
+    fb_RGB8_ctx = ctx_new_for_framebuffer(
+        st3m_osd_fb, FLOW3R_BSP_DISPLAY_WIDTH, FLOW3R_BSP_DISPLAY_HEIGHT,
+        FLOW3R_BSP_DISPLAY_WIDTH * 3, CTX_FORMAT_RGB8);
     assert(fb_GRAY8_ctx != NULL);
     assert(fb_GRAYA8_ctx != NULL);
     assert(fb_RGB565_BS_ctx != NULL);
     assert(fb_RGBA8_ctx != NULL);
+    assert(fb_RGB8_ctx != NULL);
 
-    st3m_ctx_viewport_transform(fb_GRAY8_ctx);
-    st3m_ctx_viewport_transform(fb_GRAYA8_ctx);
-    st3m_ctx_viewport_transform(fb_RGB565_BS_ctx);
-    st3m_ctx_viewport_transform(fb_RGBA8_ctx);
+    st3m_gfx_viewport_transform(fb_GRAY8_ctx);
+    st3m_gfx_viewport_transform(fb_GRAYA8_ctx);
+    st3m_gfx_viewport_transform(fb_RGB565_BS_ctx);
+    st3m_gfx_viewport_transform(fb_RGBA8_ctx);
+    st3m_gfx_viewport_transform(fb_RGB8_ctx);
 
     ctx_set_texture_source(fb_RGBA8_ctx, fb_RGB565_BS_ctx);
     ctx_set_texture_cache(fb_RGBA8_ctx, fb_RGB565_BS_ctx);
+    ctx_set_texture_source(fb_RGB8_ctx, fb_RGB565_BS_ctx);
+    ctx_set_texture_cache(fb_RGB8_ctx, fb_RGB565_BS_ctx);
+    ctx_set_texture_source(fb_GRAY8_ctx, fb_RGB565_BS_ctx);
+    ctx_set_texture_cache(fb_GRAY8_ctx, fb_RGB565_BS_ctx);
 
     // Setup user_ctx descriptor.
     for (int i = 0; i < N_DRAWLISTS; i++) {
@@ -661,7 +656,7 @@ void st3m_gfx_init(void) {
         assert(drawlists[i].user_ctx != NULL);
         ctx_set_texture_cache(drawlists[i].user_ctx, fb_RGB565_BS_ctx);
 
-        st3m_ctx_viewport_transform(drawlists[i].user_ctx);
+        st3m_gfx_viewport_transform(drawlists[i].user_ctx);
 
         BaseType_t res = xQueueSend(user_ctx_freeq, &i, 0);
         assert(res == pdTRUE);
@@ -684,23 +679,39 @@ static Ctx *st3m_gfx_drawctx_free_get(TickType_t ticks_to_wait) {
     drawlist->osd_x1 = _st3m_osd_x1;
     drawlist->osd_y1 = _st3m_osd_y1;
 
+    st3m_gfx_mode set_mode = _st3m_gfx_mode ? _st3m_gfx_mode : default_mode;
+    switch ((int)set_mode) {
+        case st3m_gfx_16bpp_direct_ctx:
+        case st3m_gfx_16bpp_direct_ctx_osd:
+            st3m_gfx_viewport_transform(fb_RGB565_BS_ctx);
+            return fb_RGB565_BS_ctx;
+    }
+
     return drawlist->user_ctx;
 }
 
-static void st3m_gfx_drawctx_pipe_put(void) {
+static void st3m_gfx_pipe_put(void) {
     xQueueSend(user_ctx_rastq, &last_descno, portMAX_DELAY);
 }
 
-static Ctx *st3m_ctx_int(st3m_gfx_mode mode);
-void st3m_ctx_end_frame(Ctx *ctx) {
-    if (ctx == st3m_ctx_int(st3m_gfx_osd)) {
-        pthread_mutex_unlock(&osd_mutex);
+static Ctx *st3m_gfx_ctx_int(st3m_gfx_mode mode);
+void st3m_gfx_end_frame(Ctx *ctx) {
+    if (ctx == st3m_gfx_ctx_int(st3m_gfx_osd)) {
+        pthread_mutex_unlock(&st3m_osd_mutex);
         return;
     }
-    st3m_gfx_drawctx_pipe_put();
+    st3m_gfx_pipe_put();
 }
 
-uint8_t st3m_gfx_drawctx_pipe_full(void) {
+uint8_t st3m_gfx_pipe_full(void) {
+    st3m_gfx_mode mode = _st3m_gfx_mode ? _st3m_gfx_mode : default_mode;
+    switch ((int)mode) {
+        case st3m_gfx_16bpp_direct_ctx:
+        case st3m_gfx_16bpp_direct_ctx_osd:
+            return (uxQueueSpacesAvailable(user_ctx_rastq) == 0) ||
+                   (uxQueueMessagesWaiting(user_ctx_freeq) <=
+                    _st3m_gfx_low_latency);
+    }
     return uxQueueMessagesWaiting(user_ctx_freeq) <= _st3m_gfx_low_latency;
 }
 
@@ -720,7 +731,7 @@ void st3m_gfx_flush(int timeout_ms) {
 
     for (int i = 0; i < N_DRAWLISTS; i++) {
         ctx_drawlist_clear(drawlists[i].user_ctx);
-        st3m_ctx_viewport_transform(drawlists[i].user_ctx);
+        st3m_gfx_viewport_transform(drawlists[i].user_ctx);
         BaseType_t res = xQueueSend(user_ctx_freeq, &i, 0);
         assert(res == pdTRUE);
     }
diff --git a/components/st3m/st3m_gfx.h b/components/st3m/st3m_gfx.h
index bd79c8bb38c1cc92f8986cb1f630880faff54621..038e072a94db4f7880ef33d26ce8f13b33bd4d36 100644
--- a/components/st3m/st3m_gfx.h
+++ b/components/st3m/st3m_gfx.h
@@ -9,31 +9,48 @@
 
 typedef enum {
     st3m_gfx_default = 0,
-    // bitmask flag over base bpp to turn on OSD, only 16bpp for now
-    st3m_gfx_osd = 1,
+    // bitmask flag over base bpp to turn on OSD; only 16bpp for now, it will
+    // become available for other bit depths as grayscale rather than color
+    // overlays.
+    st3m_gfx_direct_ctx = 128,
+    st3m_gfx_osd = 256,
     // shallower pipeline, in the future might mean immediate mode
-    st3m_gfx_low_latency = 2,
+    st3m_gfx_low_latency = 512,
+    st3m_gfx_unset = 1024,
     // 4 and 8bpp modes use the configured palette, the palette resides
     // in video ram and is lost upon mode change
     st3m_gfx_4bpp = 4,
+    // a flag for modes >4bpp requesting that ctx calls are direct, this is
+    // slower since micropython cannot run in parallel with rasterization.
     st3m_gfx_8bpp = 8,
-    st3m_gfx_8bpp_osd,
-    st3m_gfx_8bpp_low_latency,
+    st3m_gfx_8bpp_osd = 8 + st3m_gfx_osd,
+    st3m_gfx_8bpp_direct_ctx = 8 + st3m_gfx_direct_ctx,
+    st3m_gfx_8bpp_low_latency = 8 + st3m_gfx_low_latency,
+    st3m_gfx_8bpp_osd_low_latency = 8 + st3m_gfx_osd + st3m_gfx_low_latency,
     // 16bpp modes have the lowest blit overhead - no osd for now
     st3m_gfx_16bpp = 16,
-    st3m_gfx_16bpp_osd,
-    st3m_gfx_16bpp_low_latency,
+    st3m_gfx_16bpp_osd = 16 + st3m_gfx_osd,
+    st3m_gfx_16bpp_low_latency = 16 + st3m_gfx_low_latency,
+    st3m_gfx_16bpp_direct_ctx = 16 + st3m_gfx_direct_ctx,
+    st3m_gfx_16bpp_direct_ctx_osd = 16 + st3m_gfx_direct_ctx + st3m_gfx_osd,
     // for pixel poking 24bpp might be a little faster than 32bpp
     // for now there is no ctx drawing support in 24bpp mode.
     st3m_gfx_24bpp = 24,
-    st3m_gfx_24bpp_low_latency = 26,
+    st3m_gfx_24bpp_osd = 24 + st3m_gfx_osd,
+    st3m_gfx_24bpp_direct_ctx = 24 + st3m_gfx_direct_ctx,
+    st3m_gfx_24bpp_low_latency = 24 + st3m_gfx_low_latency,
     st3m_gfx_32bpp = 32,
     // 32bpp modes - are faster at doing compositing, for solid text/fills
-    // 16bpp is probabl faster.
-    st3m_gfx_32bpp_osd,
-    st3m_gfx_32bpp_low_latency,
+    // 16bpp is probably faster.
+    st3m_gfx_32bpp_osd = 32 + st3m_gfx_osd,
+    st3m_gfx_32bpp_low_latency = 32 + st3m_gfx_low_latency,
+    st3m_gfx_32bpp_direct_ctx = 32 + st3m_gfx_direct_ctx,
 } st3m_gfx_mode;
 
+// sets the system graphics mode; this is the mode that gets selected
+// when calling st3m_gfx_set_mode(st3m_gfx_default);
+void st3m_gfx_set_default_mode(st3m_gfx_mode mode);
+
 // sets the current graphics mode
 void st3m_gfx_set_mode(st3m_gfx_mode mode);
 
@@ -42,15 +59,18 @@ st3m_gfx_mode st3m_gfx_get_mode(void);
 
 // returns a ctx for drawing at the specified mode/target
 // should be paired with a st3m_ctx_end_frame
-// normal values are 0 and 1 for base framebuffer of current
-// mode and st3m_gfx_osd for getting the overlay drawing context.
-Ctx *st3m_ctx(st3m_gfx_mode mode);
+// normal values are st3m_gfx_default and st3m_gfx_osd for base framebuffer
+// and overlay drawing context.
+Ctx *st3m_gfx_ctx(st3m_gfx_mode mode);
 
 // get the framebuffer associated with graphics mode
 // if you ask for st3m_gfx_default you get the current modes fb
 // and if you ask for st3m_gfx_osd you get the current modes overlay fb
 uint8_t *st3m_gfx_fb(st3m_gfx_mode mode);
 
+// get the bits per pixel for a given mode
+int st3m_gfx_bpp(st3m_gfx_mode mode);
+
 // sets the palette, pal_in is an array with 3 uint8_t's per entry,
 // support values for count is 1-256, used only in 4bpp and 8bpp
 // graphics modes.
@@ -65,14 +85,14 @@ float st3m_gfx_fps(void);
 
 // temporary, signature compatible
 // with ctx_end_frame()
-void st3m_ctx_end_frame(Ctx *ctx);
+void st3m_gfx_end_frame(Ctx *ctx);
 
 // Initialize the gfx subsystem of st3m, includng the rasterization and
 // crtx/blitter pipeline.
 void st3m_gfx_init(void);
 
-// Returns true if the rasterizaiton pipeline submission would block.
-uint8_t st3m_gfx_drawctx_pipe_full(void);
+// Returns true if another frame cannot be accepted right now.
+uint8_t st3m_gfx_pipe_full(void);
 
 // Flush any in-flight pipelined work, resetting the free ctx/framebuffer queues
 // to their initial state. This should be called if there has been any drawlist
diff --git a/python_payload/apps/clouds/__init__.py b/python_payload/apps/clouds/__init__.py
index 1f5263f47bf64997a69dd5b2a9e9d6ce3475ad71..3adc4e8951f4b649c0dcc6d1ba67a9679cb5754d 100644
--- a/python_payload/apps/clouds/__init__.py
+++ b/python_payload/apps/clouds/__init__.py
@@ -58,12 +58,6 @@ class App(Application):
                 c.x = -200
         self.clouds = sorted(self.clouds, key=lambda c: -c.z)
 
-    def on_enter(self, vm):
-        # we are compositing heavy and going back and forth to 16bit is
-        # too much overhead
-        sys_display.set_mode(32)
-        super().on_enter(vm)
-
     def draw(self, ctx):
         # faster, and with smoothing is incorrect
         ctx.image_smoothing = False
@@ -76,3 +70,9 @@ class App(Application):
 
         for c in self.clouds:
             c.draw(ctx)
+
+
+if __name__ == "__main__":
+    from st3m.run import run_app
+
+    run_app(App)
diff --git a/python_payload/st3m/run.py b/python_payload/st3m/run.py
index 885cd6578a21b9fb20c361ac1a021da3e33eac0a..6816ac113d3a30b46854e644fd0e4b7cd87c3e21 100644
--- a/python_payload/st3m/run.py
+++ b/python_payload/st3m/run.py
@@ -20,7 +20,7 @@ from st3m.application import (
 from st3m.about import About
 from st3m import settings_menu as settings, logging, processors, wifi
 
-import captouch, audio, leds, gc, sys_buttons
+import captouch, audio, leds, gc, sys_buttons, sys_display
 import os
 
 import machine
@@ -135,6 +135,54 @@ def run_app(klass):
     run_view(klass(ApplicationContext()))
 
 
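+# mode values are the bits per pixel plus flag bits from st3m_gfx_mode:
+# 256 is overlay (st3m_gfx_osd), 512 is low latency (st3m_gfx_low_latency)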
+
+
+def _8bpp() -> None:
+    sys_display.set_default_mode(8 + 256)
+
+
+def _8bpp_pal1() -> None:
+    sys_display.set_default_mode(9 + 256)
+
+
+def _8bpp_pal2() -> None:
+    sys_display.set_default_mode(10 + 256)
+
+
+def _8bpp_pal3() -> None:
+    sys_display.set_default_mode(11 + 256)
+
+
+def _8bpp_RGB332() -> None:
+    sys_display.set_default_mode(12 + 256)
+
+
+def _8bpp_3x() -> None:
+    sys_display.set_default_mode(8 + 256 + 4096)
+
+
+def _8bpp_low_latency() -> None:
+    sys_display.set_default_mode(8 + 256 + 512)
+
+
+def _16bpp_low_latency() -> None:
+    sys_display.set_default_mode(16 + 256 + 512)
+
+
+def _16bpp() -> None:
+    sys_display.set_default_mode(16 + 256)
+
+
+def _24bpp() -> None:
+    sys_display.set_default_mode(24 + 256)
+
+
+def _32bpp() -> None:
+    sys_display.set_default_mode(32 + 256)
+
+
 def _yeet_local_changes() -> None:
     os.remove("/flash/sys/.sys-installed")
     machine.reset()
@@ -165,10 +213,27 @@ def run_main() -> None:
         log.error(f"Failed to set hostname {e}")
 
     menu_settings = settings.build_menu()
+    menu_gfx = SimpleMenu(
+        [
+            MenuItemBack(),
+            MenuItemForeground("Graphics Mode", menu_settings),
+            MenuItemAction("8bpp", _8bpp),
+            MenuItemAction("8bpp-low latency", _8bpp_low_latency),
+            # MenuItemAction("8bpp_3x", _8bpp_3x),
+            MenuItemAction("16bpp", _16bpp),
+            MenuItemAction("16bpp-low latency", _16bpp_low_latency),
+            MenuItemAction("24bpp", _24bpp),
+            MenuItemAction("8bpp Red", _8bpp_pal1),
+            MenuItemAction("8bpp Grayscale", _8bpp_pal2),
+            MenuItemAction("8bpp Cool", _8bpp_pal3),
+            MenuItemAction("8bpp RGB332", _8bpp_RGB332),
+        ],
+    )
     menu_system = SimpleMenu(
         [
             MenuItemBack(),
             MenuItemForeground("Settings", menu_settings),
+            MenuItemForeground("Graphics Mode", menu_gfx),
             MenuItemAppLaunch(BundleMetadata("/flash/sys/apps/gr33nhouse")),
             MenuItemAction("Disk Mode (Flash)", machine.disk_mode_flash),
             MenuItemAction("Disk Mode (SD)", machine.disk_mode_sd),
diff --git a/python_payload/st3m/ui/elements/overlays.py b/python_payload/st3m/ui/elements/overlays.py
index aa016809ec7e8981ca17086b16874dbdc2c60fde..7c8eacdeec55c3b0cdcac3e2d00e65549db4931f 100644
--- a/python_payload/st3m/ui/elements/overlays.py
+++ b/python_payload/st3m/ui/elements/overlays.py
@@ -97,7 +97,7 @@ class Compositor(Responder):
         if sys_display.get_mode() != 0:
             return
         if self._frame_skip <= 0:
-            octx = sys_display.ctx(1)  # XXX add symbolic name for overlay
+            octx = sys_display.ctx(256)  # XXX add symbolic name for overlay
             if settings.onoff_show_fps.value:
                 _clip_x0 = 110
                 _clip_y1 = 0
diff --git a/recovery/main/rec_fatal.c b/recovery/main/rec_fatal.c
index cdfa21e897ffd7a896b323a158818d5c9c8cbf58..373e27db28997305fbd269df480f0d4eb0384368 100644
--- a/recovery/main/rec_fatal.c
+++ b/recovery/main/rec_fatal.c
@@ -6,7 +6,7 @@
 
 void rec_fatal(const char *msg) {
     for (;;) {
-        Ctx *ctx = st3m_ctx(st3m_gfx_default);
+        Ctx *ctx = st3m_gfx_ctx(st3m_gfx_default);
 
         // Draw background.
         ctx_rgb(ctx, 0.29, 0.0, 0.0);
@@ -20,7 +20,7 @@ void rec_fatal(const char *msg) {
         ctx_move_to(ctx, 0, 0);
         ctx_text(ctx, msg);
 
-        st3m_ctx_end_frame(ctx);
+        st3m_gfx_end_frame(ctx);
 
         vTaskDelay(10 / portTICK_PERIOD_MS);
     }
diff --git a/recovery/main/rec_gui.c b/recovery/main/rec_gui.c
index 0c572b8835b7cd703db4de8ad0f6ec15e9fdadba..3c5fb91bff9c0fe89e079c9f8d3993bf18ae6c41 100644
--- a/recovery/main/rec_gui.c
+++ b/recovery/main/rec_gui.c
@@ -38,7 +38,7 @@ static void _header_draw(Ctx *ctx) {
 }
 
 void rec_erasing_draw(void) {
-    Ctx *ctx = st3m_ctx(st3m_gfx_default);
+    Ctx *ctx = st3m_gfx_ctx(st3m_gfx_default);
     _header_draw(ctx);
 
     ctx_move_to(ctx, 0, 0);
@@ -46,11 +46,11 @@ void rec_erasing_draw(void) {
     ctx_gray(ctx, 0.8);
     ctx_text(ctx, "Erasing...");
 
-    st3m_ctx_end_frame(ctx);
+    st3m_gfx_end_frame(ctx);
 }
 
 void rec_flashing_draw(int percent) {
-    Ctx *ctx = st3m_ctx(st3m_gfx_default);
+    Ctx *ctx = st3m_gfx_ctx(st3m_gfx_default);
     _header_draw(ctx);
 
     ctx_move_to(ctx, 0, 0);
@@ -64,11 +64,11 @@ void rec_flashing_draw(int percent) {
     ctx_rectangle(ctx, -120, 20, 240 * percent / 100, 20);
     ctx_fill(ctx);
 
-    st3m_ctx_end_frame(ctx);
+    st3m_gfx_end_frame(ctx);
 }
 
 void rec_menu_draw(menu_t *menu) {
-    Ctx *ctx = st3m_ctx(st3m_gfx_default);
+    Ctx *ctx = st3m_gfx_ctx(st3m_gfx_default);
     _header_draw(ctx);
 
     int y = -20;
@@ -112,7 +112,7 @@ void rec_menu_draw(menu_t *menu) {
         y += 18;
     }
 
-    st3m_ctx_end_frame(ctx);
+    st3m_gfx_end_frame(ctx);
 }
 
 void rec_menu_process(menu_t *menu) {