diff --git a/sim/fakes/_sim.py b/sim/fakes/_sim.py
index 087dcd2bc8c37b317f9356f95cfd1dc203d99d6a..0881f3136b1f2ec7160f25884dd1c16597bf8776 100644
--- a/sim/fakes/_sim.py
+++ b/sim/fakes/_sim.py
@@ -546,11 +546,23 @@ _sim = Simulation()
 class FramebufferManager:
     def __init__(self):
         self._free = []
+
+        # Significant difference between on-device Ctx and simulation Ctx: the
+        # output is a BGRA8 (24bpp color + 8bpp alpha) buffer instead of 16bpp
+        # RGB565 like the device uses. This allows us to directly blit it into
+        # pygame's surfaces, which is a _huge_ speed benefit (difference
+        # between ~10FPS and 500+FPS!). The drawing contexts below are RGBA8.
+
         for _ in range(1):
-            fb, c = ctx._wasm.ctx_new_for_framebuffer(240, 240)
+            fb, c = ctx._wasm.ctx_new_for_framebuffer(240, 240, 240 * 4, ctx.RGBA8)
             ctx._wasm.ctx_apply_transform(c, 1, 0, 120, 0, 1, 120, 0, 0, 1)
             self._free.append((fb, c))
 
+        self._overlay = ctx._wasm.ctx_new_for_framebuffer(240, 240, 240 * 4, ctx.RGBA8)
+        ctx._wasm.ctx_apply_transform(self._overlay[1], 1, 0, 120, 0, 1, 120, 0, 0, 1)
+        # Composition target filled by draw(); note BGRA8 rather than RGBA8.
+        self._output = ctx._wasm.ctx_new_for_framebuffer(240, 240, 240 * 4, ctx.BGRA8)
+
     def get(self):
         if len(self._free) == 0:
             return None, None
@@ -562,8 +574,42 @@ class FramebufferManager:
     def put(self, fb, ctx):
         self._free.append((fb, ctx))
 
+    def get_overlay(self):
+        return self._overlay  # (framebuffer pointer, ctx) pair built in __init__
+
+    def get_output(self, fbp):  # fbp is accepted but not used
+        return self._output  # (framebuffer pointer, ctx) pair built in __init__
+
+    def draw(self, fb):
+        """
+        Compose the output: the framebuffer at pointer `fb` first, then the
+        overlay on top unless the overlay clip has zero width or height.
+        """
+        wasm = ctx._wasm
+        out = self._output[1]
+
+        # Base layer, drawn with "copy" compositing.
+        wasm.ctx_define_texture(out, "!fb", 240, 240, 240 * 4, ctx.RGBA8, fb, 0)
+        wasm.ctx_parse(out, "compositingMode copy")
+        wasm.ctx_draw_texture(out, "!fb", 0, 0, 240, 240)
+
+        if not (overlay_clip[2] and overlay_clip[3]):
+            return
+
+        ofb = self._overlay[0]
+        wasm.ctx_define_texture(out, "!overlay", 240, 240, 240 * 4, ctx.RGBA8, ofb, 0)
+        wasm.ctx_parse(out, "compositingMode sourceOver")
+        wasm.ctx_draw_texture(out, "!overlay", 0, 0, 240, 240)
+
 
 fbm = FramebufferManager()
+overlay_ctxs = []  # overlay drawlists awaiting display_update()
+overlay_clip = (0, 0, 240, 240)  # (x, y, width, height); see set_overlay_clip()
+
+
+def set_overlay_clip(x, y, x2, y2):  # takes two corner points
+    global overlay_clip
+    overlay_clip = (x, y, x2 - x, y2 - y)  # stored as (x, y, width, height)
 
 
 def get_ctx():
@@ -573,19 +619,32 @@ def get_ctx():
 
 def get_overlay_ctx():
     dctx = ctx._wasm.ctx_new_drawlist(240, 240)
+    overlay_ctxs.append(dctx)  # lets display_update() route it to the overlay
     return ctx.Context(dctx)
 
 
 def display_update(subctx):
     _sim.process_events()
+    # Overlay drawlists only update the overlay buffer; nothing is presented.
+    if subctx._ctx in overlay_ctxs:
+        overlay_ctxs.remove(subctx._ctx)
+        fbp, c = fbm.get_overlay()
+        ctx._wasm.ctx_render_ctx(subctx._ctx, c)
+        ctx._wasm.ctx_destroy(subctx._ctx)
+        return
+
     fbp, c = fbm.get()
+
     if fbp is None:
         return
 
     ctx._wasm.ctx_render_ctx(subctx._ctx, c)
     ctx._wasm.ctx_destroy(subctx._ctx)
 
-    fb = ctx._wasm._i.exports.memory.uint8_view(fbp)
+    fbm.draw(fbp)
+    # Present the composited output buffer rather than the raw framebuffer.
+    fb = ctx._wasm._i.exports.memory.uint8_view(fbm.get_output(fbp)[0])
+
     _sim.render_display(fb)
     _sim.render_gui_now()
 
diff --git a/sim/fakes/ctx.py b/sim/fakes/ctx.py
index 742455da344b331d4d1c59028f54c3a82e767a07..2ba82ec6ad5debd9341ab3f6f1448bf4e77047f2 100644
--- a/sim/fakes/ctx.py
+++ b/sim/fakes/ctx.py
@@ -46,20 +46,14 @@ class Wasm:
         self._i.exports.ctx_parse(ctx, p)
         self.free(p)
 
-    def ctx_new_for_framebuffer(self, width, height):
+    def ctx_new_for_framebuffer(self, width, height, stride, format):
         """
         Call ctx_new_for_framebuffer, but also first allocate the underlying
         framebuffer and return it alongside the Ctx*.
         """
-        fb = self.malloc(width * height * 4)
-        # Significant difference between on-device Ctx and simulation Ctx: we
-        # render to a BRGA8 (24bpp color + 8bpp alpha) buffer instead of 16bpp
-        # RGB565 like the device does. This allows us to directly blit the ctx
-        # framebuffer into pygame's surfaces, which is a _huge_ speed benefit
-        # (difference between ~10FPS and 500+FPS!).
-        BRGA8 = 5
+        fb = self.malloc(stride * height)  # stride: bytes per framebuffer row
         return fb, self._i.exports.ctx_new_for_framebuffer(
-            fb, width, height, width * 4, BRGA8
+            fb, width, height, stride, format
         )
 
     def ctx_new_drawlist(self, width, height):
@@ -69,6 +63,29 @@ class Wasm:
         args = [float(a) for a in args]
         return self._i.exports.ctx_apply_transform(ctx, *args)
 
+    def ctx_define_texture(self, ctx, eid, *args):
+        """Call ctx_define_texture, passing `eid` as a temporary C string."""
+        raw = eid.encode("utf-8")
+        end = len(raw)
+        p = self.malloc(end + 1)
+        view = self._i.exports.memory.uint8_view(p)
+        view[0:end], view[end] = raw, 0  # copy the bytes, then NUL-terminate
+        res = self._i.exports.ctx_define_texture(ctx, p, *args)
+        self.free(p)
+        return res
+
+    def ctx_draw_texture(self, ctx, eid, *args):
+        """Call ctx_draw_texture with `eid` as a C string and float arguments."""
+        raw = eid.encode("utf-8")
+        end = len(raw)
+        p = self.malloc(end + 1)
+        view = self._i.exports.memory.uint8_view(p)
+        view[0:end], view[end] = raw, 0  # copy the bytes, then NUL-terminate
+        floats = [float(a) for a in args]
+        res = self._i.exports.ctx_draw_texture(ctx, p, *floats)
+        self.free(p)
+        return res
+
     def ctx_text_width(self, ctx, text):
         s = text.encode("utf-8")
         slen = len(s) + 1
@@ -386,3 +403,8 @@ class Context:
         self.line_to(-130, 130)
         self.line_to(-130, 0)
         return self
+
+
+RGBA8 = 4  # pixel format code, passed as `format` to ctx_new_for_framebuffer
+BGRA8 = 5  # pixel format code; same value the old hard-coded constant used
+RGB565_BYTESWAPPED = 7  # presumably the on-device 16bpp format - TODO confirm
diff --git a/sim/fakes/sys_display.py b/sim/fakes/sys_display.py
index a967825791265772388351a389277d2349d7288c..acac485d2af60555c04e7ea215d1fe05fa31d060 100644
--- a/sim/fakes/sys_display.py
+++ b/sim/fakes/sys_display.py
@@ -9,12 +9,8 @@ def pipe_available():
     return True
 
 
-def overlay_clip(x0, y0, x1, y1):
-    pass
-
-
 def get_mode():
-    return 0
+    return osd  # module-level constant (256) defined further down
 
 
 def set_mode(no):
@@ -40,10 +36,13 @@ def fps():
 update = _sim.display_update
 get_ctx = _sim.get_ctx
 get_overlay_ctx = _sim.get_overlay_ctx
+overlay_clip = _sim.set_overlay_clip
 osd = 256
 
 
 def ctx(foo):
+    if foo == osd:  # the osd mode value selects an overlay drawlist context
+        return _sim.get_overlay_ctx()
     return _sim.get_ctx()
 
 
diff --git a/sim/wasm/build.sh b/sim/wasm/build.sh
index 8055e671292427304a0621a3d28b7122cc249f0b..b66dbc532b845b40095811493468df3389c6906e 100755
--- a/sim/wasm/build.sh
+++ b/sim/wasm/build.sh
@@ -18,7 +18,7 @@ emcc ctx.c \
     -I ../../components/ctx/ \
     -I ../../components/ctx/fonts/ \
     -D SIMULATOR \
-    -s EXPORTED_FUNCTIONS=_ctx_new_for_framebuffer,_ctx_new_drawlist,_ctx_parse,_ctx_apply_transform,_ctx_text_width,_ctx_render_ctx,_ctx_destroy,_malloc,_free \
+    -s EXPORTED_FUNCTIONS=_ctx_new_for_framebuffer,_ctx_new_drawlist,_ctx_parse,_ctx_apply_transform,_ctx_text_width,_ctx_render_ctx,_ctx_define_texture,_ctx_draw_texture,_ctx_destroy,_malloc,_free \
     --no-entry -flto -O3 \
     -o ctx.wasm
 
diff --git a/sim/wasm/ctx.wasm b/sim/wasm/ctx.wasm
index 40f825db8028f177ffaecdef08caca646305f13e..a7bbe5dd123872f32db40f6074af8c82b8ae5b11 100755
Binary files a/sim/wasm/ctx.wasm and b/sim/wasm/ctx.wasm differ