diff --git a/py/runtime.c b/py/runtime.c
index b0f407a150dd415ec3fd20080f9dd48c64a955d9..e4a4d5b3f5fd6750d8fa98a36ded6385fd276e44 100644
--- a/py/runtime.c
+++ b/py/runtime.c
@@ -61,7 +61,6 @@ const mp_obj_module_t mp_module___main__ = {
 
 void mp_init(void) {
     qstr_init();
-    mp_stack_ctrl_init();
 
     // no pending exceptions to start with
     MP_STATE_VM(mp_pending_exception) = MP_OBJ_NULL;
diff --git a/qemu-arm/main.c b/qemu-arm/main.c
index f29547523410281ee3cc9b6b58c6043c7f51c788..b6ff73980c89d58b86794769783881433887a30d 100644
--- a/qemu-arm/main.c
+++ b/qemu-arm/main.c
@@ -32,6 +32,7 @@ void do_str(const char *src, mp_parse_input_kind_t input_kind) {
 }
 
 int main(int argc, char **argv) {
+    mp_stack_ctrl_init();
     mp_stack_set_limit(10240);
     void *heap = malloc(16 * 1024);
     gc_init(heap, (char*)heap + 16 * 1024);
diff --git a/qemu-arm/test_main.c b/qemu-arm/test_main.c
index a98e275cee4c51ba2c69f96f3ae6da0b47449bac..44f9cc6663bb079fdc3e63ffff5f1c886ead79a8 100644
--- a/qemu-arm/test_main.c
+++ b/qemu-arm/test_main.c
@@ -49,6 +49,7 @@ end:
 
 int main() {
     const char a[] = {"sim"};
+    mp_stack_ctrl_init();
     mp_stack_set_limit(10240);
     void *heap = malloc(256 * 1024);
     gc_init(heap, (char*)heap + 256 * 1024);
diff --git a/stmhal/main.c b/stmhal/main.c
index cce114605a52d06a983cb0a4d8952a3e4665c50d..0e5fc44af309cf04a3b116ca04b7984004ac2dfa 100644
--- a/stmhal/main.c
+++ b/stmhal/main.c
@@ -335,6 +335,7 @@ int main(void) {
 
     // Stack limit should be less than real stack size, so we have a chance
     // to recover from limit hit.  (Limit is measured in bytes.)
+    mp_stack_ctrl_init();
     mp_stack_set_limit((char*)&_ram_end - (char*)&_heap_end - 1024);
 
     /* STM32F4xx HAL library initialization:
diff --git a/teensy/main.c b/teensy/main.c
index b630e88867f3e2dd6d534acb84742acfd3dd1e24..41e445cb595b547c31099ea1b2d7f53ed99f252a 100644
--- a/teensy/main.c
+++ b/teensy/main.c
@@ -253,6 +253,7 @@ int main(void) {
     #define SCB_CCR_STKALIGN (1 << 9)
     SCB_CCR |= SCB_CCR_STKALIGN;
 
+    mp_stack_ctrl_init();
     mp_stack_set_limit(10240);
 
     pinMode(LED_BUILTIN, OUTPUT);
diff --git a/unix/main.c b/unix/main.c
index 03902a3e9a50c6c9f6c770835fb1e872106b21ed..4ba68dcb9b778c4229faff1e7f2bc346a839f22d 100644
--- a/unix/main.c
+++ b/unix/main.c
@@ -376,7 +376,19 @@ STATIC void set_sys_argv(char *argv[], int argc, int start_arg) {
 #define PATHLIST_SEP_CHAR ':'
 #endif
 
+MP_NOINLINE int main_(int argc, char **argv);
+
 int main(int argc, char **argv) {
+    // Capture the top of the stack as early as possible after startup: it
+    // must be recorded before any other stack variables are allocated. To
+    // guarantee that, the real main (renamed main_) must not be inlined into
+    // this function. main_() may itself have other functions inlined (with
+    // their own stack variables), which is why main/main_ are split.
+    mp_stack_ctrl_init();
+    return main_(argc, argv);
+}
+
+MP_NOINLINE int main_(int argc, char **argv) {
     mp_stack_set_limit(40000 * (BYTES_PER_WORD / 4));
 
     pre_process_options(argc, argv);
diff --git a/unix/mpconfigport.h b/unix/mpconfigport.h
index 2f992fdf03f40e3dc06e8e9760b4645b36a71ff4..f7fdeec07c94966696488ea0d0596b3b7c48641c 100644
--- a/unix/mpconfigport.h
+++ b/unix/mpconfigport.h
@@ -214,6 +214,10 @@ void mp_unix_mark_exec(void);
 #define MP_PLAT_ALLOC_EXEC(min_size, ptr, size) mp_unix_alloc_exec(min_size, ptr, size)
 #define MP_PLAT_FREE_EXEC(ptr, size) mp_unix_free_exec(ptr, size)
 
+#ifndef MP_NOINLINE // keep any definition that was provided earlier
+#define MP_NOINLINE __attribute__((noinline)) // GCC-style attribute asking the compiler not to inline the function
+#endif
+
 #if MICROPY_PY_OS_DUPTERM
 #define MP_PLAT_PRINT_STRN(str, len) mp_hal_stdout_tx_strn_cooked(str, len)
 #else
diff --git a/windows/mpconfigport.h b/windows/mpconfigport.h
index 80166f1495aec4cd251cc798cbb3db6054c46e84..ad79ef3813bbf0dcee08af3860704e478bed6837 100644
--- a/windows/mpconfigport.h
+++ b/windows/mpconfigport.h
@@ -178,6 +178,10 @@ extern const struct _mp_obj_module_t mp_module_time;
 #include "init.h"
 #include "sleep.h"
 
+#ifdef __GNUC__ // GCC-style attribute syntax; the __declspec form is defined separately in the CL-specific overrides
+#define MP_NOINLINE __attribute__((noinline))
+#endif
+
 // MSVC specifics
 #ifdef _MSC_VER
 
@@ -191,6 +195,7 @@ extern const struct _mp_obj_module_t mp_module_time;
 // CL specific overrides from mpconfig
 
 #define NORETURN                    __declspec(noreturn)
+#define MP_NOINLINE                 __declspec(noinline)
 #define MP_LIKELY(x)                (x)
 #define MP_UNLIKELY(x)              (x)
 #define MICROPY_PORT_CONSTANTS      { "dummy", 0 } //can't have zero-sized array