From ade9a052365247be5ed4ce8b53c2164a576a4a05 Mon Sep 17 00:00:00 2001
From: Damien George <damien.p.george@gmail.com>
Date: Sat, 13 Jun 2015 21:53:22 +0100
Subject: [PATCH] py: Improve allocation policy of qstr data.

Previous to this patch all interned strings lived in their own malloc'd
chunk.  On average this wastes N/2 bytes per interned string, where N is
the number-of-bytes for a quanta of the memory allocator (16 bytes on 32
bit archs).

With this patch interned strings are concatenated into the same malloc'd
chunk when possible.  Such chunks are enlarged inplace when possible,
and shrunk to fit when a new chunk is needed.

RAM savings with this patch are highly varied, but should always show an
improvement (unless only 3 or 4 strings are interned).  New version
typically uses about 70% of previous memory for the qstr data, and can
lead to savings of around 10% of total memory footprint of a running
script.

Costs about 120 bytes code size on Thumb2 archs (depends on how many
calls to gc_realloc are made).
---
 py/gc.c        |  7 ++++++-
 py/gc.h        |  2 +-
 py/malloc.c    | 11 +++++++----
 py/misc.h      |  8 ++++----
 py/mpconfig.h  |  8 ++++++++
 py/mpstate.h   |  6 ++++++
 py/objexcept.c |  2 +-
 py/parse.c     |  4 ++--
 py/qstr.c      | 41 ++++++++++++++++++++++++++++++++++++++++-
 9 files changed, 75 insertions(+), 14 deletions(-)

diff --git a/py/gc.c b/py/gc.c
index 6cdafc03b..7283e5f1f 100644
--- a/py/gc.c
+++ b/py/gc.c
@@ -522,7 +522,7 @@ void *gc_realloc(void *ptr, mp_uint_t n_bytes) {
 
 #else // Alternative gc_realloc impl
 
-void *gc_realloc(void *ptr_in, mp_uint_t n_bytes) {
+void *gc_realloc(void *ptr_in, mp_uint_t n_bytes, bool allow_move) {
     if (MP_STATE_MEM(gc_lock_depth) > 0) {
         return NULL;
     }
@@ -624,6 +624,11 @@ void *gc_realloc(void *ptr_in, mp_uint_t n_bytes) {
         return ptr_in;
     }
 
+    if (!allow_move) {
+        // not allowed to move memory block so return failure
+        return NULL;
+    }
+
     // can't resize inplace; try to find a new contiguous chain
     void *ptr_out = gc_alloc(n_bytes,
 #if MICROPY_ENABLE_FINALISER
diff --git a/py/gc.h b/py/gc.h
index bb7e2d440..c61892f20 100644
--- a/py/gc.h
+++ b/py/gc.h
@@ -48,7 +48,7 @@ void gc_collect_end(void);
 void *gc_alloc(mp_uint_t n_bytes, bool has_finaliser);
 void gc_free(void *ptr);
 mp_uint_t gc_nbytes(const void *ptr);
-void *gc_realloc(void *ptr, mp_uint_t n_bytes);
+void *gc_realloc(void *ptr, mp_uint_t n_bytes, bool allow_move);
 
 typedef struct _gc_info_t {
     mp_uint_t total;
diff --git a/py/malloc.c b/py/malloc.c
index f8728c596..0aecbd71d 100644
--- a/py/malloc.c
+++ b/py/malloc.c
@@ -56,7 +56,10 @@
 #define malloc(b) gc_alloc((b), false)
 #define malloc_with_finaliser(b) gc_alloc((b), true)
 #define free gc_free
-#define realloc gc_realloc
+#define realloc(ptr, n) gc_realloc(ptr, n, true)
+#define realloc_ext(ptr, n, mv) gc_realloc(ptr, n, mv)
+#else
+#define realloc_ext(ptr, n, mv) realloc(ptr, n)
 #endif // MICROPY_ENABLE_GC
 
 void *m_malloc(size_t num_bytes) {
@@ -134,11 +137,11 @@ void *m_realloc(void *ptr, size_t new_num_bytes) {
 }
 
 #if MICROPY_MALLOC_USES_ALLOCATED_SIZE
-void *m_realloc_maybe(void *ptr, size_t old_num_bytes, size_t new_num_bytes) {
+void *m_realloc_maybe(void *ptr, size_t old_num_bytes, size_t new_num_bytes, bool allow_move) {
 #else
-void *m_realloc_maybe(void *ptr, size_t new_num_bytes) {
+void *m_realloc_maybe(void *ptr, size_t new_num_bytes, bool allow_move) {
 #endif
-    void *new_ptr = realloc(ptr, new_num_bytes);
+    void *new_ptr = realloc_ext(ptr, new_num_bytes, allow_move);
 #if MICROPY_MEM_STATS
     // At first thought, "Total bytes allocated" should only grow,
     // after all, it's *total*. But consider for example 2K block
diff --git a/py/misc.h b/py/misc.h
index 91bd00096..1a0c085ac 100644
--- a/py/misc.h
+++ b/py/misc.h
@@ -63,12 +63,12 @@ typedef unsigned int uint;
 #endif
 #if MICROPY_MALLOC_USES_ALLOCATED_SIZE
 #define m_renew(type, ptr, old_num, new_num) ((type*)(m_realloc((ptr), sizeof(type) * (old_num), sizeof(type) * (new_num))))
-#define m_renew_maybe(type, ptr, old_num, new_num) ((type*)(m_realloc_maybe((ptr), sizeof(type) * (old_num), sizeof(type) * (new_num))))
+#define m_renew_maybe(type, ptr, old_num, new_num, allow_move) ((type*)(m_realloc_maybe((ptr), sizeof(type) * (old_num), sizeof(type) * (new_num), (allow_move))))
 #define m_del(type, ptr, num) m_free(ptr, sizeof(type) * (num))
 #define m_del_var(obj_type, var_type, var_num, ptr) (m_free(ptr, sizeof(obj_type) + sizeof(var_type) * (var_num)))
 #else
 #define m_renew(type, ptr, old_num, new_num) ((type*)(m_realloc((ptr), sizeof(type) * (new_num))))
-#define m_renew_maybe(type, ptr, old_num, new_num) ((type*)(m_realloc_maybe((ptr), sizeof(type) * (new_num))))
+#define m_renew_maybe(type, ptr, old_num, new_num, allow_move) ((type*)(m_realloc_maybe((ptr), sizeof(type) * (new_num), (allow_move))))
 #define m_del(type, ptr, num) ((void)(num), m_free(ptr))
 #define m_del_var(obj_type, var_type, var_num, ptr) ((void)(var_num), m_free(ptr))
 #endif
@@ -80,11 +80,11 @@ void *m_malloc_with_finaliser(size_t num_bytes);
 void *m_malloc0(size_t num_bytes);
 #if MICROPY_MALLOC_USES_ALLOCATED_SIZE
 void *m_realloc(void *ptr, size_t old_num_bytes, size_t new_num_bytes);
-void *m_realloc_maybe(void *ptr, size_t old_num_bytes, size_t new_num_bytes);
+void *m_realloc_maybe(void *ptr, size_t old_num_bytes, size_t new_num_bytes, bool allow_move);
 void m_free(void *ptr, size_t num_bytes);
 #else
 void *m_realloc(void *ptr, size_t new_num_bytes);
-void *m_realloc_maybe(void *ptr, size_t new_num_bytes);
+void *m_realloc_maybe(void *ptr, size_t new_num_bytes, bool allow_move);
 void m_free(void *ptr);
 #endif
 void *m_malloc_fail(size_t num_bytes);
diff --git a/py/mpconfig.h b/py/mpconfig.h
index e92e4c45d..0b86ec435 100644
--- a/py/mpconfig.h
+++ b/py/mpconfig.h
@@ -75,6 +75,14 @@
 #define MICROPY_ALLOC_GC_STACK_SIZE (64)
 #endif
 
+// Number of bytes to allocate initially when creating new chunks to store
+// interned string data.  Smaller numbers lead to more chunks being needed
+// and more wastage at the end of the chunk.  Larger numbers lead to wasted
+// space at the end when no more strings need interning.
+#ifndef MICROPY_ALLOC_QSTR_CHUNK_INIT
+#define MICROPY_ALLOC_QSTR_CHUNK_INIT (128)
+#endif
+
 // Initial amount for lexer indentation level
 #ifndef MICROPY_ALLOC_LEXER_INDENT_INIT
 #define MICROPY_ALLOC_LEXER_INDENT_INIT (10)
diff --git a/py/mpstate.h b/py/mpstate.h
index 42593e4c4..dd185a7c2 100644
--- a/py/mpstate.h
+++ b/py/mpstate.h
@@ -131,6 +131,12 @@ typedef struct _mp_state_vm_t {
     // END ROOT POINTER SECTION
     ////////////////////////////////////////////////////////////
 
+    // pointer and sizes to store interned string data
+    // (qstr_last_chunk can be root pointer but is also stored in qstr pool)
+    byte *qstr_last_chunk;
+    mp_uint_t qstr_last_alloc;
+    mp_uint_t qstr_last_used;
+
     // Stack top at the start of program
     // Note: this entry is used to locate the end of the root pointer section.
     char *stack_top;
diff --git a/py/objexcept.c b/py/objexcept.c
index 75369bac7..97536c6f4 100644
--- a/py/objexcept.c
+++ b/py/objexcept.c
@@ -444,7 +444,7 @@ void mp_obj_exception_add_traceback(mp_obj_t self_in, qstr file, mp_uint_t line,
         self->traceback_len = 0;
     } else if (self->traceback_len + 3 > self->traceback_alloc) {
         // be conservative with growing traceback data
-        mp_uint_t *tb_data = m_renew_maybe(mp_uint_t, self->traceback_data, self->traceback_alloc, self->traceback_alloc + 3);
+        mp_uint_t *tb_data = m_renew_maybe(mp_uint_t, self->traceback_data, self->traceback_alloc, self->traceback_alloc + 3, true);
         if (tb_data == NULL) {
             return;
         }
diff --git a/py/parse.c b/py/parse.c
index 35f5f8243..c00c590e9 100644
--- a/py/parse.c
+++ b/py/parse.c
@@ -135,7 +135,7 @@ STATIC void push_rule(parser_t *parser, mp_uint_t src_line, const rule_t *rule,
         return;
     }
     if (parser->rule_stack_top >= parser->rule_stack_alloc) {
-        rule_stack_t *rs = m_renew_maybe(rule_stack_t, parser->rule_stack, parser->rule_stack_alloc, parser->rule_stack_alloc + MICROPY_ALLOC_PARSE_RULE_INC);
+        rule_stack_t *rs = m_renew_maybe(rule_stack_t, parser->rule_stack, parser->rule_stack_alloc, parser->rule_stack_alloc + MICROPY_ALLOC_PARSE_RULE_INC, true);
         if (rs == NULL) {
             memory_error(parser);
             return;
@@ -293,7 +293,7 @@ STATIC void push_result_node(parser_t *parser, mp_parse_node_t pn) {
         return;
     }
     if (parser->result_stack_top >= parser->result_stack_alloc) {
-        mp_parse_node_t *stack = m_renew_maybe(mp_parse_node_t, parser->result_stack, parser->result_stack_alloc, parser->result_stack_alloc + MICROPY_ALLOC_PARSE_RESULT_INC);
+        mp_parse_node_t *stack = m_renew_maybe(mp_parse_node_t, parser->result_stack, parser->result_stack_alloc, parser->result_stack_alloc + MICROPY_ALLOC_PARSE_RESULT_INC, true);
         if (stack == NULL) {
             memory_error(parser);
             return;
diff --git a/py/qstr.c b/py/qstr.c
index b1ae6ac74..dd04ae8f7 100644
--- a/py/qstr.c
+++ b/py/qstr.c
@@ -92,6 +92,7 @@ STATIC const qstr_pool_t const_pool = {
 
 void qstr_init(void) {
     MP_STATE_VM(last_pool) = (qstr_pool_t*)&const_pool; // we won't modify the const_pool since it has no allocated room left
+    MP_STATE_VM(qstr_last_chunk) = NULL;
 }
 
 STATIC const byte *find_qstr(qstr q) {
@@ -152,8 +153,46 @@ qstr qstr_from_strn(const char *str, mp_uint_t len) {
     assert(len < (1 << (8 * MICROPY_QSTR_BYTES_IN_LEN)));
     qstr q = qstr_find_strn(str, len);
     if (q == 0) {
+        // qstr does not exist in interned pool so need to add it
+
+        // compute number of bytes needed to intern this string
+        mp_uint_t n_bytes = 2 + MICROPY_QSTR_BYTES_IN_LEN + len + 1;
+
+        if (MP_STATE_VM(qstr_last_chunk) != NULL && MP_STATE_VM(qstr_last_used) + n_bytes > MP_STATE_VM(qstr_last_alloc)) {
+            // not enough room at end of previously interned string so try to grow
+            byte *new_p = m_renew_maybe(byte, MP_STATE_VM(qstr_last_chunk), MP_STATE_VM(qstr_last_alloc), MP_STATE_VM(qstr_last_alloc) + n_bytes, false);
+            if (new_p == NULL) {
+                // could not grow existing memory; shrink it to fit previous
+                (void)m_renew(byte, MP_STATE_VM(qstr_last_chunk), MP_STATE_VM(qstr_last_alloc), MP_STATE_VM(qstr_last_used));
+                MP_STATE_VM(qstr_last_chunk) = NULL;
+            } else {
+                // could grow existing memory
+                MP_STATE_VM(qstr_last_alloc) += n_bytes;
+            }
+        }
+
+        if (MP_STATE_VM(qstr_last_chunk) == NULL) {
+            // no existing memory for the interned string so allocate a new chunk
+            mp_uint_t al = n_bytes;
+            if (al < MICROPY_ALLOC_QSTR_CHUNK_INIT) {
+                al = MICROPY_ALLOC_QSTR_CHUNK_INIT;
+            }
+            MP_STATE_VM(qstr_last_chunk) = m_new_maybe(byte, al);
+            if (MP_STATE_VM(qstr_last_chunk) == NULL) {
+                // failed to allocate a large chunk so try with exact size
+                MP_STATE_VM(qstr_last_chunk) = m_new(byte, n_bytes);
+                al = n_bytes;
+            }
+            MP_STATE_VM(qstr_last_alloc) = al;
+            MP_STATE_VM(qstr_last_used) = 0;
+        }
+
+        // allocate memory from the chunk for this new interned string's data
+        byte *q_ptr = MP_STATE_VM(qstr_last_chunk) + MP_STATE_VM(qstr_last_used);
+        MP_STATE_VM(qstr_last_used) += n_bytes;
+
+        // store the interned strings' data
         mp_uint_t hash = qstr_compute_hash((const byte*)str, len);
-        byte *q_ptr = m_new(byte, 2 + MICROPY_QSTR_BYTES_IN_LEN + len + 1);
         q_ptr[0] = hash;
         q_ptr[1] = hash >> 8;
         Q_SET_LENGTH(q_ptr, len);
-- 
GitLab