From e3cfc0d33d04a33b6f546f8018991c34768a871d Mon Sep 17 00:00:00 2001
From: Paul Sokolovsky <pfalcon@users.sourceforge.net>
Date: Sat, 14 Jun 2014 06:06:36 +0300
Subject: [PATCH] objstr: Refactor to work with char pointers instead of
 indexes.

In preparation for unicode support.
---
 py/objstr.c | 51 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 31 insertions(+), 20 deletions(-)

diff --git a/py/objstr.c b/py/objstr.c
index f9cc27344..c84d7c900 100644
--- a/py/objstr.c
+++ b/py/objstr.c
@@ -352,6 +352,12 @@ uncomparable:
     return MP_OBJ_NULL; // op not supported
 }
 
+const byte *str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, uint self_len,
+                             mp_obj_t index, bool is_slice) { // Translate the index object into a pointer inside self_data.
+    machine_uint_t index_val = mp_get_index(type, self_len, index, is_slice); // NOTE(review): mp_get_index is defined elsewhere; presumably it validates/normalises index against self_len - confirm
+    return self_data + index_val; // index_val is applied as a byte offset from the start of the data
+}
+
 STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
     mp_obj_type_t *type = mp_obj_get_type(self_in);
     GET_STR_DATA_LEN(self_in, self_data, self_len);
@@ -367,11 +373,11 @@ STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
             return mp_obj_new_str_of_type(type, self_data + slice.start, slice.stop - slice.start);
         }
 #endif
-        uint index_val = mp_get_index(type, self_len, index, false);
+        const byte *p = str_index_to_ptr(type, self_data, self_len, index, false);
         if (type == &mp_type_bytes) {
-            return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)self_data[index_val]);
+            return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)*p);
         } else {
-            return mp_obj_new_str((char*)self_data + index_val, 1, true);
+            return mp_obj_new_str((char*)p, 1, true);
         }
     } else {
         return MP_OBJ_NULL; // op not supported
@@ -567,6 +573,7 @@ STATIC mp_obj_t str_rsplit(uint n_args, const mp_obj_t *args) {
 
 
 STATIC mp_obj_t str_finder(uint n_args, const mp_obj_t *args, machine_int_t direction, bool is_index) {
+    const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
     assert(2 <= n_args && n_args <= 4);
     assert(MP_OBJ_IS_STR(args[0]));
     assert(MP_OBJ_IS_STR(args[1]));
@@ -574,16 +581,16 @@ STATIC mp_obj_t str_finder(uint n_args, const mp_obj_t *args, machine_int_t dire
     GET_STR_DATA_LEN(args[0], haystack, haystack_len);
     GET_STR_DATA_LEN(args[1], needle, needle_len);
 
-    machine_uint_t start = 0;
-    machine_uint_t end = haystack_len;
+    const byte *start = haystack;
+    const byte *end = haystack + haystack_len;
     if (n_args >= 3 && args[2] != mp_const_none) {
-        start = mp_get_index(&mp_type_str, haystack_len, args[2], true);
+        start = str_index_to_ptr(self_type, haystack, haystack_len, args[2], true);
     }
     if (n_args >= 4 && args[3] != mp_const_none) {
-        end = mp_get_index(&mp_type_str, haystack_len, args[3], true);
+        end = str_index_to_ptr(self_type, haystack, haystack_len, args[3], true);
     }
 
-    const byte *p = find_subbytes(haystack + start, end - start, needle, needle_len, direction);
+    const byte *p = find_subbytes(start, end - start, needle, needle_len, direction);
     if (p == NULL) {
         // not found
         if (is_index) {
@@ -615,16 +622,17 @@ STATIC mp_obj_t str_rindex(uint n_args, const mp_obj_t *args) {
 
 // TODO: (Much) more variety in args
 STATIC mp_obj_t str_startswith(uint n_args, const mp_obj_t *args) {
+    const mp_obj_type_t *self_type = mp_obj_get_type(args[0]); // type of self; handed to the index helper instead of a hard-coded &mp_type_str
     GET_STR_DATA_LEN(args[0], str, str_len);
     GET_STR_DATA_LEN(args[1], prefix, prefix_len);
-    uint index_val = 0;
+    const byte *start = str; // default: compare from the first byte of str
     if (n_args > 2) {
-        index_val = mp_get_index(&mp_type_str, str_len, args[2], true);
+        start = str_index_to_ptr(self_type, str, str_len, args[2], true); // optional third argument selects where the comparison begins
     }
-    if (prefix_len + index_val > str_len) {
+    if (prefix_len + (start - str) > str_len) { // prefix is longer than what remains after start, so it cannot match
         return mp_const_false;
     }
-    return MP_BOOL(memcmp(str + index_val, prefix, prefix_len) == 0);
+    return MP_BOOL(memcmp(start, prefix, prefix_len) == 0); // byte-wise comparison of prefix_len bytes beginning at start
 }
 
 STATIC mp_obj_t str_endswith(uint n_args, const mp_obj_t *args) {
@@ -1422,6 +1430,7 @@ STATIC mp_obj_t str_replace(uint n_args, const mp_obj_t *args) {
 }
 
 STATIC mp_obj_t str_count(uint n_args, const mp_obj_t *args) {
+    const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
     assert(2 <= n_args && n_args <= 4);
     assert(MP_OBJ_IS_STR(args[0]));
     assert(MP_OBJ_IS_STR(args[1]));
@@ -1429,26 +1438,28 @@ STATIC mp_obj_t str_count(uint n_args, const mp_obj_t *args) {
     GET_STR_DATA_LEN(args[0], haystack, haystack_len);
     GET_STR_DATA_LEN(args[1], needle, needle_len);
 
-    machine_uint_t start = 0;
-    machine_uint_t end = haystack_len;
+    const byte *start = haystack;
+    const byte *end = haystack + haystack_len;
     if (n_args >= 3 && args[2] != mp_const_none) {
-        start = mp_get_index(&mp_type_str, haystack_len, args[2], true);
+        start = str_index_to_ptr(self_type, haystack, haystack_len, args[2], true);
     }
     if (n_args >= 4 && args[3] != mp_const_none) {
-        end = mp_get_index(&mp_type_str, haystack_len, args[3], true);
+        end = str_index_to_ptr(self_type, haystack, haystack_len, args[3], true);
     }
 
     // if needle_len is zero then we count each gap between characters as an occurrence
     if (needle_len == 0) {
-        return MP_OBJ_NEW_SMALL_INT(end - start + 1);
+        return MP_OBJ_NEW_SMALL_INT(unichar_charlen((const char*)start, end - start) + 1);
     }
 
     // count the occurrences
     machine_int_t num_occurrences = 0;
-    for (machine_uint_t haystack_index = start; haystack_index + needle_len <= end; haystack_index++) {
-        if (memcmp(&haystack[haystack_index], needle, needle_len) == 0) {
+    for (const byte *haystack_ptr = start; haystack_ptr + needle_len <= end;) {
+        if (memcmp(haystack_ptr, needle, needle_len) == 0) {
             num_occurrences++;
-            haystack_index += needle_len - 1;
+            haystack_ptr += needle_len;
+        } else {
+            haystack_ptr = utf8_next_char(haystack_ptr);
         }
     }
 
-- 
GitLab