Skip to content
Snippets Groups Projects
Commit 1694bc73 authored by Damien George's avatar Damien George
Browse files

py: Add stream reading of n unicode chars; unicode support by default.

With unicode enabled, this patch allows reading a fixed number of
characters from text-mode streams; e.g. file.read(5) will read 5 unicode
chars, which can be made up of more than 5 bytes.

For an ASCII stream (ie no chars > 127) it only needs to do 1 read.  If
there are lots of non-ASCII chars in a stream, then it needs multiple
reads of the underlying object.

Adds a new test for this case.  Enables unicode support by default on
unix and stmhal ports.
parent 02bc882c
No related branches found
No related tags found
No related merge requests found
......@@ -67,6 +67,9 @@ STATIC mp_obj_t stream_read(uint n_args, const mp_obj_t *args) {
nlr_raise(mp_obj_new_exception_msg(&mp_type_OSError, "Operation not supported"));
}
// What to do if sz < -1? Python docs don't specify this case.
// CPython does a readall, but here we silently let negatives through,
// and they will cause a MemoryError.
mp_int_t sz;
if (n_args == 1 || ((sz = mp_obj_get_int(args[1])) == -1)) {
return stream_readall(args[0]);
......@@ -74,7 +77,90 @@ STATIC mp_obj_t stream_read(uint n_args, const mp_obj_t *args) {
#if MICROPY_PY_BUILTINS_STR_UNICODE
if (!o->type->stream_p->is_bytes) {
mp_not_implemented("Reading from unicode text streams by character count");
// We need to read sz number of unicode characters. Because we don't have any
// buffering, and because the stream API can only read bytes, we must read here
// in units of bytes and must never over read. If we want sz chars, then reading
// sz bytes will never over-read, so we follow this approach, in a loop to keep
// reading until we have exactly enough chars. This will be 1 read for text
// with ASCII-only chars, and about 2 reads for text with a couple of non-ASCII
// chars. For text with lots of non-ASCII chars, it'll be pretty inefficient
// in time and memory.
vstr_t vstr;
vstr_init(&vstr, sz);
mp_uint_t more_bytes = sz;
mp_uint_t last_buf_offset = 0;
while (more_bytes > 0) {
char *p = vstr_add_len(&vstr, more_bytes);
if (p == NULL) {
nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_MemoryError, "out of memory"));
}
int error;
mp_int_t out_sz = o->type->stream_p->read(o, p, more_bytes, &error);
if (out_sz == -1) {
vstr_cut_tail_bytes(&vstr, more_bytes);
if (is_nonblocking_error(error)) {
// With non-blocking streams, we read as much as we can.
// If we read nothing, return None, just like read().
// Otherwise, return data read so far.
// TODO what if we have read only half a non-ASCII char?
if (vstr.len == 0) {
vstr_clear(&vstr);
return mp_const_none;
}
break;
}
nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_OSError, "[Errno %d]", error));
}
if (out_sz == 0) {
// Finish reading.
// TODO what if we have read only half a non-ASCII char?
vstr_cut_tail_bytes(&vstr, more_bytes);
break;
}
// count chars from bytes just read
for (mp_uint_t off = last_buf_offset;;) {
byte b = vstr.buf[off];
int n;
if (!UTF8_IS_NONASCII(b)) {
// 1-byte ASCII char
n = 1;
} else if ((b & 0xe0) == 0xc0) {
// 2-byte char
n = 2;
} else if ((b & 0xf0) == 0xe0) {
// 3-byte char
n = 3;
} else if ((b & 0xf8) == 0xf0) {
// 4-byte char
n = 4;
} else {
// TODO
n = 5;
}
if (off + n <= vstr.len) {
// got a whole char in n bytes
off += n;
sz -= 1;
last_buf_offset = off;
if (off >= vstr.len) {
more_bytes = sz;
break;
}
} else {
// didn't get a whole char, so work out how many extra bytes are needed for
// this partial char, plus bytes for additional chars that we want
more_bytes = (off + n - vstr.len) + (sz - 1);
break;
}
}
}
mp_obj_t ret = mp_obj_new_str_of_type(&mp_type_str, (byte*)vstr.buf, vstr.len);
vstr_clear(&vstr);
return ret;
}
#endif
......
......@@ -44,7 +44,7 @@
*/
#define MICROPY_ENABLE_LFN (1)
#define MICROPY_LFN_CODE_PAGE (437) /* 1=SFN/ANSI 437=LFN/U.S.(OEM) */
#define MICROPY_PY_BUILTINS_STR_UNICODE (0)
#define MICROPY_PY_BUILTINS_STR_UNICODE (1)
#define MICROPY_PY_BUILTINS_FROZENSET (1)
#define MICROPY_PY_SYS_EXIT (1)
#define MICROPY_PY_SYS_STDFILES (1)
......
......@@ -134,7 +134,7 @@ def main():
if args.test_dirs is None:
if pyb is None:
# run PC tests
test_dirs = ('basics', 'micropython', 'float', 'import', 'io', 'misc')
test_dirs = ('basics', 'micropython', 'float', 'import', 'io', 'misc', 'unicode')
else:
# run pyboard tests
test_dirs = ('basics', 'micropython', 'float', 'pyb', 'pybnative', 'inlineasm')
......
aαbβcγdδ
# test reading a given number of characters
def do(mode):
    """Open the UTF-8 sample file in ``mode`` and print four successive reads.

    The reads request 1, 1, 2 and 4 units in that order from the same open
    file object, so each read continues where the previous one stopped. The
    result of every read is printed on its own line, then the file is closed.
    """
    stream = open('unicode/data/utf-8_2.txt', mode)
    # Same request sizes, in the same order, as the expected test output.
    for count in (1, 1, 2, 4):
        print(stream.read(count))
    stream.close()
do('rb')
do('rt')
......@@ -43,7 +43,7 @@
#define MICROPY_LONGINT_IMPL (MICROPY_LONGINT_IMPL_MPZ)
#define MICROPY_STREAMS_NON_BLOCK (1)
#define MICROPY_OPT_COMPUTED_GOTO (1)
#define MICROPY_PY_BUILTINS_STR_UNICODE (0)
#define MICROPY_PY_BUILTINS_STR_UNICODE (1)
#define MICROPY_PY_BUILTINS_FROZENSET (1)
#define MICROPY_PY_SYS_EXIT (1)
#define MICROPY_PY_SYS_PLATFORM "linux"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment