diff options
Diffstat (limited to 'Source')
-rw-r--r-- | Source/SPParserUtils.c | 45 |
1 files changed, 38 insertions, 7 deletions
diff --git a/Source/SPParserUtils.c b/Source/SPParserUtils.c index b3b48945..69929cee 100644 --- a/Source/SPParserUtils.c +++ b/Source/SPParserUtils.c @@ -31,18 +31,36 @@ #include "SPParserUtils.h" #include <stdint.h> +#define SIZET (sizeof(size_t)) +#define SIZET1 (SIZET - 1) +#define SBYTE (SIZET1 * 8) + #define ONEMASK ((size_t)(-1) / 0xFF) +#define ONEMASK8 (ONEMASK * 0x80) +#define FMASK ((size_t)(-1)*(ONEMASK*0xf)-1) // adapted from http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html size_t utf8strlen(const char * _s) { + + /* Due to [NSString length] behaviour for chars > 0xFFFF {length = 2} + "correct" the variable 'count' by subtraction the number + of occurrences of the start byte 0xF0 (4-byte UTF-8 char). + Here we assume that only up to 4-byte UTF-8 chars + are allowed [latest UTF-8 specification]. + + Marked in the source code by "CORRECT". + */ + const char * s; - size_t count = 0; - size_t u; + long count = 0; + size_t u = 0; + size_t u1 = 0; unsigned char b; + /* Handle any initial misaligned bytes. */ - for (s = _s; (uintptr_t)(s) & (sizeof(size_t) - 1); s++) { + for (s = _s; (uintptr_t)(s) & SIZET1; s++) { b = *s; /* Exit if we hit a zero byte. */ @@ -51,10 +69,13 @@ size_t utf8strlen(const char * _s) /* Is this byte NOT the first byte of a character? */ count += (b >> 7) & ((~b) >> 6); + + /* CORRECT */ + count -= (b & 0xf0) == 0xf0; } /* Handle complete blocks. */ - for (; ; s += sizeof(size_t)) { + for (; ; s += SIZET) { /* Prefetch 256 bytes ahead. */ __builtin_prefetch(&s[256], 0, 0); @@ -62,12 +83,19 @@ size_t utf8strlen(const char * _s) u = *(size_t *)(s); /* Exit the loop if there are any zero bytes. */ - if ((u - ONEMASK) & (~u) & (ONEMASK * 0x80)) + if ((u - ONEMASK) & (~u) & ONEMASK8) break; + /* CORRECT */ + u1 = u & FMASK; + u1 = (u1 >> 7) & (u1 >> 6) & (u1 >> 5) & (u1 >> 4); + if (u1) count -= (u1 * ONEMASK) >> SBYTE; + /* Count bytes which are NOT the first byte of a character. */ - u = ((u & (ONEMASK * 0x80)) >> 7) & ((~u) >> 6); - count += (u * ONEMASK) >> ((sizeof(size_t) - 1) * 8); + u = ((u & ONEMASK8) >> 7) & ((~u) >> 6); + + count += (u * ONEMASK) >> SBYTE; + } /* Take care of any left-over bytes. */ @@ -80,6 +108,9 @@ size_t utf8strlen(const char * _s) /* Is this byte NOT the first byte of a character? */ count += (b >> 7) & ((~b) >> 6); + + /* CORRECT */ + count -= (b & 0xf0) == 0xf0; } done: |