aboutsummaryrefslogtreecommitdiffstats
path: root/Source
diff options
context:
space:
mode:
Diffstat (limited to 'Source')
-rw-r--r--Source/SPParserUtils.c45
1 files changed, 38 insertions, 7 deletions
diff --git a/Source/SPParserUtils.c b/Source/SPParserUtils.c
index b3b48945..69929cee 100644
--- a/Source/SPParserUtils.c
+++ b/Source/SPParserUtils.c
@@ -31,18 +31,36 @@
#include "SPParserUtils.h"
#include <stdint.h>
+#define SIZET (sizeof(size_t))
+#define SIZET1 (SIZET - 1)
+#define SBYTE (SIZET1 * 8)
+
#define ONEMASK ((size_t)(-1) / 0xFF)
+#define ONEMASK8 (ONEMASK * 0x80)
+#define FMASK ((size_t)(-1)*(ONEMASK*0xf)-1)
// adapted from http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html
size_t utf8strlen(const char * _s)
{
+
+ /* Due to [NSString length] behaviour for chars > 0xFFFF {length = 2},
+ "correct" the variable 'count' by subtracting the number of
+ occurrences of a start byte with high nibble 0xF (4-byte UTF-8 char).
+ Here we assume that only up to 4-byte UTF-8 chars
+ are allowed [latest UTF-8 specification].
+
+ Marked in the source code by "CORRECT".
+ */
+
const char * s;
- size_t count = 0;
- size_t u;
+ long count = 0;
+ size_t u = 0;
+ size_t u1 = 0;
unsigned char b;
+
/* Handle any initial misaligned bytes. */
- for (s = _s; (uintptr_t)(s) & (sizeof(size_t) - 1); s++) {
+ for (s = _s; (uintptr_t)(s) & SIZET1; s++) {
b = *s;
/* Exit if we hit a zero byte. */
@@ -51,10 +69,13 @@ size_t utf8strlen(const char * _s)
/* Is this byte NOT the first byte of a character? */
count += (b >> 7) & ((~b) >> 6);
+
+ /* CORRECT */
+ count -= (b & 0xf0) == 0xf0;
}
/* Handle complete blocks. */
- for (; ; s += sizeof(size_t)) {
+ for (; ; s += SIZET) {
/* Prefetch 256 bytes ahead. */
__builtin_prefetch(&s[256], 0, 0);
@@ -62,12 +83,19 @@ size_t utf8strlen(const char * _s)
u = *(size_t *)(s);
/* Exit the loop if there are any zero bytes. */
- if ((u - ONEMASK) & (~u) & (ONEMASK * 0x80))
+ if ((u - ONEMASK) & (~u) & ONEMASK8)
break;
+ /* CORRECT */
+ u1 = u & FMASK;
+ u1 = (u1 >> 7) & (u1 >> 6) & (u1 >> 5) & (u1 >> 4);
+ if (u1) count -= (u1 * ONEMASK) >> SBYTE;
+
/* Count bytes which are NOT the first byte of a character. */
- u = ((u & (ONEMASK * 0x80)) >> 7) & ((~u) >> 6);
- count += (u * ONEMASK) >> ((sizeof(size_t) - 1) * 8);
+ u = ((u & ONEMASK8) >> 7) & ((~u) >> 6);
+
+ count += (u * ONEMASK) >> SBYTE;
+
}
/* Take care of any left-over bytes. */
@@ -80,6 +108,9 @@ size_t utf8strlen(const char * _s)
/* Is this byte NOT the first byte of a character? */
count += (b >> 7) & ((~b) >> 6);
+
+ /* CORRECT */
+ count -= (b & 0xf0) == 0xf0;
}
done: