diff options
-rw-r--r-- | Source/SPParserUtils.c | 45 | ||||
-rw-r--r-- | UnitTests/SPParserUtilsTest.m | 7 | ||||
-rw-r--r-- | sequel-pro.xcodeproj/xcshareddata/xcschemes/Sequel Pro.xcscheme | 9 |
3 files changed, 51 insertions, 10 deletions
diff --git a/Source/SPParserUtils.c b/Source/SPParserUtils.c index b3b48945..69929cee 100644 --- a/Source/SPParserUtils.c +++ b/Source/SPParserUtils.c @@ -31,18 +31,36 @@ #include "SPParserUtils.h" #include <stdint.h> +#define SIZET (sizeof(size_t)) +#define SIZET1 (SIZET - 1) +#define SBYTE (SIZET1 * 8) + #define ONEMASK ((size_t)(-1) / 0xFF) +#define ONEMASK8 (ONEMASK * 0x80) +#define FMASK ((size_t)(-1)*(ONEMASK*0xf)-1) // adapted from http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html size_t utf8strlen(const char * _s) { + + /* Due to [NSString length] behaviour for chars > 0xFFFF {length = 2} + "correct" the variable 'count' by subtracting the number + of occurrences of the start byte 0xF0 (4-byte UTF-8 char). + Here we assume that only up to 4-byte UTF-8 chars + are allowed [latest UTF-8 specification]. + + Marked in the source code by "CORRECT". + */ + const char * s; - size_t count = 0; - size_t u; + long count = 0; + size_t u = 0; + size_t u1 = 0; unsigned char b; + /* Handle any initial misaligned bytes. */ - for (s = _s; (uintptr_t)(s) & (sizeof(size_t) - 1); s++) { + for (s = _s; (uintptr_t)(s) & SIZET1; s++) { b = *s; /* Exit if we hit a zero byte. */ @@ -51,10 +69,13 @@ size_t utf8strlen(const char * _s) /* Is this byte NOT the first byte of a character? */ count += (b >> 7) & ((~b) >> 6); + + /* CORRECT */ + count -= (b & 0xf0) == 0xf0; } /* Handle complete blocks. */ - for (; ; s += sizeof(size_t)) { + for (; ; s += SIZET) { /* Prefetch 256 bytes ahead. */ __builtin_prefetch(&s[256], 0, 0); @@ -62,12 +83,19 @@ size_t utf8strlen(const char * _s) u = *(size_t *)(s); /* Exit the loop if there are any zero bytes. */ - if ((u - ONEMASK) & (~u) & ONEMASK8) break; + /* CORRECT */ + u1 = u & FMASK; + u1 = (u1 >> 7) & (u1 >> 6) & (u1 >> 5) & (u1 >> 4); + if (u1) count -= (u1 * ONEMASK) >> SBYTE; + /* Count bytes which are NOT the first byte of a character. 
*/ - u = ((u & (ONEMASK * 0x80)) >> 7) & ((~u) >> 6); - count += (u * ONEMASK) >> ((sizeof(size_t) - 1) * 8); + u = ((u & ONEMASK8) >> 7) & ((~u) >> 6); + + count += (u * ONEMASK) >> SBYTE; + } /* Take care of any left-over bytes. */ @@ -80,6 +108,9 @@ size_t utf8strlen(const char * _s) /* Is this byte NOT the first byte of a character? */ count += (b >> 7) & ((~b) >> 6); + + /* CORRECT */ + count -= (b & 0xf0) == 0xf0; } done: diff --git a/UnitTests/SPParserUtilsTest.m b/UnitTests/SPParserUtilsTest.m index 5066f8f7..994b166a 100644 --- a/UnitTests/SPParserUtilsTest.m +++ b/UnitTests/SPParserUtilsTest.m @@ -44,6 +44,9 @@ @implementation SPParserUtilsTest - (void)testUtf8strlen { + // NOTE!!: These tests do not verify that the utf8strlen() function works according to spec, + // but rather whether it produces the same results as NSString for the same input. + const char *empty = ""; NSString *emptyString = [NSString stringWithCString:empty encoding:NSUTF8StringEncoding]; STAssertEquals(utf8strlen(empty),[emptyString length], @"empty string"); @@ -52,8 +55,8 @@ // If any of those conditions fail, all of the following assumptions are moot. 
const char *charSeq = "\xF0\x9F\x8D\x8F"; //🍏 NSString *charString = [NSString stringWithCString:charSeq encoding:NSUTF8StringEncoding]; - STAssertEquals(strlen(charSeq), 4, @"assumption about storage for binary C string"); - STAssertEquals([charString length], 2, @"assumption about NSString internal storage of string"); + STAssertEquals(strlen(charSeq), (size_t)4, @"assumption about storage for binary C string"); + STAssertEquals([charString length], (NSUInteger)2, @"assumption about NSString internal storage of string"); const char *singleByteSeq = "Hello World!"; NSString *singleByteString = [NSString stringWithCString:singleByteSeq encoding:NSUTF8StringEncoding]; diff --git a/sequel-pro.xcodeproj/xcshareddata/xcschemes/Sequel Pro.xcscheme b/sequel-pro.xcodeproj/xcshareddata/xcschemes/Sequel Pro.xcscheme index e6b89ad7..579336f6 100644 --- a/sequel-pro.xcodeproj/xcshareddata/xcschemes/Sequel Pro.xcscheme +++ b/sequel-pro.xcodeproj/xcshareddata/xcschemes/Sequel Pro.xcscheme @@ -25,7 +25,7 @@ <TestAction selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB" selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB" - shouldUseLaunchSchemeArgsEnv = "YES" + shouldUseLaunchSchemeArgsEnv = "NO" buildConfiguration = "Debug"> <Testables> <TestableReference @@ -48,6 +48,13 @@ ReferencedContainer = "container:sequel-pro.xcodeproj"> </BuildableReference> </MacroExpansion> + <EnvironmentVariables> + <EnvironmentVariable + key = "OBJC_DISABLE_GC" + value = "YES" + isEnabled = "YES"> + </EnvironmentVariable> + </EnvironmentVariables> </TestAction> <LaunchAction selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB" |