3 files changed, 51 insertions, 10 deletions
diff --git a/Source/SPParserUtils.c b/Source/SPParserUtils.c
index b3b48945..69929cee 100644
--- a/Source/SPParserUtils.c
+++ b/Source/SPParserUtils.c
@@ -31,18 +31,36 @@
 #include "SPParserUtils.h"
 #include <stdint.h>
 
+#define SIZET (sizeof(size_t))  /* number of bytes in one size_t word */
+#define SIZET1 (SIZET - 1)  /* low-bit mask used for the word-alignment test on the pointer */
+#define SBYTE (SIZET1 * 8)  /* right-shift that brings the top byte of a word down to the bottom */
+
 #define ONEMASK ((size_t)(-1) / 0xFF)
+#define ONEMASK8 (ONEMASK * 0x80)  /* 0x80 replicated into every byte of a word */
+#define FMASK ((size_t)(-1)*(ONEMASK*0xf)-1)  /* evaluates to 0xF0 in every byte, i.e. -(0x0F0F...0F) - 1 == 0xF0F0...F0 */
 
 // adapted from http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html
 size_t utf8strlen(const char * _s)
 {
+	
+	/* Due to [NSString length] behaviour for chars > 0xFFFF {length = 2},
+	 "correct" the variable 'count' by subtracting the number of
+	 occurrences of 4-byte UTF-8 start bytes (high nibble 0xF, e.g. 0xF0).
+	 Here we assume that only up to 4-byte UTF-8 chars
+	 are allowed [latest UTF-8 specification].
+	 
+	 Marked in the source code by "CORRECT".
+	 */
+	
 	const char * s;
-	size_t count = 0;
-	size_t u;
+	long count = 0;
+	size_t u = 0;
+	size_t u1 = 0;
 	unsigned char b;
 	
+	
 	/* Handle any initial misaligned bytes. */
-	for (s = _s; (uintptr_t)(s) & (sizeof(size_t) - 1); s++) {
+	for (s = _s; (uintptr_t)(s) & SIZET1; s++) {
 		b = *s;
 		
 		/* Exit if we hit a zero byte. */
@@ -51,10 +69,13 @@ size_t utf8strlen(const char * _s)
 		
 		/* Is this byte NOT the first byte of a character? */
 		count += (b >> 7) & ((~b) >> 6);
+		
+		/* CORRECT */
+		count -= (b & 0xf0) == 0xf0;
 	}
 	
 	/* Handle complete blocks. */
-	for (; ; s += sizeof(size_t)) {
+	for (; ; s += SIZET) {
 		/* Prefetch 256 bytes ahead. */
 		__builtin_prefetch(&s[256], 0, 0);
 		
@@ -62,12 +83,19 @@ size_t utf8strlen(const char * _s)
 		u = *(size_t *)(s);
 		
 		/* Exit the loop if there are any zero bytes. */
-		if ((u - ONEMASK) & (~u) & (ONEMASK * 0x80))
+		if ((u - ONEMASK) & (~u) & ONEMASK8)
 			break;
 		
+		/* CORRECT */
+		u1 = u & FMASK;
+		u1 = (u1 >> 7) & (u1 >> 6) & (u1 >> 5) & (u1 >> 4);
+		if (u1) count -= (u1 * ONEMASK) >> SBYTE;
+		
 		/* Count bytes which are NOT the first byte of a character. */
-		u = ((u & (ONEMASK * 0x80)) >> 7) & ((~u) >> 6);
-		count += (u * ONEMASK) >> ((sizeof(size_t) - 1) * 8);
+		u = ((u & ONEMASK8) >> 7) & ((~u) >> 6);
+		
+		count += (u * ONEMASK) >> SBYTE;
+		
 	}
 	
 	/* Take care of any left-over bytes. */
@@ -80,6 +108,9 @@ size_t utf8strlen(const char * _s)
 		
 		/* Is this byte NOT the first byte of a character? */
 		count += (b >> 7) & ((~b) >> 6);
+		
+		/* CORRECT */
+		count -= (b & 0xf0) == 0xf0;
 	}
 	
 done:
diff --git a/UnitTests/SPParserUtilsTest.m b/UnitTests/SPParserUtilsTest.m
index 5066f8f7..994b166a 100644
--- a/UnitTests/SPParserUtilsTest.m
+++ b/UnitTests/SPParserUtilsTest.m
@@ -44,6 +44,9 @@
 @implementation SPParserUtilsTest
 
 - (void)testUtf8strlen {
+	// NOTE!!: These tests do not verify that the utf8strlen() function works according to spec,
+	//         only that it produces the same results as NSString for the same input.
+	
 	const char *empty = "";
 	NSString *emptyString = [NSString stringWithCString:empty encoding:NSUTF8StringEncoding];
 	STAssertEquals(utf8strlen(empty),[emptyString length], @"empty string");
@@ -52,8 +55,8 @@
 	// If any of those conditions fail, all of the following assumptions are moot.
 	const char *charSeq = "\xF0\x9F\x8D\x8F"; //🍏
 	NSString *charString = [NSString stringWithCString:charSeq encoding:NSUTF8StringEncoding];
-	STAssertEquals(strlen(charSeq),     4, @"assumption about storage for binary C string");
-	STAssertEquals([charString length], 2, @"assumption about NSString internal storage of string");
+	STAssertEquals(strlen(charSeq),     (size_t)4, @"assumption about storage for binary C string");
+	STAssertEquals([charString length], (NSUInteger)2, @"assumption about NSString internal storage of string");
 	
 	const char *singleByteSeq = "Hello World!";
 	NSString *singleByteString = [NSString stringWithCString:singleByteSeq encoding:NSUTF8StringEncoding];
diff --git a/sequel-pro.xcodeproj/xcshareddata/xcschemes/Sequel Pro.xcscheme b/sequel-pro.xcodeproj/xcshareddata/xcschemes/Sequel Pro.xcscheme
index e6b89ad7..579336f6 100644
--- a/sequel-pro.xcodeproj/xcshareddata/xcschemes/Sequel Pro.xcscheme
+++ b/sequel-pro.xcodeproj/xcshareddata/xcschemes/Sequel Pro.xcscheme
@@ -25,7 +25,7 @@
    <TestAction
       selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
       selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
-      shouldUseLaunchSchemeArgsEnv = "YES"
+      shouldUseLaunchSchemeArgsEnv = "NO"
       buildConfiguration = "Debug">
       <Testables>
          <TestableReference
@@ -48,6 +48,13 @@
             ReferencedContainer = "container:sequel-pro.xcodeproj">
          </BuildableReference>
       </MacroExpansion>
+      <EnvironmentVariables>
+         <EnvironmentVariable
+            key = "OBJC_DISABLE_GC"
+            value = "YES"
+            isEnabled = "YES">
+         </EnvironmentVariable>
+      </EnvironmentVariables>
    </TestAction>
    <LaunchAction
       selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"