From cf2559d98a00699462bc32f3e19753a9ca547a9c Mon Sep 17 00:00:00 2001
From: Max <post@wickenrode.com>
Date: Tue, 27 Jan 2015 01:55:18 +0100
Subject: Move some duplicate code into its own file

---
 Source/SPSQLTokenizer.l | 59 +++----------------------------------------------
 1 file changed, 3 insertions(+), 56 deletions(-)

(limited to 'Source/SPSQLTokenizer.l')
diff --git a/Source/SPSQLTokenizer.l b/Source/SPSQLTokenizer.l
index 24dac938..b9170edb 100644
--- a/Source/SPSQLTokenizer.l
+++ b/Source/SPSQLTokenizer.l
@@ -31,12 +31,12 @@
 //  More info at <https://github.com/sequelpro/sequelpro>
 
 #import "SPSQLTokenizer.h"
+#include "SPParserUtils.h"
 
-int utf8strlenfortoken(const char * _s);
-int yyuoffset, yyuleng;
+size_t yyuoffset, yyuleng;
 
 //keep track of the current utf-8 character (not byte) offset and token length
-#define YY_USER_ACTION { yyuoffset += yyuleng; yyuleng = utf8strlenfortoken(yytext); }
+#define YY_USER_ACTION { yyuoffset += yyuleng; yyuleng = utf8strlen(yytext); }
 //ignore the output of unmatched characters
 #define ECHO {}
 %}
@@ -90,56 +90,3 @@ compend		{s}"end"
             						return 0;
           						}
 %%
-#define ONEMASK ((size_t)(-1) / 0xFF)
-// adapted from http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html
-int utf8strlenfortoken(const char * _s)
-{
-	const char * s;
-	size_t count = 0;
-	size_t u;
-	unsigned char b;
-
-	/* Handle any initial misaligned bytes. */
-	for (s = _s; (uintptr_t)(s) & (sizeof(size_t) - 1); s++) {
-		b = *s;
-
-		/* Exit if we hit a zero byte. */
-		if (b == '\0')
-			goto done;
-
-		/* Is this byte NOT the first byte of a character? */
-		count += (b >> 7) & ((~b) >> 6);
-	}
-
-	/* Handle complete blocks. */
-	for (; ; s += sizeof(size_t)) {
-		/* Prefetch 256 bytes ahead. */
-		__builtin_prefetch(&s[256], 0, 0);
-
-		/* Grab 4 or 8 bytes of UTF-8 data. */
-		u = *(size_t *)(s);
-
-		/* Exit the loop if there are any zero bytes. */
-		if ((u - ONEMASK) & (~u) & (ONEMASK * 0x80))
-			break;
-
-		/* Count bytes which are NOT the first byte of a character. */
-		u = ((u & (ONEMASK * 0x80)) >> 7) & ((~u) >> 6);
-		count += (u * ONEMASK) >> ((sizeof(size_t) - 1) * 8);
-	}
-
-	/* Take care of any left-over bytes. */
-	for (; ; s++) {
-		b = *s;
-
-		/* Exit if we hit a zero byte. */
-		if (b == '\0')
-			break;
-
-		/* Is this byte NOT the first byte of a character? */
-		count += (b >> 7) & ((~b) >> 6);
-	}
-
-done:
-	return (int)((s - _s) - count);
-}
-- 
cgit v1.2.3


From c25bb060a197deecc79dac86e24df8246a87db71 Mon Sep 17 00:00:00 2001
From: Max <post@wickenrode.com>
Date: Sun, 1 Feb 2015 18:36:22 +0100
Subject: Update lexer to reflect that backticks can actually escape themselves
 (won't cause visible changes)

---
 Source/SPSQLTokenizer.l | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'Source/SPSQLTokenizer.l')

diff --git a/Source/SPSQLTokenizer.l b/Source/SPSQLTokenizer.l
index b9170edb..f33d373d 100644
--- a/Source/SPSQLTokenizer.l
+++ b/Source/SPSQLTokenizer.l
@@ -63,7 +63,7 @@ compend		{s}"end"
 
 \"([^"\\]|\\(.|[\n\r]))*\"?			{ ; }
 '([^'\\]|\\(.|[\n\r]))*'?			{ ; }
-`[^`]*`?							{ ; }
+`(``|[^`])*`?						{ ; }
 
 "/*"								{ BEGIN(comment); }
 <comment>[^*]* 						{ ; }
-- 
cgit v1.2.3


From 60a5d64518f09af80059e2e293849ff6b0d6be75 Mon Sep 17 00:00:00 2001
From: Max <post@wickenrode.com>
Date: Sun, 1 Feb 2015 20:19:52 +0100
Subject: Change lexer definition of high bytes.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

flex does not(*) support UTF-8, therefore
alpha	[a-z_\.À-ﾟ]
has always been interpreted by flex as
alpha	[a-z_\.\xC3\x80-\xEF\xBE\x9F]

I assume this is not what was intended, and the only reason it worked is that C3 (195), BE (190) and 9F (159) are already covered by 80-EF (128-239). Incidentally, this range would also cover the whole Unicode BMP in UTF-8.
This change should make it more obvious.

(*) There were some patches in 2012 and 2014 but they don't seem to have been merged.
---
 Source/SPSQLTokenizer.l | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'Source/SPSQLTokenizer.l')

diff --git a/Source/SPSQLTokenizer.l b/Source/SPSQLTokenizer.l
index f33d373d..5b08f312 100644
--- a/Source/SPSQLTokenizer.l
+++ b/Source/SPSQLTokenizer.l
@@ -50,7 +50,7 @@ size_t yyuoffset, yyuleng;
 s			[ \t\n\r]
 dkey		"delimiter"
 scol		";"
-dval		[!-ﾟ]
+dval		[!-\x7E\x80-\xEF]
 compstart	"begin"{s}
 compend		{s}"end"
 %x comment
-- 
cgit v1.2.3