From cf2559d98a00699462bc32f3e19753a9ca547a9c Mon Sep 17 00:00:00 2001 From: Max Date: Tue, 27 Jan 2015 01:55:18 +0100 Subject: Move some duplicate code into it's own file --- Source/SPSQLTokenizer.l | 59 +++---------------------------------------------- 1 file changed, 3 insertions(+), 56 deletions(-) (limited to 'Source/SPSQLTokenizer.l') diff --git a/Source/SPSQLTokenizer.l b/Source/SPSQLTokenizer.l index 24dac938..b9170edb 100644 --- a/Source/SPSQLTokenizer.l +++ b/Source/SPSQLTokenizer.l @@ -31,12 +31,12 @@ // More info at #import "SPSQLTokenizer.h" +#include "SPParserUtils.h" -int utf8strlenfortoken(const char * _s); -int yyuoffset, yyuleng; +size_t yyuoffset, yyuleng; //keep track of the current utf-8 character (not byte) offset and token length -#define YY_USER_ACTION { yyuoffset += yyuleng; yyuleng = utf8strlenfortoken(yytext); } +#define YY_USER_ACTION { yyuoffset += yyuleng; yyuleng = utf8strlen(yytext); } //ignore the output of unmatched characters #define ECHO {} %} @@ -90,56 +90,3 @@ compend {s}"end" return 0; } %% -#define ONEMASK ((size_t)(-1) / 0xFF) -// adapted from http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html -int utf8strlenfortoken(const char * _s) -{ - const char * s; - size_t count = 0; - size_t u; - unsigned char b; - - /* Handle any initial misaligned bytes. */ - for (s = _s; (uintptr_t)(s) & (sizeof(size_t) - 1); s++) { - b = *s; - - /* Exit if we hit a zero byte. */ - if (b == '\0') - goto done; - - /* Is this byte NOT the first byte of a character? */ - count += (b >> 7) & ((~b) >> 6); - } - - /* Handle complete blocks. */ - for (; ; s += sizeof(size_t)) { - /* Prefetch 256 bytes ahead. */ - __builtin_prefetch(&s[256], 0, 0); - - /* Grab 4 or 8 bytes of UTF-8 data. */ - u = *(size_t *)(s); - - /* Exit the loop if there are any zero bytes. */ - if ((u - ONEMASK) & (~u) & (ONEMASK * 0x80)) - break; - - /* Count bytes which are NOT the first byte of a character. 
*/ - u = ((u & (ONEMASK * 0x80)) >> 7) & ((~u) >> 6); - count += (u * ONEMASK) >> ((sizeof(size_t) - 1) * 8); - } - - /* Take care of any left-over bytes. */ - for (; ; s++) { - b = *s; - - /* Exit if we hit a zero byte. */ - if (b == '\0') - break; - - /* Is this byte NOT the first byte of a character? */ - count += (b >> 7) & ((~b) >> 6); - } - -done: - return (int)((s - _s) - count); -} -- cgit v1.2.3 From c25bb060a197deecc79dac86e24df8246a87db71 Mon Sep 17 00:00:00 2001 From: Max Date: Sun, 1 Feb 2015 18:36:22 +0100 Subject: Update lexer to reflect that backticks can actually escape themselves (won't cause visible changes) --- Source/SPSQLTokenizer.l | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Source/SPSQLTokenizer.l') diff --git a/Source/SPSQLTokenizer.l b/Source/SPSQLTokenizer.l index b9170edb..f33d373d 100644 --- a/Source/SPSQLTokenizer.l +++ b/Source/SPSQLTokenizer.l @@ -63,7 +63,7 @@ compend {s}"end" \"([^"\\]|\\(.|[\n\r]))*\"? { ; } '([^'\\]|\\(.|[\n\r]))*'? { ; } -`[^`]*`? { ; } +`(``|[^`])*`? { ; } "/*" { BEGIN(comment); } [^*]* { ; } -- cgit v1.2.3 From 60a5d64518f09af80059e2e293849ff6b0d6be75 Mon Sep 17 00:00:00 2001 From: Max Date: Sun, 1 Feb 2015 20:19:52 +0100 Subject: Change lexer definition of high bytes. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit flex does not(*) support UTF-8, therefore alpha [a-z_\.À-゚] has always been interpreted by flex as alpha [a-z_\.\xC3\x80-\xEF\xBE\x9F] I assume this is not what was intended and the only reason it worked is because C3 (195), BE (190) and 9F (159) are already covered by 80-EF (128-239). Incidentally this range would also cover the whole Unicode BMP in UTF-8. This change should make it more obvious. (*) There were some patches in 2012 and 2014 but they don't seem to have been merged. 
--- Source/SPSQLTokenizer.l | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Source/SPSQLTokenizer.l') diff --git a/Source/SPSQLTokenizer.l b/Source/SPSQLTokenizer.l index f33d373d..5b08f312 100644 --- a/Source/SPSQLTokenizer.l +++ b/Source/SPSQLTokenizer.l @@ -50,7 +50,7 @@ size_t yyuoffset, yyuleng; s [ \t\n\r] dkey "delimiter" scol ";" -dval [!-゚] +dval [!-\x7E\x80-\xEF] compstart "begin"{s} compend {s}"end" %x comment -- cgit v1.2.3