From be4aac5c809f8f74c20f1d7b03a932e5ee0720df Mon Sep 17 00:00:00 2001
From: Bibiko <bibiko@eva.mpg.de>
Date: Thu, 14 May 2009 15:50:53 +0000
Subject: =?UTF-8?q?=E2=80=A2=20added=20SPSQLTokenizer=20-=20this=20is=20an?=
 =?UTF-8?q?=20approach=20to=20make=20usage=20of=20lex=20to=20split=20a=20s?=
 =?UTF-8?q?tring=20very=20fast=20into=20SQL=20queries=20considering=20the?=
 =?UTF-8?q?=20"delimiter"=20switch=20and=20compound-statements=20via=20CRE?=
 =?UTF-8?q?ATE=20...=20BEGIN=20...=20END;=20without=20using=20"delimiter"?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Source/SPSQLTokenizer.h |  32 ++++++++++++
 Source/SPSQLTokenizer.l | 134 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 166 insertions(+)
 create mode 100644 Source/SPSQLTokenizer.h
 create mode 100644 Source/SPSQLTokenizer.l

(limited to 'Source')

diff --git a/Source/SPSQLTokenizer.h b/Source/SPSQLTokenizer.h
new file mode 100644
index 00000000..7f459440
--- /dev/null
+++ b/Source/SPSQLTokenizer.h
@@ -0,0 +1,32 @@
+//
+//  SPSQLTokenizer.h
+//  sequel-pro
+//
+//  Created by Hans-J. Bibiko on May 14, 2009
+//
+//  This program is free software; you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation; either version 2 of the License, or
+//  (at your option) any later version.
+//
+//  This program is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with this program; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+//  More info at <http://code.google.com/p/sequel-pro/>
+
+#define SP_SQL_TOKEN_DOUBLE_QUOTED_TEXT   1  // "..." text; the closing quote is optional
+#define SP_SQL_TOKEN_SINGLE_QUOTED_TEXT   2  // '...' text; the closing quote is optional
+#define SP_SQL_TOKEN_COMMENT              3  // pieces of a /* ... */ comment, or a "#" / "-- " comment up to end of line
+#define SP_SQL_TOKEN_BACKTICK_QUOTED_TEXT 4  // `...` text; the closing backtick is optional
+#define SP_SQL_TOKEN_DELIM_START          5  // the "delimiter" keyword together with the whitespace after it
+#define SP_SQL_TOKEN_DELIM_VALUE          6  // the delimiter value that follows SP_SQL_TOKEN_DELIM_START
+#define SP_SQL_TOKEN_DELIM_END            7  // whitespace, "delimiter", whitespace and ";" closing a delimiter section
+#define SP_SQL_TOKEN_WHITESPACE           8  // run of spaces, tabs, CR or LF; also returned by the scanner's single-character fallback rule
+#define SP_SQL_TOKEN_SEMICOLON            9  // a single ";"
+#define SP_SQL_TOKEN_COMPOUND            10  // "create ... begin ... end;" matched as one token
diff --git a/Source/SPSQLTokenizer.l b/Source/SPSQLTokenizer.l
new file mode 100644
index 00000000..95d0f76c
--- /dev/null
+++ b/Source/SPSQLTokenizer.l
@@ -0,0 +1,134 @@
+%{
+
+/*
+ *  SPSQLTokenizer.l
+ *  sequel-pro
+ *
+ *  Created by Hans-J. Bibiko on May 14, 2009
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *  More info at <http://code.google.com/p/sequel-pro/>
+ */
+
+#import "SPSQLTokenizer.h"
+int utf8strlenfortoken(const char * _s);  // UTF-8 character count of a NUL-terminated string; defined after the rules section
+int yyuoffset, yyuleng;  // character (not byte) offset and length of the current token
+// NOTE(review): nothing in this file resets yyuoffset/yyuleng; presumably the caller zeroes them before each scan -- confirm.
+#define YY_NO_UNPUT
+
+// Ahead of each rule action: advance the offset by the previous token's length, then measure the new token in UTF-8 characters (not bytes).
+#define YY_USER_ACTION { yyuoffset += yyuleng; yyuleng = utf8strlenfortoken(yytext); }
+%}
+%option prefix="to"
+%option noyywrap
+%option case-insensitive
+	/* Named patterns: s = one whitespace character, dkey = the delimiter keyword, scol = semicolon, dval = one delimiter-value character, compound = create ... begin ... end; */
+s		[ \t\n\r]
+dkey	"delimiter"
+scol    ";"
+dval	[!-ﾟ]
+compound	"create"(.|\n|\r)+?"begin"(.|\n|\r)+?{s}+"end"{s}*?{scol}
+%x comment
+%x delim
+%x delimbody
+	/* NOTE(review): flex has no lazy quantifiers; the +? and *? in compound parse as an optional greedy repeat, so the longest match extends to the last end; in the input -- confirm intent. */
+%%
+
+	/* Quoted text. The closing quote is optional, so an unterminated string still comes back as a single token. */
+\"([^"\\]|\\(.|[\n\r]))*\"?		{ return SP_SQL_TOKEN_DOUBLE_QUOTED_TEXT;   }
+'([^'\\]|\\(.|[\n\r]))*'?		{ return SP_SQL_TOKEN_SINGLE_QUOTED_TEXT;   }
+`[^`]*`?						{ return SP_SQL_TOKEN_BACKTICK_QUOTED_TEXT; }
+
+	/* Comments: C-style ones are returned piecewise through the exclusive comment state; hash and dash-dash comments run to end of line. */
+"/*"							{ BEGIN(comment); return SP_SQL_TOKEN_COMMENT; }
+<comment>[^*]* 				    { return SP_SQL_TOKEN_COMMENT; }
+<comment>"*"+                   { return SP_SQL_TOKEN_COMMENT; }
+<comment>"*"+"/" 				{ BEGIN(INITIAL); return SP_SQL_TOKEN_COMMENT; }
+#[^\n\r]*(\n|\r)?			|
+--[ \t][^\n\r]*(\n|\r)?			{ return SP_SQL_TOKEN_COMMENT; }
+
+{s}+							{ return SP_SQL_TOKEN_WHITESPACE; }
+
+	/* The delimiter keyword enters delim, its value enters delimbody, and whitespace + delimiter + whitespace + semicolon returns to INITIAL. */
+{dkey}{s}+						{ BEGIN(delim); return SP_SQL_TOKEN_DELIM_START; }
+<delim>{dval}+					{ BEGIN(delimbody); return SP_SQL_TOKEN_DELIM_VALUE; }
+<delimbody>{s}+{dkey}{s}+{scol}	{ BEGIN(INITIAL); return SP_SQL_TOKEN_DELIM_END; }
+
+{compound}						{ return SP_SQL_TOKEN_COMPOUND; }
+{scol}							{ return SP_SQL_TOKEN_SEMICOLON; }
+	/* Fallbacks, spelled .|\n because a dot inside brackets is only a literal dot; without them unmatched input reaches the flex default rule, which echoes it to yyout. */
+.|\n							{ return SP_SQL_TOKEN_WHITESPACE; }
+<delim,delimbody>.|\n			{ /* consume silently: no token is reported for this input, only the echo is suppressed */ }
+
+<<EOF>>   						{
+                                    BEGIN(INITIAL);   /* make sure we return to initial state when finished! */
+            						yy_delete_buffer(YY_CURRENT_BUFFER);
+            						return 0;
+          						}
+%%
+#define ONEMASK ((size_t)(-1) / 0xFF)  // 0x01 repeated in every byte of a size_t
+// Returns the number of UTF-8 characters (bytes minus continuation bytes) in the NUL-terminated string _s; adapted from http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html
+int utf8strlenfortoken(const char * _s)
+{
+	const char * s;
+	size_t count = 0;
+	size_t u;
+	unsigned char b;
+	/* count accumulates continuation bytes (10xxxxxx); the result is the byte length minus count. */
+	/* Walk single bytes until s is aligned to sizeof(size_t). */
+	for (s = _s; (uintptr_t)(s) & (sizeof(size_t) - 1); s++) {
+		b = *s;
+
+		/* The string ended before alignment was reached. */
+		if (b == '\0')
+			goto done;
+
+		/* Adds 1 when bit 7 is set and bit 6 is clear, i.e. the byte is NOT the first byte of a character. */
+		count += (b >> 7) & ((~b) >> 6);
+	}
+
+	/* Process one aligned size_t word per iteration. */
+	for (; ; s += sizeof(size_t)) {
+		/* Prefetch 256 bytes ahead. */
+		__builtin_prefetch(&s[256], 0, 0);
+		/* NOTE(review): this word load can read bytes past the terminating NUL and type-puns char data as size_t -- confirm that is acceptable for the target toolchain. */
+		/* Grab sizeof(size_t) (4 or 8) bytes of UTF-8 data. */
+		u = *(size_t *)(s);
+
+		/* The expression is non-zero when any byte of u is zero; leave the terminator to the byte loop below. */
+		if ((u - ONEMASK) & (~u) & (ONEMASK * 0x80))
+			break;
+
+		/* Leave 0x01 in every byte that is a continuation byte, then sum them: the multiply collects the sum in the top byte, the shift extracts it. */
+		u = ((u & (ONEMASK * 0x80)) >> 7) & ((~u) >> 6);
+		count += (u * ONEMASK) >> ((sizeof(size_t) - 1) * 8);
+	}
+
+	/* Finish byte by byte up to the terminating NUL. */
+	for (; ; s++) {
+		b = *s;
+
+		/* Exit if we hit a zero byte. */
+		if (b == '\0')
+			break;
+
+		/* Same continuation-byte test as in the alignment loop. */
+		count += (b >> 7) & ((~b) >> 6);
+	}
+	/* s now points at the terminating NUL, so s - _s is the byte length; the difference is converted to int on return. */
+done:
+	return ((s - _s) - count);
+}
-- 
cgit v1.2.3