aboutsummaryrefslogtreecommitdiffstats
path: root/Source/SPSQLTokenizer.l
diff options
context:
space:
mode:
authorBibiko <bibiko@eva.mpg.de>2009-05-14 15:50:53 +0000
committerBibiko <bibiko@eva.mpg.de>2009-05-14 15:50:53 +0000
commitbe4aac5c809f8f74c20f1d7b03a932e5ee0720df (patch)
treefc60de88fa04fe613f16c3b78b1a6f53a1eb1ebb /Source/SPSQLTokenizer.l
parentb60ee8e3720b0dac888f5d542869712a6c16e409 (diff)
downloadsequelpro-be4aac5c809f8f74c20f1d7b03a932e5ee0720df.tar.gz
sequelpro-be4aac5c809f8f74c20f1d7b03a932e5ee0720df.tar.bz2
sequelpro-be4aac5c809f8f74c20f1d7b03a932e5ee0720df.zip
• added SPSQLTokenizer
- this is an approach that uses lex to split a string into SQL queries very quickly, taking into account the "delimiter" switch as well as compound statements (CREATE ... BEGIN ... END;) without requiring "delimiter"
Diffstat (limited to 'Source/SPSQLTokenizer.l')
-rw-r--r--Source/SPSQLTokenizer.l134
1 files changed, 134 insertions, 0 deletions
diff --git a/Source/SPSQLTokenizer.l b/Source/SPSQLTokenizer.l
new file mode 100644
index 00000000..95d0f76c
--- /dev/null
+++ b/Source/SPSQLTokenizer.l
@@ -0,0 +1,134 @@
+%{
+
+/*
+ * SPSQLTokenizer.l
+ * sequel-pro
+ *
+ * Created by Hans-J. Bibiko on May 14, 2009
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * More info at <http://code.google.com/p/sequel-pro/>
+ */
+
+#import "SPSQLTokenizer.h"
+int utf8strlenfortoken(const char * _s);
+int yyuoffset, yyuleng;
+
+#define YY_NO_UNPUT
+
+//keep track of the current utf-8 character (not byte) offset and token length
+#define YY_USER_ACTION { yyuoffset += yyuleng; yyuleng = utf8strlenfortoken(yytext); }
+%}
+%option prefix="to"
+%option noyywrap
+%option case-insensitive
+
+s [ \t\n\r]
+dkey "delimiter"
+scol ";"
+dval [!-゚]
+compound "create"(.|\n|\r)+?"begin"(.|\n|\r)+?{s}+"end"{s}*?{scol}
+%x comment
+%x delim
+%x delimbody
+
+%%
+
+ /* Rules section: each pattern returns a coarse token type that the
+    caller uses to split the input into individual SQL queries.
+    (Comments here are indented so flex treats them as rule-section
+    comments.) */
+
+ /* Quoted literals.  Backslash escapes are honoured inside double and
+    single quotes; the closing quote is optional so an unterminated
+    literal at end of input is still consumed as a single token. */
+\"([^"\\]|\\(.|[\n\r]))*\"? { return SP_SQL_TOKEN_DOUBLE_QUOTED_TEXT; }
+'([^'\\]|\\(.|[\n\r]))*'? { return SP_SQL_TOKEN_SINGLE_QUOTED_TEXT; }
+`[^`]*`? { return SP_SQL_TOKEN_BACKTICK_QUOTED_TEXT; }
+
+ /* C-style comments: switch to the exclusive <comment> state on the
+    opening marker and keep emitting COMMENT tokens until the closing
+    star-slash sequence returns us to INITIAL. */
+"/*" { BEGIN(comment); return SP_SQL_TOKEN_COMMENT; }
+<comment>[^*]* { return SP_SQL_TOKEN_COMMENT; }
+<comment>"*"+ { return SP_SQL_TOKEN_COMMENT; }
+<comment>"*"+"/" { BEGIN(INITIAL); return SP_SQL_TOKEN_COMMENT; }
+ /* Single-line comments: "#..." and "-- ..." up to (and including)
+    the line break; both share one action via the '|' continuation. */
+#[^\n\r]*(\n|\r)? |
+--[ \t][^\n\r]*(\n|\r)? { return SP_SQL_TOKEN_COMMENT; }
+
+ /* A run of whitespace collapses into one WHITESPACE token. */
+{s}+ { return SP_SQL_TOKEN_WHITESPACE; }
+
+ /* DELIMITER support: after the keyword, <delim> captures the new
+    delimiter string, then <delimbody> persists until a matching
+    "delimiter ;" restores the default.  NOTE(review): {dval} is a raw
+    byte range — confirm the intended upper bound of that range. */
+{dkey}{s}+ { BEGIN(delim); return SP_SQL_TOKEN_DELIM_START; }
+<delim>{dval}+ { BEGIN(delimbody); return SP_SQL_TOKEN_DELIM_VALUE; }
+<delimbody>{s}+{dkey}{s}+{scol} { BEGIN(INITIAL); return SP_SQL_TOKEN_DELIM_END; }
+
+ /* CREATE ... BEGIN ... END; blocks are matched as one COMPOUND token
+    so the semicolons inside them do not split the statement. */
+{compound} { return SP_SQL_TOKEN_COMPOUND; }
+
+{scol} { return SP_SQL_TOKEN_SEMICOLON; }
+ /* NOTE(review): inside a character class '.' is a literal dot, so
+    this rule matches only '.', CR and LF; any other stray character
+    falls through to flex's default ECHO rule.  If a catch-all was
+    intended, the pattern should be ".|\n|\r" — confirm. */
+[.\r\n] { return SP_SQL_TOKEN_WHITESPACE; }
+
+
+
+
+
+ /* End of input: reset the start condition and free the scan buffer
+    so the tokenizer can be reused on the next string. */
+<<EOF>> {
+	BEGIN(INITIAL); /* make sure we return to initial state when finished! */
+	yy_delete_buffer(YY_CURRENT_BUFFER);
+	return 0;
+}
+%%
+#define ONEMASK ((size_t)(-1) / 0xFF) /* 0x0101...01 — one 0x01 in every byte of a size_t */
+// adapted from http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html
+/*
+ * Return the number of UTF-8 characters (code points) in the
+ * NUL-terminated byte string _s.  Used by YY_USER_ACTION above to keep
+ * yyuoffset/yyuleng in characters rather than bytes.
+ *
+ * Strategy: count the UTF-8 continuation bytes (bytes of the form
+ * 10xxxxxx) and subtract them from the total byte length; the middle
+ * loop does this one machine word at a time.
+ *
+ * NOTE(review): the word loop may read up to sizeof(size_t)-1 bytes
+ * past the terminator.  The reads are word-aligned so they stay within
+ * the same page, but this is the usual not-strictly-portable fast
+ * strlen trick.  Assumes well-formed UTF-8 input — confirm callers
+ * guarantee that.
+ */
+int utf8strlenfortoken(const char * _s)
+{
+	const char * s;
+	size_t count = 0;
+	size_t u;
+	unsigned char b;
+
+	/* Handle any initial misaligned bytes. */
+	for (s = _s; (uintptr_t)(s) & (sizeof(size_t) - 1); s++) {
+		b = *s;
+
+		/* Exit if we hit a zero byte. */
+		if (b == '\0')
+			goto done;
+
+		/* Is this byte NOT the first byte of a character? */
+		/* 1 exactly when bit7 is set and bit6 is clear, i.e. 10xxxxxx. */
+		count += (b >> 7) & ((~b) >> 6);
+	}
+
+	/* Handle complete blocks. */
+	for (; ; s += sizeof(size_t)) {
+		/* Prefetch 256 bytes ahead. */
+		__builtin_prefetch(&s[256], 0, 0);
+
+		/* Grab 4 or 8 bytes of UTF-8 data. */
+		u = *(size_t *)(s);
+
+		/* Exit the loop if there are any zero bytes. */
+		/* Classic haszero(u) test: a borrow propagates into the high
+		   bit of any all-zero byte. */
+		if ((u - ONEMASK) & (~u) & (ONEMASK * 0x80))
+			break;
+
+		/* Count bytes which are NOT the first byte of a character. */
+		u = ((u & (ONEMASK * 0x80)) >> 7) & ((~u) >> 6);
+		/* u*ONEMASK sums the per-byte 0/1 flags into the most
+		   significant byte; the shift extracts that sum. */
+		count += (u * ONEMASK) >> ((sizeof(size_t) - 1) * 8);
+	}
+
+	/* Take care of any left-over bytes. */
+	for (; ; s++) {
+		b = *s;
+
+		/* Exit if we hit a zero byte. */
+		if (b == '\0')
+			break;
+
+		/* Is this byte NOT the first byte of a character? */
+		count += (b >> 7) & ((~b) >> 6);
+	}
+
+done:
+	/* bytes scanned minus continuation bytes = number of characters */
+	return ((s - _s) - count);
+}