From be4aac5c809f8f74c20f1d7b03a932e5ee0720df Mon Sep 17 00:00:00 2001 From: Bibiko Date: Thu, 14 May 2009 15:50:53 +0000 Subject: =?UTF-8?q?=E2=80=A2=20added=20SPSQLTokenizer=20-=20this=20is=20an?= =?UTF-8?q?=20approach=20to=20make=20usage=20of=20lex=20to=20split=20a=20s?= =?UTF-8?q?tring=20very=20fast=20into=20SQL=20queries=20considering=20the?= =?UTF-8?q?=20"delimiter"=20switch=20and=20compound-statements=20via=20CRE?= =?UTF-8?q?ATE=20...=20BEGIN=20...=20END;=20without=20using=20"delimiter"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Source/SPSQLTokenizer.h | 32 ++++++++++++ Source/SPSQLTokenizer.l | 134 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+) create mode 100644 Source/SPSQLTokenizer.h create mode 100644 Source/SPSQLTokenizer.l (limited to 'Source') diff --git a/Source/SPSQLTokenizer.h b/Source/SPSQLTokenizer.h new file mode 100644 index 00000000..7f459440 --- /dev/null +++ b/Source/SPSQLTokenizer.h @@ -0,0 +1,32 @@ +// +// SPSQLTokenizer.h +// sequel-pro +// +// Created by Hans-J. Bibiko on May 14, 2009 +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// More info at + +#define SP_SQL_TOKEN_DOUBLE_QUOTED_TEXT 1 +#define SP_SQL_TOKEN_SINGLE_QUOTED_TEXT 2 +#define SP_SQL_TOKEN_COMMENT 3 +#define SP_SQL_TOKEN_BACKTICK_QUOTED_TEXT 4 +#define SP_SQL_TOKEN_DELIM_START 5 +#define SP_SQL_TOKEN_DELIM_VALUE 6 +#define SP_SQL_TOKEN_DELIM_END 7 +#define SP_SQL_TOKEN_WHITESPACE 8 +#define SP_SQL_TOKEN_SEMICOLON 9 +#define SP_SQL_TOKEN_COMPOUND 10 diff --git a/Source/SPSQLTokenizer.l b/Source/SPSQLTokenizer.l new file mode 100644 index 00000000..95d0f76c --- /dev/null +++ b/Source/SPSQLTokenizer.l @@ -0,0 +1,134 @@ +%{ + +/* + * SPSQLTokenizer.l + * sequel-pro + * + * Created by Hans-J. Bibiko on May 14, 2009 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * More info at + */ + +#import "SPSQLTokenizer.h" +int utf8strlenfortoken(const char * _s); +int yyuoffset, yyuleng; + +#define YY_NO_UNPUT + +//keep track of the current utf-8 character (not byte) offset and token length +#define YY_USER_ACTION { yyuoffset += yyuleng; yyuleng = utf8strlenfortoken(yytext); } +%} +%option prefix="to" +%option noyywrap +%option case-insensitive + +s [ \t\n\r] +dkey "delimiter" +scol ";" +dval [!-゚] +compound "create"(.|\n|\r)+?"begin"(.|\n|\r)+?{s}+"end"{s}*?{scol} +%x comment +%x delim +%x delimbody + +%% + +\"([^"\\]|\\(.|[\n\r]))*\"? { return SP_SQL_TOKEN_DOUBLE_QUOTED_TEXT; } +'([^'\\]|\\(.|[\n\r]))*'? { return SP_SQL_TOKEN_SINGLE_QUOTED_TEXT; } +`[^`]*`? { return SP_SQL_TOKEN_BACKTICK_QUOTED_TEXT; } + +"/*" { BEGIN(comment); return SP_SQL_TOKEN_COMMENT; } +[^*]* { return SP_SQL_TOKEN_COMMENT; } +"*"+ { return SP_SQL_TOKEN_COMMENT; } +"*"+"/" { BEGIN(INITIAL); return SP_SQL_TOKEN_COMMENT; } +#[^\n\r]*(\n|\r)? | +--[ \t][^\n\r]*(\n|\r)? { return SP_SQL_TOKEN_COMMENT; } + +{s}+ { return SP_SQL_TOKEN_WHITESPACE; } + +{dkey}{s}+ { BEGIN(delim); return SP_SQL_TOKEN_DELIM_START; } +{dval}+ { BEGIN(delimbody); return SP_SQL_TOKEN_DELIM_VALUE; } +{s}+{dkey}{s}+{scol} { BEGIN(INITIAL); return SP_SQL_TOKEN_DELIM_END; } + +{compound} { return SP_SQL_TOKEN_COMPOUND; } + +{scol} { return SP_SQL_TOKEN_SEMICOLON; } +[.\r\n] { return SP_SQL_TOKEN_WHITESPACE; } + + + + + +<> { + BEGIN(INITIAL); /* make sure we return to initial state when finished! 
/*
 * Count the UTF-8 characters (code points, not bytes) in a NUL-terminated
 * string.
 *
 * In UTF-8 every byte of the form 10xxxxxx is a continuation byte; every
 * other byte begins a new character. The result is therefore the number of
 * bytes before the terminator that are NOT continuation bytes.
 *
 * The input is not validated: stray continuation bytes are simply not
 * counted, and a truncated multi-byte sequence still counts as one
 * character for its lead byte.
 *
 * This replaces an earlier word-at-a-time variant that loaded the string
 * through a size_t pointer. That version broke the strict-aliasing rules,
 * could read past the terminating NUL, and relied on right-shifting a
 * negative int. A byte-wise scan gives the same count without any of that.
 *
 * _s       NUL-terminated byte string; NULL is treated as an empty string.
 * returns  the character count. The return type is int, so a count larger
 *          than INT_MAX cannot be represented.
 */
int utf8strlenfortoken(const char * _s)
{
	/* Inspect raw bytes as unsigned so the bit tests never see a negative value. */
	const unsigned char *p = (const unsigned char *)_s;
	size_t chars = 0;

	if (p == NULL) {
		return 0;
	}

	for (; *p != '\0'; p++) {
		/* 0xC0 selects the top two bits; 0x80 (binary 10......) marks a continuation byte. */
		if ((*p & 0xC0) != 0x80) {
			chars++;
		}
	}

	return (int)chars;
}