diff options
Diffstat (limited to 'Source/SPParserUtils.c')
-rw-r--r-- | Source/SPParserUtils.c | 118 |
1 files changed, 118 insertions, 0 deletions
diff --git a/Source/SPParserUtils.c b/Source/SPParserUtils.c new file mode 100644 index 00000000..69929cee --- /dev/null +++ b/Source/SPParserUtils.c @@ -0,0 +1,118 @@ +// +// SPParserUtils.c +// sequel-pro +// +// Created by Max Lohrmann on 27.01.15. +// Relocated from existing files. Previous copyright applies. +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
//
//  More info at <https://github.com/sequelpro/sequelpro>

/*
 * The project header is pulled in whenever it is reachable. Compilers that
 * provide __has_include may also build this translation unit on its own
 * (e.g. for a stand-alone unit test) when the header is not on the include
 * path; everything the code below needs comes from the standard headers.
 */
#if defined(__has_include)
#  if __has_include("SPParserUtils.h")
#    include "SPParserUtils.h"
#  endif
#else
#  include "SPParserUtils.h"
#endif
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Width of one machine word in bytes, and the matching alignment mask. */
#define SIZET    (sizeof(size_t))
#define SIZET1   (SIZET - 1)
/* Shift that moves the most significant byte of a word down to bit 0. */
#define SBYTE    (SIZET1 * 8)

/* 0x01 replicated into every byte of a word (0x0101...01). */
#define ONEMASK  ((size_t)(-1) / 0xFF)
/* 0x80 replicated into every byte of a word (0x8080...80). */
#define ONEMASK8 (ONEMASK * 0x80)
/* 0xF0 replicated into every byte of a word (0xF0F0...F0). */
#define FMASK    (ONEMASK * 0xF0)

/**
 * Length of a NUL-terminated UTF-8 string, measured the way
 * [NSString length] measures it: every character counts as 1, except
 * characters above U+FFFF (4-byte UTF-8 sequences), which count as 2.
 *
 * Adapted from
 * http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html
 *
 * The result is computed as
 *
 *     bytes - continuation bytes + 4-byte lead bytes
 *
 * where a continuation byte matches 10xxxxxx and a 4-byte lead byte is any
 * byte whose high nibble is 0xF. Only sequences of up to 4 bytes are
 * assumed to occur; the input is not validated.
 *
 * NOTE: once the pointer is word-aligned the string is scanned one
 * size_t at a time. The word that contains the terminator is loaded in
 * full, so up to sizeof(size_t) - 1 bytes located after the terminator
 * (but inside the same aligned word) are read and then ignored. Memory
 * checkers may report this.
 *
 * @param _s  NUL-terminated UTF-8 string; NULL is treated as empty.
 * @return    The length as described above; 0 for NULL or "".
 */
size_t utf8strlen(const char * _s)
{
	if (_s == NULL) {
		return 0;
	}

	const char *s = _s;
	size_t continuation = 0;   /* bytes of the form 10xxxxxx          */
	size_t four_byte = 0;      /* bytes whose high nibble is 0xF      */

	/* Handle leading bytes one at a time until s is word-aligned. */
	while (((uintptr_t)s & SIZET1) != 0 && *s != '\0') {
		unsigned char b = (unsigned char)*s++;

		continuation += (b & 0xC0) == 0x80;
		four_byte    += (b & 0xF0) == 0xF0;
	}

	/* Handle complete words. Skipped when the head loop already found
	   the terminator, because s may still be misaligned in that case. */
	if (*s != '\0') {
		for (;; s += SIZET) {
			size_t word;

#if defined(__GNUC__)
			/* Prefetch 256 bytes ahead. */
			__builtin_prefetch(&s[256], 0, 0);
#endif

			/* Grab 4 or 8 bytes of UTF-8 data. memcpy keeps the load
			   free of aliasing violations and compiles to one load. */
			memcpy(&word, s, sizeof word);

			/* Leave the loop if any byte of the word is zero. */
			if ((word - ONEMASK) & ~word & ONEMASK8) {
				break;
			}

			/* Per byte: bit 0 := bit7 & bit6 & bit5 & bit4. The mask
			   clears the low nibbles so that no bit of a neighbouring
			   byte can survive the four-way AND. */
			size_t lead = word & FMASK;
			lead = (lead >> 7) & (lead >> 6) & (lead >> 5) & (lead >> 4);

			/* Per byte: bit 0 := bit7 & ~bit6 (a continuation byte). */
			size_t cont = ((word & ONEMASK8) >> 7) & ((~word) >> 6);

			/* Multiplying by ONEMASK sums all per-byte flags into the
			   most significant byte; SBYTE shifts that sum down. */
			four_byte    += (lead * ONEMASK) >> SBYTE;
			continuation += (cont * ONEMASK) >> SBYTE;
		}
	}

	/* Take care of the bytes in front of the terminator. */
	for (; *s != '\0'; s++) {
		unsigned char b = (unsigned char)*s;

		continuation += (b & 0xC0) == 0x80;
		four_byte    += (b & 0xF0) == 0xF0;
	}

	/* Unsigned arithmetic: intermediate wrap-around is well defined and
	   cancels out in the final value. */
	return (size_t)(s - _s) - continuation + four_byte;
}