diff options
author | rowanbeentje <rowan@beent.je> | 2009-09-28 00:43:41 +0000 |
---|---|---|
committer | rowanbeentje <rowan@beent.je> | 2009-09-28 00:43:41 +0000 |
commit | cc0c0a7842e3bff325fa29c71f5115361981797d (patch) | |
tree | 8c101a961ba86f1f04ddbba00ce76963d9c4f3e3 /Source/SPCSVParser.h | |
parent | 2183eeefefb81846c2cc2c6b4bf68b12167f2b24 (diff) | |
download | sequelpro-cc0c0a7842e3bff325fa29c71f5115361981797d.tar.gz sequelpro-cc0c0a7842e3bff325fa29c71f5115361981797d.tar.bz2 sequelpro-cc0c0a7842e3bff325fa29c71f5115361981797d.zip |
Rewrite CSV import:
- Replace the CSV parsing function (arrayForCSV:) with a new SPCSVParser class
- Make speed improvements to SPCSVParser to achieve 1.9x faster parsing than the old arrayForCSV: function
- Rewrite CSV imports to be performed as a streaming import, keeping memory usage much much lower
- CSV field mapping preview is now shown very early on in the import process, as soon as the first hundred rows are available for a preview
- Progress bars are more consistent and accurate
- CSV rows are grouped into batches of up to 50 (depending on line length) for import, falling back to one-query-per-row if errors occur. The current error reporting level is therefore maintained, but imports of non-erroring data are much much faster.
- Improve processing speed slightly
- Fix some odd edge cases in CSV parsing
This addresses issue #389.
Diffstat (limited to 'Source/SPCSVParser.h')
-rw-r--r-- | Source/SPCSVParser.h | 117 |
1 files changed, 117 insertions, 0 deletions
// diff --git a/Source/SPCSVParser.h b/Source/SPCSVParser.h
// new file mode 100644
// index 00000000..0f1a8a20
// --- /dev/null
// +++ b/Source/SPCSVParser.h
// @@ -0,0 +1,117 @@
//
//  $Id$
//
//  SPCSVParser.h
//  sequel-pro
//
//  Created by Rowan Beentje on 16/09/2009.
//  Copyright 2009 Rowan Beentje. All rights reserved.
//
//  This program is free software; you can redistribute it and/or modify
//  it under the terms of the GNU General Public License as published by
//  the Free Software Foundation; either version 2 of the License, or
//  (at your option) any later version.
//
//  This program is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//  GNU General Public License for more details.
//
//  You should have received a copy of the GNU General Public License
//  along with this program; if not, write to the Free Software
//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
//  More info at <http://code.google.com/p/sequel-pro/>

#import <Cocoa/Cocoa.h>

/*
 * SPCSVParser is a string-like class built for parsing CSV data.  Unlike
 * SPSQLParser it is not an NSMutableString subclass, and it offers only a
 * subset of comparable methods.  Internally an approach similar to NSScanner
 * is used so that multi-character delimiter strings are supported.
 *
 * The methods are designed so that parsed content is removed from the string
 * as parsing proceeds.  This also permits "streaming" use: parseable content
 * is pulled off the start of the string while additional content, e.g. read
 * from a file, is appended onto the end.
 *
 * Supported features:
 *  - Control of the field terminator, line terminator, string enclosure and
 *    escape strings, each of which may be more than one character long.
 *  - Stream-based processing.  When streaming, it is recommended that the
 *    appended strings are split at \n or \r to minimise multibyte issues.
 *  - Correct treatment of line terminators within quoted strings, and proper
 *    escape support, including Excel-style escape characters that match the
 *    quote characters.
 *
 * Because string range finding is used internally (similar to the NSScanner
 * approach), parsing with single-character terminators could be made
 * significantly faster.
 */

// NOTE(review): judging by the name, this is presumably the length threshold at
// which already-parsed content is trimmed from the string; its use is in the
// implementation file, which is not shown here - confirm.
#define SPCSVPARSER_TRIM_ENACT_LENGTH 250000

@interface SPCSVParser : NSObject {
    // NOTE(review): the roles suggested by the group comments below are inferred
    // from the variable names only - confirm against SPCSVParser.m.

    // The mutable string holding the CSV content.
    NSMutableString *csvString;

    // Position, length and field-count bookkeeping.
    long trimPosition;
    long parserPosition;
    long totalLengthParsed;
    long csvStringLength;
    int fieldCount;

    // Configured null replacement, delimiter, quote and escape strings, their
    // "escaped" counterparts and their lengths.
    NSString *nullReplacementString;
    NSString *fieldEndString;
    NSString *lineEndString;
    NSString *fieldQuoteString;
    NSString *escapeString;
    NSString *escapedFieldEndString;
    NSString *escapedLineEndString;
    NSString *escapedFieldQuoteString;
    NSString *escapedEscapeString;
    int fieldEndLength;
    int lineEndLength;
    int fieldQuoteLength;
    int escapeLength;
    NSCharacterSet *skipCharacterSet;
    NSScanner *csvScanner;

    // Presumably YES when the escape string and the field quote string are the
    // same (the Excel style mentioned in the class comment) - confirm.
    BOOL escapeStringIsFieldQuoteString;
}

/* Retrieving data from the CSV string */

// Presumably parses the whole string and returns all rows - confirm.
- (NSArray *)array;

// Presumably returns the next row as an array of field values - confirm.
- (NSArray *)getRowAsArray;

// As above, with two flags.  Presumably trimString controls whether parsed
// content is removed from the string, and stringComplete tells the parser
// whether more content may still be appended - confirm.
- (NSArray *)getRowAsArrayAndTrimString:(BOOL)trimString stringIsComplete:(BOOL)stringComplete;

/* Adding new data to the string */

// Takes further content for the string; the class comment describes content
// being appended onto the end while streaming.
- (void)appendString:(NSString *)aString;

// Presumably replaces the current content with aString - confirm.
- (void)setString:(NSString *)aString;

/* Basic information */

// Accessors; parserPosition and totalLengthParsed share their names with the
// instance variables declared above.
- (NSUInteger)length;
- (NSString *)string;
- (long)parserPosition;
- (long)totalLengthParsed;

/* Setting the terminator, quote, escape and null character replacement strings */

// Each of the first four setters takes the new string plus a convertString flag.
// NOTE(review): the flag presumably routes the string through
// _convertDisplayString: before it is stored - confirm.
- (void)setFieldTerminatorString:(NSString *)theString convertDisplayStrings:(BOOL)convertString;
- (void)setLineTerminatorString:(NSString *)theString convertDisplayStrings:(BOOL)convertString;
- (void)setFieldQuoteString:(NSString *)theString convertDisplayStrings:(BOOL)convertString;
- (void)setEscapeString:(NSString *)theString convertDisplayStrings:(BOOL)convertString;
- (void)setNullReplacementString:(NSString *)nullString;

/* Init and internal update methods */

// Underscore-prefixed methods: internal helpers, declared here alongside the
// public interface.
- (void)_initialiseCSVParserDefaults;
- (void)_moveParserPastSkippableCharacters;
- (long)_getDistanceToString:(NSString *)theString;
- (void)_updateState;
- (NSString *)_convertDisplayString:(NSString *)theString;
- (void)_updateSkipCharacterSet;

/* Initialisation and teardown */
#pragma mark -

// Initialisers: empty, from an existing string, or from the contents of the file
// at path read with the given encoding (error is an out-parameter).
- (id)init;
- (id)initWithString:(NSString *)aString;
- (id)initWithContentsOfFile:(NSString *)path encoding:(NSStringEncoding)enc error:(NSError **)error;
- (void)dealloc;

@end