aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMax <post@wickenrode.com>2015-03-07 20:48:55 +0100
committerMax <post@wickenrode.com>2015-03-07 20:48:55 +0100
commit96b765ffbcb6c7fda058fbe8028b2e128007134a (patch)
treea4ac6c9fb75b5b810f7c42f1d159937caa6c2dda
parented38f229a052a678d3a5022afd3806b1c3b434cf (diff)
downloadsequelpro-96b765ffbcb6c7fda058fbe8028b2e128007134a.tar.gz
sequelpro-96b765ffbcb6c7fda058fbe8028b2e128007134a.tar.bz2
sequelpro-96b765ffbcb6c7fda058fbe8028b2e128007134a.zip
Added an internal algorithm for fuzzy string matching
* This works similar to a regex matching "abc" as /a.*b.*c/ (ie. all characters of $needle need to be contained in $haystack in the correct order but not neccesarily consecutive). Additionaly some unicode equivalencies are handled. * Changed a tiny helper function from ObjC to plain C
-rw-r--r--Source/SPStringAdditions.h19
-rw-r--r--Source/SPStringAdditions.m101
-rw-r--r--UnitTests/SPStringAdditionsTests.h1
-rw-r--r--UnitTests/SPStringAdditionsTests.m69
4 files changed, 179 insertions, 11 deletions
diff --git a/Source/SPStringAdditions.h b/Source/SPStringAdditions.h
index 3296326f..c6871e2a 100644
--- a/Source/SPStringAdditions.h
+++ b/Source/SPStringAdditions.h
@@ -81,4 +81,23 @@ static inline id NSMutableAttributedStringAttributeAtIndex(NSMutableAttributedSt
- (CGFloat)levenshteinDistanceWithWord:(NSString *)stringB;
+/**
+ * Checks if the string other is contained in self on a per-character basis.
+ * In regex-speak that would mean "abc" is matched as /a.*b.*c/ (not anchored).
+ * This is a SEARCH function, NOT a MATCHING function!
+ * Namely the following options will be applied when matching:
+ * NSCaseInsensitiveSearch|NSDiacriticInsensitiveSearch|NSDiacriticInsensitiveSearch
+ * Additionaly this method might match even when it should not.
+ *
+ * @param other String to match against self
+ * @param submatches Pass the pointer to a variable that will be set to an NSArray *
+ * of NSNumber *s of NSRanges. This will only be the case if
+ * the method also returns YES. The variable will not be modified
+ * otherwise.
+ * Pass NULL if you don't care for the ranges.
+ * The object will be set to autorelase.
+ * @return YES if self contains all characters from other in the order given in other
+ * @warning This method is NOT thread-safe (probably), NOT constant-time and DOES NOT check binary equivalence
+ */
+- (BOOL)nonConsecutivelySearchString:(NSString *)other matchingRanges:(NSArray **)submatches;
@end
diff --git a/Source/SPStringAdditions.m b/Source/SPStringAdditions.m
index 29b4144d..0c623af7 100644
--- a/Source/SPStringAdditions.m
+++ b/Source/SPStringAdditions.m
@@ -31,11 +31,7 @@
#import "SPStringAdditions.h"
#import "RegexKitLite.h"
-@interface NSString (PrivateAPI)
-
-- (NSInteger)_smallestOf:(NSInteger)a andOf:(NSInteger)b andOf:(NSInteger)c;
-
-@end
+static NSInteger _smallestOf(NSInteger a, NSInteger b, NSInteger c);
@implementation NSString (SPStringAdditions)
@@ -426,9 +422,9 @@
cost = ([stringA characterAtIndex:i - 1] == [stringB characterAtIndex:j - 1]) ? 0 : 1;
d[j * n + i] =
- [self _smallestOf:d[(j - 1) * n + i] + 1
- andOf:d[j * n + i - 1] + 1
- andOf:d[(j - 1) * n + i -1] + cost];
+ _smallestOf(d[(j - 1) * n + i] + 1,
+ d[j * n + i - 1] + 1,
+ d[(j - 1) * n + i -1] + cost);
}
distance = d[n * m - 1];
@@ -468,10 +464,95 @@
}
}
+- (BOOL)nonConsecutivelySearchString:(NSString *)other matchingRanges:(NSArray **)submatches
+{
+ NSStringCompareOptions opts = NSCaseInsensitiveSearch|NSDiacriticInsensitiveSearch|NSWidthInsensitiveSearch;
+ BOOL recordMatches = (submatches != NULL); //for readability
+ NSRange selfRange = NSMakeRange(0, [self length]);
+
+ //shortcut: * a longer string can never be contained in a shorter one.
+ // * nil can never match in a string.
+ // * an empty other can never match if self is non-empty
+ if(([other length] > [self length]) || (!other) || ([self length] && ![other length]))
+ return NO;
+
+ //handle the simple case via the default algorithm
+ if ([self compare:other options:opts] == NSOrderedSame) {
+ if(recordMatches) {
+ *submatches = [NSArray arrayWithObject:[NSValue valueWithRange:selfRange]];
+ }
+ return YES;
+ }
+
+ // for now let's save the overhead of NSArray and NSValues
+ NSRange *tmpMatchStore = recordMatches? calloc([other length], sizeof(NSRange)) : NULL;
+ __block NSUInteger matchCount = 0;
+ __block NSRange searchRange = selfRange;
+
+ //this looks a bit silly but is basically Apples approach to handling multibyte charsets
+ void (^it)(NSString *,NSRange,NSRange,BOOL *) = ^(NSString *substring,NSRange substringRange,NSRange enclosingRange,BOOL *stop) {
+ //look for the current character of other in the remaining part of self
+ NSRange found = [self rangeOfString:substring options:opts range:searchRange];
+ if(found.location == NSNotFound) {
+ matchCount = 0; //reset match count to "no match"
+ *stop = YES;
+ return;
+ }
+ if(recordMatches)
+ tmpMatchStore[matchCount] = found;
+ matchCount++;
+ //move the next search past the current match
+ searchRange.location = found.location + found.length;
+ searchRange.length = [self length] - searchRange.location;
+ };
+
+ [other enumerateSubstringsInRange:NSMakeRange(0, [other length])
+ options:NSStringEnumerationByComposedCharacterSequences
+ usingBlock:it];
+
+ if(matchCount && recordMatches) {
+ //we want to re-combine sub-matches that are actually consecutive
+
+ //This algorithm uses a simple look-behind for merges:
+ // Object 1 checks if it continues object 0. If so, 1 and 0 will merge
+ // and be placed in the slot of 0 (slot 1 is now invalid).
+ // Then object 2 checks if it continues object 0. If it does not, it is
+ // placed in slot 1.
+ // Object 3 then checks if it continues object 1 and so on...
+ NSUInteger mergeTarget = 0;
+ for (NSUInteger i = 1; i < matchCount; i++ ) {
+ NSRange prev = tmpMatchStore[mergeTarget];
+ NSRange this = tmpMatchStore[i];
+ //if the previous match ends right before this match begins we can merge them
+ if(prev.location + prev.length == this.location) {
+ NSRange mergedRange = NSMakeRange(prev.location, prev.length+this.length);
+ tmpMatchStore[mergeTarget] = mergedRange;
+ }
+ //otherwise we have to move on to the next and make ourselves the new base
+ else {
+ if(++mergeTarget != i)
+ tmpMatchStore[mergeTarget] = this;
+ }
+ }
+
+ NSMutableArray *combinedArray = [NSMutableArray arrayWithCapacity:mergeTarget+1];
+ for (NSUInteger j = 0; j <= mergeTarget; j++) {
+ [combinedArray addObject:[NSValue valueWithRange:tmpMatchStore[j]]];
+ }
+
+ *submatches = combinedArray;
+ }
+
+ free(tmpMatchStore); // free(NULL) is safe as per OS X man page
+ return (matchCount > 0);
+}
+
+@end
+
/**
* Returns the minimum of a, b and c.
*/
-- (NSInteger)_smallestOf:(NSInteger)a andOf:(NSInteger)b andOf:(NSInteger)c
+static NSInteger _smallestOf(NSInteger a, NSInteger b, NSInteger c)
{
NSInteger min = a;
@@ -481,5 +562,3 @@
return min;
}
-
-@end
diff --git a/UnitTests/SPStringAdditionsTests.h b/UnitTests/SPStringAdditionsTests.h
index d6854dbe..56b5c6f9 100644
--- a/UnitTests/SPStringAdditionsTests.h
+++ b/UnitTests/SPStringAdditionsTests.h
@@ -35,5 +35,6 @@
- (void)testStringByRemovingCharactersInSet;
- (void)testStringWithNewUUID;
- (void)testCreateViewSyntaxPrettifier;
+- (void)testNonConsecutivelySearchStringMatchingRanges;
@end
diff --git a/UnitTests/SPStringAdditionsTests.m b/UnitTests/SPStringAdditionsTests.m
index af22df77..f455066f 100644
--- a/UnitTests/SPStringAdditionsTests.m
+++ b/UnitTests/SPStringAdditionsTests.m
@@ -86,4 +86,73 @@
STAssertEqualObjects([actualSyntax description], [expectedSyntax description], @"Actual view syntax '%@' does not equal expected syntax '%@'", actualSyntax, expectedSyntax);
}
+- (void)testNonConsecutivelySearchStringMatchingRanges
+{
+ //basic tests
+ {
+ NSArray *matches = nil;
+ STAssertTrue([@"" nonConsecutivelySearchString:@"" matchingRanges:&matches], @"Equality of empty strings");
+ STAssertTrue(([matches count] == 1) && NSEqualRanges(NSMakeRange(0, 0), [(NSValue *)[matches objectAtIndex:0] rangeValue]), @"Returned matches in empty string");
+ }
+
+ {
+ NSArray *matches = (void *)0xdeadbeef;
+ STAssertFalse([@"" nonConsecutivelySearchString:@"R" matchingRanges:&matches], @"Inequality with empty left side");
+ STAssertTrue((matches == (void *)0xdeadbeef), @"out variable not touched by mismatch");
+ }
+
+ STAssertFalse([@"L" nonConsecutivelySearchString:@"" matchingRanges:NULL], @"Inequality with empty right side");
+
+ {
+ NSArray *matches = nil;
+ STAssertTrue([@"left" nonConsecutivelySearchString:@"le" matchingRanges:&matches], @"Anchored match left");
+ STAssertTrue(([matches count] == 1) && NSEqualRanges(NSMakeRange(0, 2), [(NSValue *)[matches objectAtIndex:0] rangeValue]), @"Returned matches in anchored left match");
+ }
+
+ {
+ NSArray *matches = nil;
+ STAssertTrue([@"right" nonConsecutivelySearchString:@"ht" matchingRanges:&matches], @"Anchored match right");
+ STAssertTrue(([matches count] == 1) && NSEqualRanges(NSMakeRange(3, 2), [(NSValue *)[matches objectAtIndex:0] rangeValue]), @"Returned matches in anchroed right match");
+ }
+
+ STAssertFalse([@"ht" nonConsecutivelySearchString:@"right" matchingRanges:NULL], @"Left and Right are not commutative");
+
+ //real tests
+ {
+ NSArray *matches = nil;
+ STAssertTrue([@"... is not secure anymore!" nonConsecutivelySearchString:@"NSA" matchingRanges:&matches], @"Non-consecutive match, ignoring case");
+ STAssertTrue(([matches count] == 3) &&
+ (NSEqualRanges(NSMakeRange(7, 1), [(NSValue *)[matches objectAtIndex:0] rangeValue])) &&
+ (NSEqualRanges(NSMakeRange(11, 1), [(NSValue *)[matches objectAtIndex:1] rangeValue])) &&
+ (NSEqualRanges(NSMakeRange(18, 1), [(NSValue *)[matches objectAtIndex:2] rangeValue])), @"Returned matches in non-consecutive string");
+ }
+
+ STAssertFalse([@"Deoxyribonucleic Acid" nonConsecutivelySearchString:@"DNS" matchingRanges:NULL], @"Non-consecutive mismatch");
+
+ {
+ NSArray *matches = nil;
+ STAssertTrue([@"Turn left, then right at the corner" nonConsecutivelySearchString:@"left right" matchingRanges:&matches], @"Partly consecutive match");
+ STAssertTrue(([matches count] == 3) &&
+ (NSEqualRanges(NSMakeRange(5, 4), [(NSValue *)[matches objectAtIndex:0] rangeValue])) &&
+ (NSEqualRanges(NSMakeRange(10, 1), [(NSValue *)[matches objectAtIndex:1] rangeValue])) &&
+ (NSEqualRanges(NSMakeRange(16, 5), [(NSValue *)[matches objectAtIndex:2] rangeValue])), @"Returned matches in partly-consecutive string");
+ }
+
+ //advanced tests
+
+ // LATIN CAPITAL LETTER A == LATIN SMALL LETTER A
+ // LATIN SMALL LETTER O WITH DIAERESIS == LATIN SMALL LETTER O
+ // FULLWIDTH LATIN SMALL LETTER b == LATIN SMALL LETTER B
+ STAssertTrue([@"A:\xC3\xB6:\xEF\xBD\x82" nonConsecutivelySearchString:@"aob" matchingRanges:NULL], @"Fuzzy matching of defined characters");
+
+ //all bytes on the right are contained on the left, but on a character level "ä" is not contained in "Hütte Ф"
+ STAssertFalse([@"H\xC3\xBCtte \xD0\xA4" nonConsecutivelySearchString:@"\xC3\xA4" matchingRanges:NULL], @"Mismatch of composed characters with same prefix");
+
+ // ":😥:𠘄:" vs "😄" (according to wikipedia "𠘄" is the arachic variant of "印")
+ // TECHNICALLY THIS SHOULD NOT MATCH!
+ // However Apple doesn't correctly handle characters in the 4-Byte UTF range, so let's use this test to check for changes in Apples behaviour :)
+ STAssertTrue([@":\xF0\x9F\x98\x84:\xF0\xA0\x98\x84:" nonConsecutivelySearchString:@"\xF0\x9F\x98\x84" matchingRanges:NULL], @"Mismatch of composed characters (4-byte) with same prefix");
+
+}
+
@end