From 96b765ffbcb6c7fda058fbe8028b2e128007134a Mon Sep 17 00:00:00 2001 From: Max Date: Sat, 7 Mar 2015 20:48:55 +0100 Subject: Added an internal algorithm for fuzzy string matching * This works similar to a regex matching "abc" as /a.*b.*c/ (ie. all characters of $needle need to be contained in $haystack in the correct order but not neccesarily consecutive). Additionaly some unicode equivalencies are handled. * Changed a tiny helper function from ObjC to plain C --- Source/SPStringAdditions.h | 19 +++++++ Source/SPStringAdditions.m | 101 +++++++++++++++++++++++++++++++++---- UnitTests/SPStringAdditionsTests.h | 1 + UnitTests/SPStringAdditionsTests.m | 69 +++++++++++++++++++++++++ 4 files changed, 179 insertions(+), 11 deletions(-) diff --git a/Source/SPStringAdditions.h b/Source/SPStringAdditions.h index 3296326f..c6871e2a 100644 --- a/Source/SPStringAdditions.h +++ b/Source/SPStringAdditions.h @@ -81,4 +81,23 @@ static inline id NSMutableAttributedStringAttributeAtIndex(NSMutableAttributedSt - (CGFloat)levenshteinDistanceWithWord:(NSString *)stringB; +/** + * Checks if the string other is contained in self on a per-character basis. + * In regex-speak that would mean "abc" is matched as /a.*b.*c/ (not anchored). + * This is a SEARCH function, NOT a MATCHING function! + * Namely the following options will be applied when matching: + * NSCaseInsensitiveSearch|NSDiacriticInsensitiveSearch|NSDiacriticInsensitiveSearch + * Additionaly this method might match even when it should not. + * + * @param other String to match against self + * @param submatches Pass the pointer to a variable that will be set to an NSArray * + * of NSNumber *s of NSRanges. This will only be the case if + * the method also returns YES. The variable will not be modified + * otherwise. + * Pass NULL if you don't care for the ranges. + * The object will be set to autorelase. + * @return YES if self contains all characters from other in the order given in other + * @warning This method is NOT thread-safe (probably), NOT constant-time and DOES NOT check binary equivalence + */ +- (BOOL)nonConsecutivelySearchString:(NSString *)other matchingRanges:(NSArray **)submatches; @end diff --git a/Source/SPStringAdditions.m b/Source/SPStringAdditions.m index 29b4144d..0c623af7 100644 --- a/Source/SPStringAdditions.m +++ b/Source/SPStringAdditions.m @@ -31,11 +31,7 @@ #import "SPStringAdditions.h" #import "RegexKitLite.h" -@interface NSString (PrivateAPI) - -- (NSInteger)_smallestOf:(NSInteger)a andOf:(NSInteger)b andOf:(NSInteger)c; - -@end +static NSInteger _smallestOf(NSInteger a, NSInteger b, NSInteger c); @implementation NSString (SPStringAdditions) @@ -426,9 +422,9 @@ cost = ([stringA characterAtIndex:i - 1] == [stringB characterAtIndex:j - 1]) ? 0 : 1; d[j * n + i] = - [self _smallestOf:d[(j - 1) * n + i] + 1 - andOf:d[j * n + i - 1] + 1 - andOf:d[(j - 1) * n + i -1] + cost]; + _smallestOf(d[(j - 1) * n + i] + 1, + d[j * n + i - 1] + 1, + d[(j - 1) * n + i -1] + cost); } distance = d[n * m - 1]; @@ -468,10 +464,95 @@ } } +- (BOOL)nonConsecutivelySearchString:(NSString *)other matchingRanges:(NSArray **)submatches +{ + NSStringCompareOptions opts = NSCaseInsensitiveSearch|NSDiacriticInsensitiveSearch|NSWidthInsensitiveSearch; + BOOL recordMatches = (submatches != NULL); //for readability + NSRange selfRange = NSMakeRange(0, [self length]); + + //shortcut: * a longer string can never be contained in a shorter one. + // * nil can never match in a string. + // * an empty other can never match if self is non-empty + if(([other length] > [self length]) || (!other) || ([self length] && ![other length])) + return NO; + + //handle the simple case via the default algorithm + if ([self compare:other options:opts] == NSOrderedSame) { + if(recordMatches) { + *submatches = [NSArray arrayWithObject:[NSValue valueWithRange:selfRange]]; + } + return YES; + } + + // for now let's save the overhead of NSArray and NSValues + NSRange *tmpMatchStore = recordMatches? calloc([other length], sizeof(NSRange)) : NULL; + __block NSUInteger matchCount = 0; + __block NSRange searchRange = selfRange; + + //this looks a bit silly but is basically Apples approach to handling multibyte charsets + void (^it)(NSString *,NSRange,NSRange,BOOL *) = ^(NSString *substring,NSRange substringRange,NSRange enclosingRange,BOOL *stop) { + //look for the current character of other in the remaining part of self + NSRange found = [self rangeOfString:substring options:opts range:searchRange]; + if(found.location == NSNotFound) { + matchCount = 0; //reset match count to "no match" + *stop = YES; + return; + } + if(recordMatches) + tmpMatchStore[matchCount] = found; + matchCount++; + //move the next search past the current match + searchRange.location = found.location + found.length; + searchRange.length = [self length] - searchRange.location; + }; + + [other enumerateSubstringsInRange:NSMakeRange(0, [other length]) + options:NSStringEnumerationByComposedCharacterSequences + usingBlock:it]; + + if(matchCount && recordMatches) { + //we want to re-combine sub-matches that are actually consecutive + + //This algorithm uses a simple look-behind for merges: + // Object 1 checks if it continues object 0. If so, 1 and 0 will merge + // and be placed in the slot of 0 (slot 1 is now invalid). + // Then object 2 checks if it continues object 0. If it does not, it is + // placed in slot 1. + // Object 3 then checks if it continues object 1 and so on... + NSUInteger mergeTarget = 0; + for (NSUInteger i = 1; i < matchCount; i++ ) { + NSRange prev = tmpMatchStore[mergeTarget]; + NSRange this = tmpMatchStore[i]; + //if the previous match ends right before this match begins we can merge them + if(prev.location + prev.length == this.location) { + NSRange mergedRange = NSMakeRange(prev.location, prev.length+this.length); + tmpMatchStore[mergeTarget] = mergedRange; + } + //otherwise we have to move on to the next and make ourselves the new base + else { + if(++mergeTarget != i) + tmpMatchStore[mergeTarget] = this; + } + } + + NSMutableArray *combinedArray = [NSMutableArray arrayWithCapacity:mergeTarget+1]; + for (NSUInteger j = 0; j <= mergeTarget; j++) { + [combinedArray addObject:[NSValue valueWithRange:tmpMatchStore[j]]]; + } + + *submatches = combinedArray; + } + + free(tmpMatchStore); // free(NULL) is safe as per OS X man page + return (matchCount > 0); +} + +@end + /** * Returns the minimum of a, b and c. */ -- (NSInteger)_smallestOf:(NSInteger)a andOf:(NSInteger)b andOf:(NSInteger)c +static NSInteger _smallestOf(NSInteger a, NSInteger b, NSInteger c) { NSInteger min = a; @@ -481,5 +562,3 @@ return min; } - -@end diff --git a/UnitTests/SPStringAdditionsTests.h b/UnitTests/SPStringAdditionsTests.h index d6854dbe..56b5c6f9 100644 --- a/UnitTests/SPStringAdditionsTests.h +++ b/UnitTests/SPStringAdditionsTests.h @@ -35,5 +35,6 @@ - (void)testStringByRemovingCharactersInSet; - (void)testStringWithNewUUID; - (void)testCreateViewSyntaxPrettifier; +- (void)testNonConsecutivelySearchStringMatchingRanges; @end diff --git a/UnitTests/SPStringAdditionsTests.m b/UnitTests/SPStringAdditionsTests.m index af22df77..f455066f 100644 --- a/UnitTests/SPStringAdditionsTests.m +++ b/UnitTests/SPStringAdditionsTests.m @@ -86,4 +86,73 @@ STAssertEqualObjects([actualSyntax description], [expectedSyntax description], @"Actual view syntax '%@' does not equal expected syntax '%@'", actualSyntax, expectedSyntax); } +- (void)testNonConsecutivelySearchStringMatchingRanges +{ + //basic tests + { + NSArray *matches = nil; + STAssertTrue([@"" nonConsecutivelySearchString:@"" matchingRanges:&matches], @"Equality of empty strings"); + STAssertTrue(([matches count] == 1) && NSEqualRanges(NSMakeRange(0, 0), [(NSValue *)[matches objectAtIndex:0] rangeValue]), @"Returned matches in empty string"); + } + + { + NSArray *matches = (void *)0xdeadbeef; + STAssertFalse([@"" nonConsecutivelySearchString:@"R" matchingRanges:&matches], @"Inequality with empty left side"); + STAssertTrue((matches == (void *)0xdeadbeef), @"out variable not touched by mismatch"); + } + + STAssertFalse([@"L" nonConsecutivelySearchString:@"" matchingRanges:NULL], @"Inequality with empty right side"); + + { + NSArray *matches = nil; + STAssertTrue([@"left" nonConsecutivelySearchString:@"le" matchingRanges:&matches], @"Anchored match left"); + STAssertTrue(([matches count] == 1) && NSEqualRanges(NSMakeRange(0, 2), [(NSValue *)[matches objectAtIndex:0] rangeValue]), @"Returned matches in anchored left match"); + } + + { + NSArray *matches = nil; + STAssertTrue([@"right" nonConsecutivelySearchString:@"ht" matchingRanges:&matches], @"Anchored match right"); + STAssertTrue(([matches count] == 1) && NSEqualRanges(NSMakeRange(3, 2), [(NSValue *)[matches objectAtIndex:0] rangeValue]), @"Returned matches in anchroed right match"); + } + + STAssertFalse([@"ht" nonConsecutivelySearchString:@"right" matchingRanges:NULL], @"Left and Right are not commutative"); + + //real tests + { + NSArray *matches = nil; + STAssertTrue([@"... is not secure anymore!" nonConsecutivelySearchString:@"NSA" matchingRanges:&matches], @"Non-consecutive match, ignoring case"); + STAssertTrue(([matches count] == 3) && + (NSEqualRanges(NSMakeRange(7, 1), [(NSValue *)[matches objectAtIndex:0] rangeValue])) && + (NSEqualRanges(NSMakeRange(11, 1), [(NSValue *)[matches objectAtIndex:1] rangeValue])) && + (NSEqualRanges(NSMakeRange(18, 1), [(NSValue *)[matches objectAtIndex:2] rangeValue])), @"Returned matches in non-consecutive string"); + } + + STAssertFalse([@"Deoxyribonucleic Acid" nonConsecutivelySearchString:@"DNS" matchingRanges:NULL], @"Non-consecutive mismatch"); + + { + NSArray *matches = nil; + STAssertTrue([@"Turn left, then right at the corner" nonConsecutivelySearchString:@"left right" matchingRanges:&matches], @"Partly consecutive match"); + STAssertTrue(([matches count] == 3) && + (NSEqualRanges(NSMakeRange(5, 4), [(NSValue *)[matches objectAtIndex:0] rangeValue])) && + (NSEqualRanges(NSMakeRange(10, 1), [(NSValue *)[matches objectAtIndex:1] rangeValue])) && + (NSEqualRanges(NSMakeRange(16, 5), [(NSValue *)[matches objectAtIndex:2] rangeValue])), @"Returned matches in partly-consecutive string"); + } + + //advanced tests + + // LATIN CAPITAL LETTER A == LATIN SMALL LETTER A + // LATIN SMALL LETTER O WITH DIAERESIS == LATIN SMALL LETTER O + // FULLWIDTH LATIN SMALL LETTER b == LATIN SMALL LETTER B + STAssertTrue([@"A:\xC3\xB6:\xEF\xBD\x82" nonConsecutivelySearchString:@"aob" matchingRanges:NULL], @"Fuzzy matching of defined characters"); + + //all bytes on the right are contained on the left, but on a character level "ä" is not contained in "Hütte Ф" + STAssertFalse([@"H\xC3\xBCtte \xD0\xA4" nonConsecutivelySearchString:@"\xC3\xA4" matchingRanges:NULL], @"Mismatch of composed characters with same prefix"); + + // ":😥:𠘄:" vs "😄" (according to wikipedia "𠘄" is the arachic variant of "印") + // TECHNICALLY THIS SHOULD NOT MATCH! + // However Apple doesn't correctly handle characters in the 4-Byte UTF range, so let's use this test to check for changes in Apples behaviour :) + STAssertTrue([@":\xF0\x9F\x98\x84:\xF0\xA0\x98\x84:" nonConsecutivelySearchString:@"\xF0\x9F\x98\x84" matchingRanges:NULL], @"Mismatch of composed characters (4-byte) with same prefix"); + +} + @end -- cgit v1.2.3