For UTF-16 and UTF-32 encoded files, the charset autodetection logic did not return a specific enough value, so Cocoa sometimes guessed the wrong byte order even when a BOM was present (part of #2860)

author: Max <post@wickenrode.com> 2017-09-06 00:42:48 +0200
committer: Max <post@wickenrode.com> 2017-09-06 00:50:13 +0200
commit: 3477d22387355f9e073af2e2f0b67e65a58b217d (patch)
tree: 120dd2ed9337e5007a61d5cf65334b58a425d6dc /Source/SPFileManagerAdditions.m
parent: e19029b0b1f8d6cd81530993efbe78b84bad0337 (diff)
download: sequelpro-3477d22387355f9e073af2e2f0b67e65a58b217d.tar.gz
sequelpro-3477d22387355f9e073af2e2f0b67e65a58b217d.tar.bz2
sequelpro-3477d22387355f9e073af2e2f0b67e65a58b217d.zip
1 files changed, 25 insertions, 1 deletions
diff --git a/Source/SPFileManagerAdditions.m b/Source/SPFileManagerAdditions.m
index 3ddc2e1a..225d7125 100644
--- a/Source/SPFileManagerAdditions.m
+++ b/Source/SPFileManagerAdditions.m
@@ -171,9 +171,33 @@ static NSString *DirectoryLocationDomain = @"DirectoryLocationDomain";
 	}
 
 	UniversalDetector *fileEncodingDetector = [[UniversalDetector alloc] init];
-	[fileEncodingDetector analyzeData:[detectorFileHandle readDataOfLength:5000000]];
+	NSData *startData = [detectorFileHandle readDataOfLength:5000000];
+	[fileEncodingDetector analyzeData:startData];
 	detectedEncoding = [fileEncodingDetector encoding];
 	[fileEncodingDetector release];
+	
+	// #2860: NSUnicodeStringEncoding is itself an autodetect encoding, meaning "any UTF16 variant".
+	//        That makes the value rather useless if we want to pass it to some byte-to-string method later,
+	//        since it may guess wrong again. Nevertheless, UniversalDetector may return that "encoding".
+	//
+	//        So, if we have a BOM, let's be exact instead! That wouldn't matter if you converted exactly those bytes
+	//        for which you invoked this method, since NSString would itself find the BOM and read the data accordingly,
+	//        but if you do the conversion in chunks, only the first one will have the helping BOM and all following
+	//        chunks may be guessed wrong when using the unspecific NSUnicodeStringEncoding.
+	//
+	//        Apple's implementation for all byte-to-NSString methods can be found in __CFStringDecodeByteStream3()
+	
+	// Note: NSUTF16StringEncoding == NSUnicodeStringEncoding
+	if(detectedEncoding == NSUnicodeStringEncoding && [startData length] >= 2) {
+		const UInt8 *bytes = [startData bytes];
+		if(bytes[0] == 0xFE && bytes[1] == 0xFF) detectedEncoding = NSUTF16BigEndianStringEncoding;
+		else if(bytes[0] == 0xFF && bytes[1] == 0xFE) detectedEncoding = NSUTF16LittleEndianStringEncoding;
+	}
+	else if(detectedEncoding == NSUTF32StringEncoding && [startData length] >= 4) {
+		const UInt8 *bytes = [startData bytes];
+		if(bytes[0] == 0 && bytes[1] == 0 && bytes[2] == 0xFE && bytes[3] == 0xFF) detectedEncoding = NSUTF32BigEndianStringEncoding;
+		else if(bytes[0] == 0xFF && bytes[1] == 0xFE && bytes[2] == 0 && bytes[3] == 0) detectedEncoding = NSUTF32LittleEndianStringEncoding;
+	}
 
 	return detectedEncoding;
 }
author	Max <post@wickenrode.com>	2017-09-06 00:42:48 +0200
committer	Max <post@wickenrode.com>	2017-09-06 00:50:13 +0200
commit	3477d22387355f9e073af2e2f0b67e65a58b217d (patch)
tree	120dd2ed9337e5007a61d5cf65334b58a425d6dc /Source/SPFileManagerAdditions.m
parent	e19029b0b1f8d6cd81530993efbe78b84bad0337 (diff)
download	sequelpro-3477d22387355f9e073af2e2f0b67e65a58b217d.tar.gz sequelpro-3477d22387355f9e073af2e2f0b67e65a58b217d.tar.bz2 sequelpro-3477d22387355f9e073af2e2f0b67e65a58b217d.zip