diff options
author | Max <post@wickenrode.com> | 2017-09-06 00:42:48 +0200 |
---|---|---|
committer | Max <post@wickenrode.com> | 2017-09-06 00:50:13 +0200 |
commit | 3477d22387355f9e073af2e2f0b67e65a58b217d (patch) | |
tree | 120dd2ed9337e5007a61d5cf65334b58a425d6dc /Source | |
parent | e19029b0b1f8d6cd81530993efbe78b84bad0337 (diff) | |
download | sequelpro-3477d22387355f9e073af2e2f0b67e65a58b217d.tar.gz sequelpro-3477d22387355f9e073af2e2f0b67e65a58b217d.tar.bz2 sequelpro-3477d22387355f9e073af2e2f0b67e65a58b217d.zip |
For UTF16 and UTF32 encoded files the charset autodetection logic did not return a specific enough value, resulting in Cocoa sometimes guessing the wrong byte order even when a BOM was present (part of #2860)
Diffstat (limited to 'Source')
-rw-r--r-- | Source/SPFileManagerAdditions.m | 26 |
1 files changed, 25 insertions, 1 deletions
diff --git a/Source/SPFileManagerAdditions.m b/Source/SPFileManagerAdditions.m index 3ddc2e1a..225d7125 100644 --- a/Source/SPFileManagerAdditions.m +++ b/Source/SPFileManagerAdditions.m @@ -171,9 +171,33 @@ static NSString *DirectoryLocationDomain = @"DirectoryLocationDomain"; } UniversalDetector *fileEncodingDetector = [[UniversalDetector alloc] init]; - [fileEncodingDetector analyzeData:[detectorFileHandle readDataOfLength:5000000]]; + NSData *startData = [detectorFileHandle readDataOfLength:5000000]; + [fileEncodingDetector analyzeData:startData]; detectedEncoding = [fileEncodingDetector encoding]; [fileEncodingDetector release]; + + // #2860: NSUnicodeStringEncoding is itself an autodetect encoding, meaning "any UTF16 variant". + // Which means that value is rather useless if we want to pass it to some byte-to-string method later, + // since it may guess wrong again. Nevertheless UniversalDetector may return that "encoding". + // + // So, if we have a BOM, let's be exact instead! That wouldn't matter if you try to convert exactly those bytes + // for which you invoked this method, since NSString will itself find the BOM and read the data accordingly, + // but if you do the conversion in chunks, only the first one will have the helping BOM and all following + // chunks may be guessed wrong when using the unspecific NSUnicodeStringEncoding. + // + // Apple's implementation for all byte-to-NSString methods can be found in __CFStringDecodeByteStream3() + + // Note: NSUTF16StringEncoding == NSUnicodeStringEncoding + if(detectedEncoding == NSUnicodeStringEncoding && [startData length] >= 2) { + const UInt8 *bytes = [startData bytes]; + if(bytes[0] == 0xFE && bytes[1] == 0xFF) detectedEncoding = NSUTF16BigEndianStringEncoding; + else if(bytes[0] == 0xFF && bytes[1] == 0xFE) detectedEncoding = NSUTF16LittleEndianStringEncoding; + } + else if(detectedEncoding == NSUTF32StringEncoding && [startData length] >= 4) { + const UInt8 *bytes = [startData bytes]; + if(bytes[0] == 0 && bytes[1] == 0 && bytes[2] == 0xFE && bytes[3] == 0xFF) detectedEncoding = NSUTF32BigEndianStringEncoding; + else if(bytes[0] == 0xFF && bytes[1] == 0xFE && bytes[2] == 0 && bytes[3] == 0) detectedEncoding = NSUTF32LittleEndianStringEncoding; + } return detectedEncoding; } |