From a80ae1d0c40773d099937b2036e4f0aff1ffd137 Mon Sep 17 00:00:00 2001 From: David Duponchel Date: Tue, 14 Apr 2015 20:40:30 +0200 Subject: [PATCH 1/2] Add decodeFileName in load(). Some archive managers use the machine encoding to generate the file name and comment. The user can recover the content of a file not encoded in UTF-8 but can't do anything for the file name. This new option adds the ability to decode file names and comments with a custom algorithm. See #210. --- documentation/api_jszip/load.md | 24 ++++++++++++++++++++++++ lib/load.js | 14 +++++++++++--- lib/object.js | 24 +++--------------------- lib/utils.js | 18 ++++++++++++++++++ lib/zipEntries.js | 8 +++++--- lib/zipEntry.js | 27 ++++++++++++++++++--------- test/ref/local_encoding_in_name.zip | Bin 0 -> 350 bytes test/test.js | 27 ++++++++++++++++++++++++++- 8 files changed, 105 insertions(+), 37 deletions(-) create mode 100644 test/ref/local_encoding_in_name.zip diff --git a/documentation/api_jszip/load.md b/documentation/api_jszip/load.md index a2f54ad1..8d9591c2 100644 --- a/documentation/api_jszip/load.md +++ b/documentation/api_jszip/load.md @@ -23,6 +23,7 @@ options.base64 | boolean | false | set to `true` if the data is options.checkCRC32 | boolean | false | set to `true` if the read data should be checked against its CRC32. options.optimizedBinaryString | boolean | false | set to true if (and only if) the input is a string and has already been prepared with a 0xFF mask. options.createFolders | boolean | false | set to true to create folders in the file path automatically. Leaving it false will result in only virtual folders (i.e. folders that merely represent part of the file path) being created. +options.decodeFileName | function | decode from UTF-8 | the function to decode the file name / comment. You shouldn't update the data given to this method : it is kept as is, so any update will impact the stored data. 
@@ -39,6 +40,16 @@ Zip features not (yet) supported : * password protected zip * multi-volume zip + +__About `decodeFileName`__ : + +A zip file has a flag to say if the filename and comment are encoded with UTF-8. +If it's not set, JSZip has **no way** to know the encoding used. It usually +is the default encoding of the operating system. + +The function takes the bytes array (Uint8Array or Array) and returns the +decoded string. + __Returns__ : The current JSZip object. __Throws__ : An exception if the loaded data is not valid zip data or if it @@ -79,3 +90,16 @@ zip.folder("subfolder").load(data); // the content of data will be loaded in subfolder/ ``` +Using a custom charset : + +```js +// using iconv-lite for example +var iconv = require('iconv-lite'); + +zip.load(content, { + decodeFileName: function (bytes) { + return iconv.decode(bytes, 'your-encoding'); + } +}); +``` + diff --git a/lib/load.js b/lib/load.js index e0031101..09d8db88 100644 --- a/lib/load.js +++ b/lib/load.js @@ -1,9 +1,17 @@ 'use strict'; var base64 = require('./base64'); +var utf8 = require('./utf8'); +var utils = require('./utils'); var ZipEntries = require('./zipEntries'); module.exports = function(data, options) { var files, zipEntries, i, input; - options = options || {}; + options = utils.extend(options || {}, { + base64: false, + checkCRC32: false, + optimizedBinaryString : false, + createFolders: false, + decodeFileName: utf8.utf8decode + }); if (options.base64) { data = base64.decode(data); } @@ -12,12 +20,12 @@ module.exports = function(data, options) { files = zipEntries.files; for (i = 0; i < files.length; i++) { input = files[i]; - this.file(input.fileName, input.decompressed, { + this.file(input.fileNameStr, input.decompressed, { binary: true, optimizedBinaryString: true, date: input.date, dir: input.dir, - comment : input.fileComment.length ? input.fileComment : null, + comment : input.fileCommentStr.length ? 
input.fileCommentStr : null, unixPermissions : input.unixPermissions, dosPermissions : input.dosPermissions, createFolders: options.createFolders diff --git a/lib/object.js b/lib/object.js index 3b5e5b8b..03c47fc3 100644 --- a/lib/object.js +++ b/lib/object.js @@ -173,24 +173,6 @@ var decToHex = function(dec, bytes) { return hex; }; -/** - * Merge the objects passed as parameters into a new one. - * @private - * @param {...Object} var_args All objects to merge. - * @return {Object} a new object with the data of the others. - */ -var extend = function() { - var result = {}, i, attr; - for (i = 0; i < arguments.length; i++) { // arguments is not enumerable in some browsers - for (attr in arguments[i]) { - if (arguments[i].hasOwnProperty(attr) && typeof result[attr] === "undefined") { - result[attr] = arguments[i][attr]; - } - } - } - return result; -}; - /** * Transforms the (incomplete) options from the user into the complete * set of options to create a file. @@ -203,7 +185,7 @@ var prepareFileAttrs = function(o) { if (o.base64 === true && (o.binary === null || o.binary === undefined)) { o.binary = true; } - o = extend(o, defaults); + o = utils.extend(o, defaults); o.date = o.date || new Date(); if (o.compression !== null) o.compression = o.compression.toUpperCase(); @@ -634,7 +616,7 @@ var out = { } file = this.files[filename]; // return a new object, don't let the user mess with our internal objects :) - fileClone = new ZipObject(file.name, file._data, extend(file.options)); + fileClone = new ZipObject(file.name, file._data, utils.extend(file.options)); relativePath = filename.slice(this.root.length, filename.length); if (filename.slice(0, this.root.length) === this.root && // the file is in the current root search(relativePath, fileClone)) { // and the file matches the function @@ -741,7 +723,7 @@ var out = { * @return {String|Uint8Array|ArrayBuffer|Buffer|Blob} the zip file */ generate: function(options) { - options = extend(options || {}, { + options = 
utils.extend(options || {}, { base64: true, compression: "STORE", compressionOptions : null, diff --git a/lib/utils.js b/lib/utils.js index da54747a..779ac886 100644 --- a/lib/utils.js +++ b/lib/utils.js @@ -324,3 +324,21 @@ exports.isRegExp = function (object) { return Object.prototype.toString.call(object) === "[object RegExp]"; }; +/** + * Merge the objects passed as parameters into a new one. + * @private + * @param {...Object} var_args All objects to merge. + * @return {Object} a new object with the data of the others. + */ +exports.extend = function() { + var result = {}, i, attr; + for (i = 0; i < arguments.length; i++) { // arguments is not enumerable in some browsers + for (attr in arguments[i]) { + if (arguments[i].hasOwnProperty(attr) && typeof result[attr] === "undefined") { + result[attr] = arguments[i][attr]; + } + } + } + return result; +}; + diff --git a/lib/zipEntries.js b/lib/zipEntries.js index 4b825613..dc51fb81 100644 --- a/lib/zipEntries.js +++ b/lib/zipEntries.js @@ -48,10 +48,12 @@ ZipEntries.prototype = { // warning : the encoding depends of the system locale // On a linux machine with LANG=en_US.utf8, this field is utf8 encoded. // On a windows machine, this field is encoded with the localized windows code page. - this.zipComment = this.reader.readString(this.zipCommentLength); + var zipComment = this.reader.readData(this.zipCommentLength); + var decodeParamType = support.uint8array ? "uint8array" : "array"; // To get consistent behavior with the generation part, we will assume that - // this is utf8 encoded. - this.zipComment = jszipProto.utf8decode(this.zipComment); + // this is utf8 encoded unless specified otherwise. + var decodeContent = utils.transformTo(decodeParamType, zipComment); + this.zipComment = this.loadOptions.decodeFileName(decodeContent); }, /** * Read the end of the Zip 64 central directory. 
diff --git a/lib/zipEntry.js b/lib/zipEntry.js index 70a3ac0a..73564f31 100644 --- a/lib/zipEntry.js +++ b/lib/zipEntry.js @@ -3,6 +3,7 @@ var StringReader = require('./stringReader'); var utils = require('./utils'); var CompressedObject = require('./compressedObject'); var jszipProto = require('./object'); +var support = require('./support'); var MADE_BY_DOS = 0x00; var MADE_BY_UNIX = 0x03; @@ -100,7 +101,7 @@ ZipEntry.prototype = { // Unfortunately, this lead also to some issues : http://seclists.org/fulldisclosure/2009/Sep/394 this.fileNameLength = reader.readInt(2); localExtraFieldsLength = reader.readInt(2); // can't be sure this will be the same as the central dir - this.fileName = reader.readString(this.fileNameLength); + this.fileName = reader.readData(this.fileNameLength); reader.skip(localExtraFieldsLength); if (this.compressedSize == -1 || this.uncompressedSize == -1) { @@ -109,7 +110,7 @@ ZipEntry.prototype = { compression = utils.findCompression(this.compressionMethod); if (compression === null) { // no compression found - throw new Error("Corrupted zip : compression " + utils.pretty(this.compressionMethod) + " unknown (inner file : " + this.fileName + ")"); + throw new Error("Corrupted zip : compression " + utils.pretty(this.compressionMethod) + " unknown (inner file : " + utils.transformTo("string", this.fileName) + ")"); } this.decompressed = new CompressedObject(); this.decompressed.compressedSize = this.compressedSize; @@ -153,10 +154,10 @@ ZipEntry.prototype = { throw new Error("Encrypted zip are not supported"); } - this.fileName = reader.readString(this.fileNameLength); + this.fileName = reader.readData(this.fileNameLength); this.readExtraFields(reader); this.parseZIP64ExtraField(reader); - this.fileComment = reader.readString(this.fileCommentLength); + this.fileComment = reader.readData(this.fileCommentLength); }, /** @@ -183,7 +184,7 @@ ZipEntry.prototype = { } // fail safe : if the name ends with a / it probably means a folder - if 
(!this.dir && this.fileName.slice(-1) === '/') { + if (!this.dir && this.fileNameStr.slice(-1) === '/') { this.dir = true; } }, @@ -244,17 +245,25 @@ ZipEntry.prototype = { * Apply an UTF8 transformation if needed. */ handleUTF8: function() { + var decodeParamType = support.uint8array ? "uint8array" : "array"; if (this.useUTF8()) { - this.fileName = jszipProto.utf8decode(this.fileName); - this.fileComment = jszipProto.utf8decode(this.fileComment); + this.fileNameStr = jszipProto.utf8decode(this.fileName); + this.fileCommentStr = jszipProto.utf8decode(this.fileComment); } else { var upath = this.findExtraFieldUnicodePath(); if (upath !== null) { - this.fileName = upath; + this.fileNameStr = upath; + } else { + var fileNameByteArray = utils.transformTo(decodeParamType, this.fileName); + this.fileNameStr = this.loadOptions.decodeFileName(fileNameByteArray); } + var ucomment = this.findExtraFieldUnicodeComment(); if (ucomment !== null) { - this.fileComment = ucomment; + this.fileCommentStr = ucomment; + } else { + var commentByteArray = utils.transformTo(decodeParamType, this.fileComment); + this.fileCommentStr = this.loadOptions.decodeFileName(commentByteArray); } } }, diff --git a/test/ref/local_encoding_in_name.zip b/test/ref/local_encoding_in_name.zip new file mode 100644 index 0000000000000000000000000000000000000000..9ebae32df80c0d940797b55947a96b85f8458ed8 GIT binary patch literal 350 zcmWIWW@Zs#0D)(-d)>ecD8U1yd)F;m@Lpm4g7vEw=m!8*a)IU0RcfHA1nPRdQsL3k zRSzG5SxeTfdc0=o+DCdN6(s@Qj7;_nxNKB`+ARR3k?rSVPymZEGDtAUrkOjimtA0l v(T%1@Y(ad8>9|}8GaY1(77!zwj?c~bY=*fYz?+o~ Date: Thu, 16 Apr 2015 20:57:23 +0200 Subject: [PATCH 2/2] Add encodeFileName in generate(). This option allows the user to generate a zip file with a custom encoding for the entry file name / comment. See also "decodeFileName" in load(). Not using UTF-8 when generating a zip file is maybe asking for troubles but some archive managers (Windows Compressed Folders for example) don't support well unicode stuff. 
See #210. --- documentation/api_jszip/generate.md | 22 +++++++++++++++++ documentation/limitations.md | 23 ++++++++++++++---- lib/object.js | 33 ++++++++++++++----------- test/test.js | 37 +++++++++++++++++++++-------- 4 files changed, 87 insertions(+), 28 deletions(-) diff --git a/documentation/api_jszip/generate.md b/documentation/api_jszip/generate.md index 7b00f97e..328678a6 100644 --- a/documentation/api_jszip/generate.md +++ b/documentation/api_jszip/generate.md @@ -18,6 +18,7 @@ options.type | string | `base64` | The type of zip to return, see below options.comment | string | | The comment to use for the zip file. options.mimeType | string | `application/zip` | mime-type for the generated file. Useful when you need to generate a file with a different extension, ie: ".ods". options.platform | string | `DOS` | The platform to use when generating the zip file. +options.encodeFileName | function | encode with UTF-8 | the function to encode the file name / comment. Possible values for `type` : @@ -58,6 +59,13 @@ If you set the platform value on nodejs, be sure to use `process.platform`. force the platform to `UNIX` the generated zip file will have a strange behavior on UNIX platforms. +__About `encodeFileName`__ : + +By default, JSZip uses UTF-8 to encode the file names / comments. You can use +this method to force another encoding. Note : the encoding used is not stored +in a zip file, so not using UTF-8 may lead to encoding issues. +The function takes a string and returns a bytes array (Uint8Array or Array). + __Returns__ : The generated zip file. 
__Throws__ : An exception if the asked `type` is not available in the browser, @@ -137,3 +145,17 @@ link.href = url; ``` +Using a custom charset : + +```js +// using iconv-lite for example +var iconv = require('iconv-lite'); + +zip.generate({ + type: 'uint8array', + encodeFileName: function (string) { + return iconv.encode(string, 'your-encoding'); + } +}); +``` + diff --git a/documentation/limitations.md b/documentation/limitations.md index 62db22e5..b05df28e 100644 --- a/documentation/limitations.md +++ b/documentation/limitations.md @@ -67,7 +67,22 @@ Some data are discarded (file metadata) and other are added (subfolders). ### Encodings support -JSZip only supports utf8 : if the names of the files inside the zip are not in -utf8 (or ASCII), they won't be interpreted correctly. If the content is a text -not encoded with utf8 (or ASCII), the `asText()` method won't decode it -correctly. +JSZip only supports UTF-8 natively. A zip file doesn't contain the name of the +encoding used, you need to know it before doing anything. + +#### File name + +If the name of a file inside the zip is encoded with UTF-8 then JSZip can +detect it (Language encoding flag, Unicode Path Extra Field). If not, JSZip +can't detect the encoding used and will generate [Mojibake](https://en.wikipedia.org/wiki/Mojibake). +You can use the [encodeFileName]({{site.baseurl}}/documentation/api_jszip/generate.html) +option and the [decodeFileName]({{site.baseurl}}/documentation/api_jszip/load.html) +option to encode/decode using a custom encoding. + +#### File content + +The `asText()` method uses UTF-8 to decode the content. If you have a text in +a different encoding, you can get the bytes array with `asUint8Array()` and +decode it with a lib (iconv, iconv-lite, etc) on your side. +To save a text using a non-UTF-8 encoding, do the same : encode it into a +Uint8Array before adding it to JSZip. 
diff --git a/lib/object.js b/lib/object.js index 03c47fc3..7b99bb11 100644 --- a/lib/object.js +++ b/lib/object.js @@ -420,12 +420,16 @@ var generateDosExternalFileAttr = function (dosPermissions, isDir) { * @param {JSZip.CompressedObject} compressedObject the compressed object. * @param {number} offset the current offset from the start of the zip file. * @param {String} platform let's pretend we are this platform (change platform dependents fields) + * @param {Function} encodeFileName the function to encode the file name / comment. * @return {object} the zip parts. */ -var generateZipParts = function(name, file, compressedObject, offset, platform) { +var generateZipParts = function(name, file, compressedObject, offset, platform, encodeFileName) { var data = compressedObject.compressedContent, + useCustomEncoding = encodeFileName !== utf8.utf8encode, + encodedFileName = utils.transformTo("string", encodeFileName(file.name)), utfEncodedFileName = utils.transformTo("string", utf8.utf8encode(file.name)), comment = file.comment || "", + encodedComment = utils.transformTo("string", encodeFileName(comment)), utfEncodedComment = utils.transformTo("string", utf8.utf8encode(comment)), useUTF8ForFileName = utfEncodedFileName.length !== file.name.length, useUTF8ForComment = utfEncodedComment.length !== comment.length, @@ -497,7 +501,7 @@ var generateZipParts = function(name, file, compressedObject, offset, platform) // Version decToHex(1, 1) + // NameCRC32 - decToHex(crc32(utfEncodedFileName), 4) + + decToHex(crc32(encodedFileName), 4) + // UnicodeName utfEncodedFileName; @@ -516,7 +520,7 @@ var generateZipParts = function(name, file, compressedObject, offset, platform) // Version decToHex(1, 1) + // CommentCRC32 - decToHex(this.crc32(utfEncodedComment), 4) + + decToHex(this.crc32(encodedComment), 4) + // UnicodeName utfEncodedComment; @@ -535,7 +539,7 @@ var generateZipParts = function(name, file, compressedObject, offset, platform) header += "\x0A\x00"; // general purpose 
bit flag // set bit 11 if utf8 - header += (useUTF8ForFileName || useUTF8ForComment) ? "\x00\x08" : "\x00\x00"; + header += !useCustomEncoding && (useUTF8ForFileName || useUTF8ForComment) ? "\x00\x08" : "\x00\x00"; // compression method header += compressedObject.compressionMethod; // last mod file time @@ -549,12 +553,12 @@ var generateZipParts = function(name, file, compressedObject, offset, platform) // uncompressed size header += decToHex(compressedObject.uncompressedSize, 4); // file name length - header += decToHex(utfEncodedFileName.length, 2); + header += decToHex(encodedFileName.length, 2); // extra field length header += decToHex(extraFields.length, 2); - var fileRecord = signature.LOCAL_FILE_HEADER + header + utfEncodedFileName + extraFields; + var fileRecord = signature.LOCAL_FILE_HEADER + header + encodedFileName + extraFields; var dirRecord = signature.CENTRAL_FILE_HEADER + // version made by (00: DOS) @@ -562,7 +566,7 @@ var generateZipParts = function(name, file, compressedObject, offset, platform) // file header (common to file and central directory) header + // file comment length - decToHex(utfEncodedComment.length, 2) + + decToHex(encodedComment.length, 2) + // disk number start "\x00\x00" + // internal file attributes TODO @@ -572,11 +576,11 @@ var generateZipParts = function(name, file, compressedObject, offset, platform) // relative offset of local header decToHex(offset, 4) + // file name - utfEncodedFileName + + encodedFileName + // extra field extraFields + // file comment - utfEncodedComment; + encodedComment; return { fileRecord: fileRecord, @@ -730,7 +734,8 @@ var out = { type: "base64", platform: "DOS", comment: null, - mimeType: 'application/zip' + mimeType: 'application/zip', + encodeFileName: utf8.utf8encode }); utils.checkSupport(options.type); @@ -752,7 +757,7 @@ var out = { localDirLength = 0, centralDirLength = 0, writer, i, - utfEncodedComment = utils.transformTo("string", this.utf8encode(options.comment || this.comment || 
"")); + encodedComment = utils.transformTo("string", options.encodeFileName(options.comment || this.comment || "")); // first, generate all the zip parts. for (var name in this.files) { @@ -770,7 +775,7 @@ var out = { var compressedObject = generateCompressedObjectFrom.call(this, file, compression, compressionOptions); - var zipPart = generateZipParts.call(this, name, file, compressedObject, localDirLength, options.platform); + var zipPart = generateZipParts.call(this, name, file, compressedObject, localDirLength, options.platform, options.encodeFileName); localDirLength += zipPart.fileRecord.length + compressedObject.compressedSize; centralDirLength += zipPart.dirRecord.length; zipData.push(zipPart); @@ -793,9 +798,9 @@ var out = { // offset of start of central directory with respect to the starting disk number decToHex(localDirLength, 4) + // .ZIP file comment length - decToHex(utfEncodedComment.length, 2) + + decToHex(encodedComment.length, 2) + // .ZIP file comment - utfEncodedComment; + encodedComment; // we have all the parts (and the total length) diff --git a/test/test.js b/test/test.js index 908b99f8..8a7432a0 100644 --- a/test/test.js +++ b/test/test.js @@ -167,23 +167,40 @@ testZipFile("Zip text file with non unicode characters in filename", "ref/local_ ok(!zipUnicode.files["Новая папка/Новый текстовый документ.txt"], "default : the file is not found"); var conversions = { - "bytes 8d ae a2 a0 ef 20 af a0 af aa a0 2f" : "Новая папка/", - "bytes 8d ae a2 a0 ef 20 af a0 af aa a0 2f 8d ae a2 eb a9 20 e2 a5 aa e1 e2 ae a2 eb a9 20 a4 ae aa e3 ac a5 ad e2 2e 74 78 74" : "Новая папка/Новый текстовый документ.txt" + "": [], + "Новая папка/": [0x8d, 0xae, 0xa2, 0xa0, 0xef, 0x20, 0xaf, 0xa0, 0xaf, 0xaa, 0xa0, 0x2f], + "Новая папка/Новый текстовый документ.txt": [0x8d, 0xae, 0xa2, 0xa0, 0xef, 0x20, 0xaf, 0xa0, 0xaf, 0xaa, 0xa0, 0x2f, 0x8d, 0xae, 0xa2, 0xeb, 0xa9, 0x20, 0xe2, 0xa5, 0xaa, 0xe1, 0xe2, 0xae, 0xa2, 0xeb, 0xa9, 0x20, 0xa4, 0xae, 0xaa, 0xe3, 0xac, 0xa5, 
0xad, 0xe2, 0x2e, 0x74, 0x78, 0x74] }; - var zipCP866 = new JSZip(content, { - decodeFileName: function (bytes) { - // here, a real iconv implementation - var key = "bytes"; - for(var i = 0; i < bytes.length; i++) { - key += " " + bytes[i].toString(16); + function decodeCP866(bytes) { + for(var text in conversions) { + if (conversions[text].length === bytes.length) { + return text; } - - return conversions[key] || ""; } + } + function encodeCP866(string) { + return conversions[string]; + } + var zipCP866 = new JSZip(content, { + decodeFileName: decodeCP866 }); ok(zipCP866.files["Новая папка/"], "with decodeFileName : the folder has been correctly read"); ok(zipCP866.files["Новая папка/Новый текстовый документ.txt"], "with decodeFileName : the file has been correctly read"); + + var newZip = zipCP866.generate({ + type:"string", + encodeFileName: encodeCP866 + }); + // the example zip doesn't contain the unicode path extra field, we can't + // compare them. + + var zipCP866Reloaded = new JSZip(newZip, { + decodeFileName: decodeCP866 + }); + + ok(zipCP866Reloaded.files["Новая папка/"], "reloaded, with decodeFileName : the folder has been correctly read"); + ok(zipCP866Reloaded.files["Новая папка/Новый текстовый документ.txt"], "reloaded, with decodeFileName : the file has been correctly read"); }); // zip -X -0 pile_of_poo.zip Iñtërnâtiônàlizætiøn☃💩.txt