SheetJS / sheetjs

📗 SheetJS Spreadsheet Data Toolkit -- New home https://git.sheetjs.com/SheetJS/sheetjs

Home Page:https://sheetjs.com/

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

parser does not support ZIP64 archives

silvialeung opened this issue · comments

I try to read a programmatically generated xlsx file (which can be open in MS Excel) but it throws a Nodejs memory overflow error despite the file being very small (5KB).
The file can pass the parsing when resaving it in MS Excel though.

code:

const workbook = XLSX.readFile(
        "test_20220812111206.xlsx"
   );

error: (file path is partially hidden)

[app] 2022-08-12 11:12 error: Error: Cannot create a string longer than 0x1fffffe8 characters
    at Object.slice (node:buffer:621:37)
    at Buffer.toString (node:buffer:812:14)
    at cc2str (/node_modules/xlsx/xlsx.js:3272:14)
    at getdatastr (/node_modules/xlsx/xlsx.js:3348:39)
    at getzipstr (/node_modules/xlsx/xlsx.js:3396:19)
    at parse_zip (/node_modules/xlsx/xlsx.js:23118:22)
    at read_zip (/node_modules/xlsx/xlsx.js:23644:9)
    at readSync (/node_modules/xlsx/xlsx.js:23715:69)
    at Object.readFileSync (/node_modules/xlsx/xlsx.js:23738:9)
    at/src/controllers/Import.ts:57:29
    at Generator.next (<anonymous>)
    at fulfilled (/src/controllers/Import.ts:5:58)
    at processTicksAndRejections (node:internal/process/task_queues:96:5) {
  code: 'ERR_STRING_TOO_LONG'
}

Further tracing reveals that the code fails when getting [Content_Types].xml string. I compared the generated excel's and the "resaved in MS Excel application"'s [Content_Types].xml. The differences are as below:

  1. There is an extra line in generated excel.
<Override PartName="/xl/theme/theme1.xml" ContentType="application/vnd.openxmlformats-officedocument.theme+xml" />
  1. There is an standalone attribute in <?xml> line in generated excel
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>

The file that causes the issue:
test_20220812111206.xlsx

Thanks for sharing! This is a ZIP issue. The writer used the ZIP64 extension, which is not currently supported in the cfb library

Thank you for the quick reply. Is there any workaround for this issue currently?

@silvialeung which version of NPOI is used to generate this xlsx file?

To wrap up this issue on the SheetJS side:

  1. Reading the ZIP64 extended length info requires a change to bits/18_cfb.js:
--- a/bits/18_cfb.js
+++ b/bits/18_cfb.js
@@ -211,8 +211,15 @@ function parse_extra_field(blob/*:CFBlob*/)/*:any*/ {
                                        if(flags & 4) p.ctime = blob.read_shift(4);
                                }
                                if(p.mtime) p.mt = new Date(p.mtime*1000);
-                       }
-                       break;
+                       } break;
+                       /* ZIP64 Extended Information Field */
+                       case 0x0001: {
+                               var sz1 = blob.read_shift(4), sz2 = blob.read_shift(4);
+                               p.usz = (sz2 * Math.pow(2,32) + sz1);
+                               sz1 = blob.read_shift(4); sz2 = blob.read_shift(4);
+                               p.csz = (sz2 * Math.pow(2,32) + sz1);
+                               // NOTE: volume fields are skipped
+                       } break;
                }
                blob.l = tgt;
                o[type] = p;
@@ -1401,6 +1408,11 @@ function parse_zip(file/*:RawBytes*/, options/*:CFBReadOpts*/)/*:CFBContainer*/
 
                var L = blob.l;
                blob.l = offset + 4;
+               /* ZIP64 lengths */
+               if(EF && EF[0x0001]) {
+                       if((EF[0x0001]||{}).usz) usz = EF[0x0001].usz;
+                       if((EF[0x0001]||{}).csz) csz = EF[0x0001].csz;
+               }
                parse_local_file(blob, csz, usz, o, EF);
                blob.l = L;
        }
@@ -1430,7 +1442,13 @@ function parse_local_file(blob/*:CFBlob*/, csz/*:number*/, usz/*:number*/, o/*:C
        if(efsz) {
                var ef = parse_extra_field(/*::(*/blob.slice(blob.l, blob.l + efsz)/*:: :any)*/);
                if((ef[0x5455]||{}).mt) date = ef[0x5455].mt;
-               if(((EF||{})[0x5455]||{}).mt) date = EF[0x5455].mt;
+               if((ef[0x0001]||{}).usz) _usz = ef[0x0001].usz;
+               if((ef[0x0001]||{}).csz) _csz = ef[0x0001].csz;
+               if(EF) {
+                       if((EF[0x5455]||{}).mt) date = EF[0x5455].mt;
+                       if((EF[0x0001]||{}).usz) _usz = ef[0x0001].usz;
+                       if((EF[0x0001]||{}).csz) _csz = ef[0x0001].csz;
+               }
        }
        blob.l += efsz;
  1. The test file uses a custom namespace in the custom properties, requiring a change to bits/35_custprops.js:
--- a/bits/35_custprops.js
+++ b/bits/35_custprops.js
@@ -5,7 +5,7 @@ function parse_cust_props(data/*:string*/, opts) {
        var m = data.match(custregex);
        if(m) for(var i = 0; i != m.length; ++i) {
                var x = m[i], y = parsexmltag(x);
-               switch(y[0]) {
+               switch(strip_ns(y[0])) {
                        case '<?xml': break;
                        case '<Properties': break;
                        case '<property': name = unescapexml(y.name); break;

(we'll accept a PR, as well as a similar PR for https://github.com/SheetJS/js-cfb )

NPOI

@tonyqus Sorry but the version is unknown. The excel was downloaded from a third party private software.

To wrap up this issue on the SheetJS side:

  1. Reading the ZIP64 extended length info requires a change to bits/18_cfb.js:
--- a/bits/18_cfb.js
+++ b/bits/18_cfb.js
@@ -211,8 +211,15 @@ function parse_extra_field(blob/*:CFBlob*/)/*:any*/ {
                                        if(flags & 4) p.ctime = blob.read_shift(4);
                                }
                                if(p.mtime) p.mt = new Date(p.mtime*1000);
-                       }
-                       break;
+                       } break;
+                       /* ZIP64 Extended Information Field */
+                       case 0x0001: {
+                               var sz1 = blob.read_shift(4), sz2 = blob.read_shift(4);
+                               p.usz = (sz2 * Math.pow(2,32) + sz1);
+                               sz1 = blob.read_shift(4); sz2 = blob.read_shift(4);
+                               p.csz = (sz2 * Math.pow(2,32) + sz1);
+                               // NOTE: volume fields are skipped
+                       } break;
                }
                blob.l = tgt;
                o[type] = p;
@@ -1401,6 +1408,11 @@ function parse_zip(file/*:RawBytes*/, options/*:CFBReadOpts*/)/*:CFBContainer*/
 
                var L = blob.l;
                blob.l = offset + 4;
+               /* ZIP64 lengths */
+               if(EF && EF[0x0001]) {
+                       if((EF[0x0001]||{}).usz) usz = EF[0x0001].usz;
+                       if((EF[0x0001]||{}).csz) csz = EF[0x0001].csz;
+               }
                parse_local_file(blob, csz, usz, o, EF);
                blob.l = L;
        }
@@ -1430,7 +1442,13 @@ function parse_local_file(blob/*:CFBlob*/, csz/*:number*/, usz/*:number*/, o/*:C
        if(efsz) {
                var ef = parse_extra_field(/*::(*/blob.slice(blob.l, blob.l + efsz)/*:: :any)*/);
                if((ef[0x5455]||{}).mt) date = ef[0x5455].mt;
-               if(((EF||{})[0x5455]||{}).mt) date = EF[0x5455].mt;
+               if((ef[0x0001]||{}).usz) _usz = ef[0x0001].usz;
+               if((ef[0x0001]||{}).csz) _csz = ef[0x0001].csz;
+               if(EF) {
+                       if((EF[0x5455]||{}).mt) date = EF[0x5455].mt;
+                       if((EF[0x0001]||{}).usz) _usz = ef[0x0001].usz;
+                       if((EF[0x0001]||{}).csz) _csz = ef[0x0001].csz;
+               }
        }
        blob.l += efsz;
  1. The test file uses a custom namespace in the custom properties, requiring a change to bits/35_custprops.js:
--- a/bits/35_custprops.js
+++ b/bits/35_custprops.js
@@ -5,7 +5,7 @@ function parse_cust_props(data/*:string*/, opts) {
        var m = data.match(custregex);
        if(m) for(var i = 0; i != m.length; ++i) {
                var x = m[i], y = parsexmltag(x);
-               switch(y[0]) {
+               switch(strip_ns(y[0])) {
                        case '<?xml': break;
                        case '<Properties': break;
                        case '<property': name = unescapexml(y.name); break;

(we'll accept a PR, as well as a similar PR for https://github.com/SheetJS/js-cfb )

@SheetJSDev
Is it worked??

This commit was part of 0.18.11 and it should work with the current release (0.18.12). See https://docs.sheetjs.com/docs/getting-started/#installation for installation instructions for your deployment

@Santosh1652 can you share the file?