
Opening and destroying the same entry causes high memory usage #87

Closed

noreng opened this issue May 28, 2018 · 4 comments

noreng commented May 28, 2018

I need to read the first row of a zipped CSV file to create a table in a PostgreSQL database, and then pg-copy the whole CSV file into the table.

I open the same zip entry twice with zipFile.openReadStream. After I read the first chunk, I destroy the stream (with readStream.destroy()) and open it again. Everything works as I expect, except the memory usage:
I have a ~1.6 GB CSV file compressed to a 110 MB zip. When I open the entry for the first time, the max memory usage (RSS) is 120 MB. After re-opening the same entry, the RSS climbs from 120 MB to 1.7 GB. If I don't destroy the stream on the first run and let it finish, the max RSS is 150 MB and stays that low after I reopen the entry, so I don't think the problem is with re-opening the entry; there must be something wrong with how I destroy it.

My system: Windows 10, Node 10.

An excerpt of my code:

const yauzl = require('yauzl');
const fs = require('fs');
const promisify = require('es6-promisify');

(async function () {
    const fileBuffer = await promisify(fs.readFile)('./large.zip');

    console.log('started');
    const yauzlFromBuffer = promisify(yauzl.fromBuffer);
    const zipfile = await yauzlFromBuffer(fileBuffer, { lazyEntries: true });

    const openReadStream = promisify(zipfile.openReadStream.bind(zipfile));
    zipfile.readEntry();

    zipfile.on('entry', async (entry) => {
        console.log('1) read the entry');
        const stream = await openReadStream(entry);
        // read the first chunk only
        const justFirstChunk = true;
        await readEntry(stream, entry.uncompressedSize, justFirstChunk);

        // This is where I handle the first row

        console.log('2) read the same entry again');
        const stream2 = await openReadStream(entry);
        await readEntry(stream2, entry.uncompressedSize);
        zipfile.readEntry();
    });

    await new Promise(resolve => {
        zipfile.on('end', () => {
            console.log('finished');
            resolve();
        });
    });
}());

async function readEntry(readStream, entrySize, justFirstChunk) {
    logMemoryUsage();
    const onData = getProgressHandler(entrySize);
    await new Promise((resolve, reject) => {
        readStream
            .on('error', reject)
            .on('data', chunk => {
                // destroy the stream after the first chunk
                if (justFirstChunk) {
                    readStream.destroy();
                    console.log('readEntry destroyed');
                    logMemoryUsage();
                    resolve();
                } else {
                    onData(chunk);
                }
            })
            .on('end', () => {
                console.log('readEntry end');
                resolve();
            });
    });
}

function getProgressHandler(entrySize) {
    let bytes = 0;
    let step = 0.1;
    let nextProgressStep = bytes + step;
    return function (chunk) {
        bytes += chunk.length;
        const progress = Math.round(bytes / entrySize * 100) / 100;
        if (progress >= nextProgressStep) {
            console.log(progress);
            logMemoryUsage();
            nextProgressStep = ((nextProgressStep / step) + 1) * step;
        }
    };
}

function logMemoryUsage() {
    const memoryMB = Math.round(process.memoryUsage().rss / (1024 * 1024));
    console.log('memoryUsage:', memoryMB, 'MB');
}
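
(The "This is where I handle the first row" comment above is only a placeholder; a hypothetical sketch of that step, assuming the CSV header fits entirely within the first chunk, could look like this:)

function getHeaderRow(firstChunk) {
    // Hypothetical helper, not part of the original report: extract the
    // header line from the first chunk, assuming it contains a newline.
    const text = firstChunk.toString('utf8');
    const newlineIndex = text.indexOf('\n');
    return newlineIndex === -1 ? text : text.slice(0, newlineIndex);
}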

Memory usage:

started                      
1) read the entry            
memoryUsage: 121 MB          
readEntry destroyed          
memoryUsage: 121 MB          
2) read the same entry again 
memoryUsage: 121 MB          
0.1                          
memoryUsage: 311 MB          
0.2                          
memoryUsage: 479 MB          
0.31                         
memoryUsage: 669 MB          
0.4                          
memoryUsage: 818 MB          
0.5                          
memoryUsage: 982 MB          
0.61                         
memoryUsage: 1157 MB         
0.71                         
memoryUsage: 1339 MB         
0.81                         
memoryUsage: 1506 MB         
0.91                         
memoryUsage: 1672 MB         
readEntry end                
finished                                              
thejoshwolfe (Owner) commented

I've reproduced the issue. I'll look into this and get back to you.

thejoshwolfe (Owner) commented

I was able to resolve the issue by modifying the BufferSlicer.prototype.createReadStream implementation in yauzl's fd-slicer dependency. It appears the problem comes from handing the zlib.createInflateRaw() stream a very large buffer in one chunk. If you cut the buffer up into 0x10000-byte chunks, there is no problem.

I'll make a PR against fd-slicer with this feature, and then update yauzl to depend on it.
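
For illustration only (this is not the actual fd-slicer patch), a minimal sketch of the idea: emit the buffer in 0x10000-byte chunks rather than as one large push.

const { Readable } = require('stream');
const zlib = require('zlib');

// Sketch only: a Readable that emits a Buffer in fixed-size chunks, so a
// downstream zlib.createInflateRaw() stream never receives one huge chunk.
class ChunkedBufferStream extends Readable {
    constructor(buffer, chunkSize = 0x10000) {
        super();
        this.buffer = buffer;
        this.chunkSize = chunkSize;
        this.offset = 0;
    }
    _read() {
        if (this.offset >= this.buffer.length) {
            this.push(null); // no more data
            return;
        }
        const end = Math.min(this.offset + this.chunkSize, this.buffer.length);
        this.push(this.buffer.slice(this.offset, end));
        this.offset = end;
    }
}

// Usage sketch: pipe the chunked stream into the inflater instead of
// writing the whole compressed buffer at once.
// new ChunkedBufferStream(compressedBuffer).pipe(zlib.createInflateRaw());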

thejoshwolfe (Owner) commented

Published yauzl 2.9.2, which should fix this issue. Give it a try and let me know what you observe.

noreng (Author) commented Jun 4, 2018

I tested it and it works fine, even with larger files (tested with a 300 MB zip). The memory usage is steady and low. Many thanks!
