Platforms: Windows and Linux
Everything is fine as long as I use 10.7.0 or older. The problem starts when I go to 10.8.0 or newer.
I created a test project that reproduces the issue. It
What you will see is that the write stream for the binary file gets the last chunk from the previous text file's read stream.
The read stream SHA-256 still is good, however, but the write stream's SHA-256 is not. The problem only seems to happen with binary files.
Test project output:
TEXT FILE TEST
TEXT CHUNK 5.3 Somatosensory Neurons have Receptive FieldsEach subcortical somatose... 65248
TEXT CHUNK ed the neuron’sreceptive field (Figure 5.3). The neuron’s receptive field ... 65242
TEXT CHUNK ed/represented. The receptive fields of neuronsinnervating/representing th... 4510
Original text file hash: 9101a84eb2320001628926e3c4decd09eff6680809e95a920db4e18afbcf0201
Read file calculated hash: 9101a84eb2320001628926e3c4decd09eff6680809e95a920db4e18afbcf0201
Write file calculated hash: 9101a84eb2320001628926e3c4decd09eff6680809e95a920db4e18afbcf0201
BINARY FILE TEST
BINARY CHUNK ed/represented. The receptive fields of neuronsinnervating/representing th.. 61008
BINARY CHUNK m5^gY5y"$ch86m&KmO}bD,P}r"sySZ6... 65536
BINARY CHUNK $J|0sd\hsrdh9Tn"?x<0ruZ3KD... 65536
BINARY CHUNK !>B2}REF7VEdK%Er(*$RuG"U$... 65536
...(a few more binary chunks)
BINARY CHUNK 1NI+`9sHRT)r.V%C&*"#*F_6HKw`... 47536
Original binary file hash: ee1758d957bac3706b6a0ad450ffaeab34d55d5c6d5988a8bef7ce72c8a7db85
Read file calculated hash: ee1758d957bac3706b6a0ad450ffaeab34d55d5c6d5988a8bef7ce72c8a7db85
Write file calculated hash: a9d6ee259f8d6315e5e59db7920d6389d6ebd60618d4a49a8550f8b3e340f06e
This issue tracker is for reporting bugs in node core and submitting feature requests for node core.
General help questions should be posted to the nodejs/help issue tracker instead.
Issues with third-party modules, npm, or other tools that use node, should be posted to the appropriate issue tracker for that project, unless it can be proven that the issue is in fact with node core and not the module/tool in question.
How is this a HELP question????
Or at least tell me why this is supposed to be a "help request", because I'm not actually feeling it, I mean that I'm asking for anyone's help.
test.tar.gz
Can I ask that you either copy the code into comments here or into a gist. I (and I know others here) are generally unwilling to download an opaque tar ball...
Also, I assume you mean Node.js 10.8.0 and not 10.0.8?
EDIT: type annotations removed
Test file 1: streams
'use strict';
const path = require('path');
const fs = require('fs');
const crypto = require('crypto');
module.exports.createReadStream = function (hash, ondata, encoding) {
return new Promise((resolve, reject) => {
const stream = fs.createReadStream(
hash,
{encoding}
);
const cryptoHashObj = crypto.createHash('sha256');
let buf;
if (encoding) {
stream.on('data', data => {
cryptoHashObj.update(Buffer.from(data, 'utf8'));
ondata(data);
});
} else {
// Convert the node.js Buffer to an ArrayBuffer and adjust the size if it is not full
stream.on('data', data => {
cryptoHashObj.update(data);
if (data.length < data.buffer.byteLength) {
ondata(data.buffer.slice(0, data.length));
} else {
ondata(data.buffer);
}
});
}
stream.on('error', err => reject(err));
stream.on('end', () => resolve(cryptoHashObj.digest('hex')));
});
};
module.exports.createWriteStream = function (filename, encoding) {
const cryptoHashObj = crypto.createHash('sha256');
const stream = fs.createWriteStream(
filename,
{encoding}
);
const write = data => {
const buf = typeof data === 'string' ?
Buffer.from(data, encoding) :
Buffer.from(data);
cryptoHashObj.update(buf);
stream.write(buf);
};
const end = () => {
return new Promise((resolve, reject) => {
const hash = cryptoHashObj.digest('hex');
stream.once('error', err => reject(err));
stream.once('finish', () => resolve(hash));
stream.end();
});
};
stream.once('error', err => console.error('Stream error', err));
return {
write,
end
};
};
Test file 2: main
'use strict';
const streams = require('./streams.js');
const UTF8_FILE = '9101a84eb2320001628926e3c4decd09eff6680809e95a920db4e18afbcf0201';
const BINARY_FILE = 'ee1758d957bac3706b6a0ad450ffaeab34d55d5c6d5988a8bef7ce72c8a7db85';
const txtFile = async function () {
console.log('\nTEXT FILE TEST\n');
const writeStream = streams.createWriteStream('copy-' + UTF8_FILE, 'utf8');
const onTxtData = function (data) {
if (data instanceof ArrayBuffer) {
throw new Error('What?');
}
console.log(
' TEXT CHUNK',
data.substr(0, 75).replace(/\n/g, '') + '...',
data.length
);
writeStream.write(data);
};
const readTextFileHash = await streams.createReadStream(UTF8_FILE, onTxtData, 'utf8');
const writeTextFileHash = await writeStream.end();
console.log('\nOriginal text file hash:', UTF8_FILE);
console.log('Read file calculated hash:', readTextFileHash);
console.log('Write file calculated hash:', writeTextFileHash);
};
const binFile = async function () {
console.log('\nBINARY FILE TEST\n');
const writeStream = streams.createWriteStream('copy-' + BINARY_FILE);
const onBinaryData = function (data) {
if (typeof data === 'string') {
throw new Error('What?');
}
console.log(
' BINARY CHUNK',
String.fromCharCode.apply(null, new Uint8Array(data)).substr(0, 75).replace(
/[^A-Za-z 0-9 \.,\?""!@#\$%\^&\*\(\)-_=\+;:<>\/\\\|\}\{\[\]`~]*/g,
''
) + '...',
data.byteLength
);
writeStream.write(data);
};
const readBinaryFileHash = await streams.createReadStream(BINARY_FILE, onBinaryData);
const writeBinaryFileHash = await writeStream.end();
console.log('\nOriginal binary file hash:', BINARY_FILE);
console.log('Read file calculated hash:', readBinaryFileHash);
console.log('Write file calculated hash:', writeBinaryFileHash);
};
const main = async function () {
await txtFile();
await binFile();
// To run more of those two tests in any sequenc:
// await txtFile();
// await txtFile();
// await txtFile();
// await binFile();
console.log('\nYou can also compare ');
};
main().catch(console.error);
This code is quite strange if all it is doing is copying files and calculating a hash while it happens. With the non-standard syntax here it is going to be impossible to determine if this really is a bug in Node.js, whatever transpiling utility you're using, or in your code. Can I ask you to remove the non-standard syntax from this and see if it has the same problem.
It is standard javascript, it merely has type annotations. They don't do anything. "Transpiling" merely means removing the types. This is Flow, not TypeScript. I'll remove the types. They are just like comments.
I updated the code above to remove the types.
The code is the way it is because the actual use case is a) cross platform (React Native, Browser, node.js) to read files on one node, send them via websocket, write them on another. That is why I have a frontend for streams, there is a different one on each platform. Just for the explanation, I think the code is simple enough, just standard stream things, I don't do anything fancy at all. I mean, The node.js part is just read a stream in chunks and write them to a write stream, Most of main.js is just to create a nice example.
And by the way, all that encoding stuff is necessary for the React Native platform which can do binary only using base64 encoding(!). Otherwise I would have only Buffers and no string streams.
FWIW, there is a much simpler way of doing what you're trying to do here, but I'm not going to go into that now.
@addaleax ... I'm wondering if the use of Promises here is causing some strange interaction with the changes from https://github.com/nodejs/node/pull/21968, which landed in 10.8.0.
@addaleax ... I'm wondering if the use of Promises here is causing some strange interaction with the changes from #21968, which landed in 10.8.0.
Looking into it, I think it’s indeed that PR, in that the more efficient usage of the underlying ArrayBuffers is what’s causing this:
if (data.length < data.buffer.byteLength) {
ondata(data.buffer.slice(0, data.length));
} else {
ondata(data.buffer);
}
This piece of your code doesn’t seem to do what it should; what if data.byteOffsetis not zero? You’d still return data from the start of the underlying ArrayBuffer. Sorry to disappoint, but this really doesn’t look like a bug in Node…
Ah, right, yep, just spotted that also. Prior to the change in #21968, that offset wouldn't really have mattered for fs streams but correct code still needs to account for it with arbitrary streams because that offset can definitely throw things off.
@addaleax ... I'm wondering if we shouldn't at least add a comment to the stream docs advising folks that they need to pay attention to the byteOffset on any buffer they receive.
Closing, as this appears to be a bug in the user code and not in Node.js. The solution is to properly account for byteOffset when slicing from the underlying ArrayBuffer within the user's createReadStream method. If this does not resolve the problem, please comment here and we will re-open and investigate further.
@jasnell
FWIW, there is a much simpler way of doing what you're trying to do here,
No there isn't, as I said, I have a cross-platform scenario with a lot of stuff around it. All stream modules are equal so that the main code can use one common stream interface, and I have to do the stupid string-based encoding stuff for React Native where binary streams are base64 strings.
When I change the zero to byteOffsetnothing changes. When I remove the entire "if" and just use the buffer nothing changes.
THAT IS A BUG IN NODE.JS
if (data.byteOffset !== 0 || data.length < data.buffer.byteLength) {
ondata(data.buffer.slice(data.byteOffset, data.byteOffset + data.length));
} else {
ondata(data.buffer);
}
This fixes your code.
Relevant documentation: https://nodejs.org/dist/latest-v10.x/docs/api/buffer.html#buffer_buf_byteoffset
When setting byteOffset in Buffer.from(ArrayBuffer, byteOffset, length)
or sometimes when allocating a buffer smaller than Buffer.poolSize the
buffer doesn't start from a zero offset on the underlying ArrayBuffer.
This can cause problems when accessing the underlying ArrayBuffer directly
using buf.buffer, as the first bytes in this ArrayBuffer may be unrelated
to the buf object itself.
A common issue is when casting a Buffer object to a TypedArray object, in
this case one needs to specify the byteOffset correctly:
// Create a buffer smaller than `Buffer.poolSize`.
const nodeBuffer = new Buffer.from([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]);
// When casting the Node.js Buffer to an Int8 TypedArray remember to use the
// byteOffset.
new Int8Array(nodeBuffer.buffer, nodeBuffer.byteOffset, nodeBuffer.length);
@addaleax Indeed it does. However, something changed from 10.7.0 to 10.8.0 and I always read the changelog — there was nothing in it that seemed relevant...
@lll000111 I’m not sure – the relevant PR is the one linked above by @jasnell, https://github.com/nodejs/node/pull/21968. It’s one of only two entries in the 10.8.0 changelog that target the fs subsystem, and it’s not something that should typically affect users directly…
'use strict'
const cloneable = require('cloneable-readable')
const fs = require('fs')
const crypto = require('crypto')
const { Transform, pipeline } = require('stream')
const stream = cloneable(fs.createReadStream(__filename))
const hash = crypto.createHash('sha256')
const encoder = new Transform({
transform(chunk, encoding, callback) {
callback(null, chunk.toString('hex'))
}
})
pipeline(stream.clone(), hash, encoder, process.stdout)
pipeline(stream, process.stdout)
@jasnell Is that code about your earlier comment
FWIW, there is a much simpler way of doing what you're trying to do here,
Because I already said a few things about that. I can't use fancy node.js specific things.