Hi! I have to work with some pdf files coming from Amazon S3 as a buffer. I'm using the S3 getObject method, which returns the file as a buffer in the body property:
http://docs.aws.amazon.com/AWSJavaScriptSDK/latest/AWS/S3.html#getObject-property
Body — (Buffer, Typed Array, Blob, String, ReadableStream) Object data.
How can I parse it? I want to get the total number of pages and then split the whole pdf in separate pages.
Thanks!
Many questions here. some are actually about hummus, but i think i can help with all of them:
To download an amazon stream, you can use the getObject method and then based on that create a read stream that you can use as is or pipe to something else (like a file writing stream).
like this:
// S3 client bound to a default bucket, so individual calls only need the Key.
var s3 = new aws.S3({
    params: {
        Bucket: myBucket,
    }
});
// Expose the object body as a readable stream instead of buffering it in memory.
var readStream = s3.getObject({Key:remoteData.data.remoteKey}).createReadStream();
// A stream 'error' with no listener is thrown, so always attach one.
readStream.on('error', function(err){
    console.error('S3 download failed', err);
});
// use as is or pipe to a writable stream like this:
readStream.pipe(targetStream);
// 'end' on the source only means that everything has been read...
readStream.on('end', function(){
    // reading done
});
// ...while 'finish' on the destination means that everything has been written
targetStream.on('finish', function(){
    // writing done
});
For hummus, due to the requirement of random access, you can't use readable streams directly; you'll have to pipe the stream either to a file or into a memory buffer, which can then be used as input to hummus.
here's how you'd pipe the readstream into a file:
// Write the downloaded stream to a local file so it can be opened with random access.
var file = fs.createWriteStream(inTargetFilePath);
readStream.pipe(file);
// 'finish' fires once all piped data has been handed to the file stream.
file.on('finish', function() {
    // close the file before using it; the callback runs once it is closed
    file.close(function() {/* now do whatever you want with the file*/});
});
parsing with hummus and getting things like the number of pages, is explained here.
e.g.:
// Open the PDF for parsing and ask the reader how many pages it has.
var pdfReader = hummus.createReader('./TestMaterials/XObjectContent.PDF');
// The reader method is getPagesCount() (plural), the same call the splitting sample uses.
pdfReader.getPagesCount();
Splitting is done simply by creating a new PDF file per page in the original PDF and copying the original page content to the new page object in the new file. Here's a sample script that you can use with a plain file:
var hummus = require('hummus');
// Declare the reader and writer locally instead of leaking implicit globals.
var pdfReader = hummus.createReader('myfile.pdf');
// The page count does not change while splitting, so query it once.
var pagesCount = pdfReader.getPagesCount();
for(var i=0;i<pagesCount;++i){
    // one output file per source page: output0.pdf, output1.pdf, ...
    var pdfWriter = hummus.createWriter('./output/output' + i + '.pdf');
    // copy page i of the source document into the new single-page document
    pdfWriter.createPDFCopyingContext(pdfReader).appendPDFPageFromPDF(i);
    pdfWriter.end();
}
```js
var fs = require('fs');
/*
PDFRStreamForBuffer is an implementation of a read stream using a supplied array
@author Luciano Júnior
*/
/**
 * Read stream over an in-memory buffer, meant to be used the same way as
 * PDFRStreamForFile.
 *
 * @param {Buffer|Uint8Array} buffer - bytes to read from; its byteLength is
 *   taken as the stream size.
 */
function PDFRStreamForBuffer(buffer){
    this.innerBuffer = buffer;
    this.rposition = 0;
    this.fileSize = buffer.byteLength;
}

/**
 * Reads up to inAmount bytes from the current position and advances past them.
 * The read is clamped to the buffer bounds, so a request that runs past the
 * end yields only the bytes that exist (possibly none) instead of padding the
 * result with undefined entries.
 *
 * @param {number} inAmount - maximum number of bytes to read.
 * @returns {number[]} plain array holding the byte values that were read.
 */
PDFRStreamForBuffer.prototype.read = function(inAmount){
    var start = Math.min(Math.max(this.rposition, 0), this.fileSize);
    var end = Math.min(start + Math.max(inAmount, 0), this.fileSize);
    this.rposition = end;
    // go through Array.prototype so the result is a real Array, not a buffer view
    return Array.prototype.slice.call(this.innerBuffer, start, end);
};

/**
 * @returns {boolean} true while the read position is before the end of the buffer.
 */
PDFRStreamForBuffer.prototype.notEnded = function(){
    return this.rposition < this.fileSize;
};

/**
 * Moves the read position to an absolute offset from the start.
 *
 * @param {number} inPosition - offset from the beginning of the buffer.
 */
PDFRStreamForBuffer.prototype.setPosition = function(inPosition){
    this.rposition = inPosition;
};

/**
 * Moves the read position to an offset counted back from the end.
 *
 * @param {number} inPosition - number of bytes before the end of the buffer.
 */
PDFRStreamForBuffer.prototype.setPositionFromEnd = function(inPosition){
    this.rposition = this.fileSize-inPosition;
};

/**
 * Advances the read position without returning any data.
 *
 * @param {number} inAmount - number of bytes to skip.
 */
PDFRStreamForBuffer.prototype.skip = function(inAmount){
    this.rposition += inAmount;
};

/**
 * @returns {number} current read position, as an offset from the start.
 */
PDFRStreamForBuffer.prototype.getCurrentPosition = function(){
    return this.rposition;
};
module.exports = PDFRStreamForBuffer;
```
Insert this code in a file and use it like a PDFRStreamForFile
@galkahana Could you please give an example how can I use 'memory buffer' that you mentioned instead of a file? I've done a lot of googling/research but still cannot find any constructor for it exported from HummusJs.
you got it here - PDFRStreamForBuffer
This might be helpful for someone: http://stackoverflow.com/questions/42512982/node-js-get-the-first-page-of-pdf-buffer
FYI, I ran into some performance issues with the example provided. It looks like Hummus will read bytes beyond the buffer, so I ended up doing this instead:
read() {
const previousPosition = this.position;
this.position += inAmount;
return [...this.innerBuffer.slice(previousPosition, this.position)];
}
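(If you're not using the latest Node.js or Babel, replace the spread with Array.prototype.slice.call(this.innerBuffer, start, end), using the same start and end positions. Buffer's own slice doesn't return a "true" array, and [].concat(buffer) merely wraps the buffer instead of flattening it, so Hummus will blow up on either.)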
Many questions here. some are actually about hummus, but i think i can help with all of them:
Amazon stream, downloading, using, writing to something that hummus can use
To download an amazon stream, you can use the getObject method and then based on that create a read stream that you can use as is or pipe to something else (like a file writing stream).
like this:
var s3 = new aws.S3({
params: {
Bucket: myBucket,
}
});
var readStream = s3.getObject({Key:remoteData.data.remoteKey}).createReadStream();
// use as is or pipe to a writable stream like this:
readStream.pipe(targetStream);
// can use 'end' event to know when the writing is finished
readStream.on('end', function(){
// writing done
})
For hummus, due to the requirement of random access, you can't use readable streams directly and you'll have to either pipe this to a file, or to a memory buffer that can now be used as input to hummus.
here's how you'd pipe the readstream into a file:
var file = fs.createWriteStream(inTargetFilePath);
readStream.pipe(file);
file.on('finish', function() {
file.close(function() {/* now do whatever you want with the file*/}
});
Parsing with hummus, getting the number of pages
parsing with hummus and getting things like the number of pages, is explained here.
e.g.:
var pdfReader = hummus.createReader('./TestMaterials/XObjectContent.PDF');
pdfReader.getPageCount();
Splitting
Splitting is done simply by creating a new PDF file per page in the original PDF and copying the original page content to the new page object in the new file. Here's a sample script that you can use with a plain file:
var hummus = require('hummus');
pdfReader = hummus.createReader('myfile.pdf');
for(var i=0;i<pdfReader.getPagesCount();++i){
pdfWriter = hummus.createWriter('./output/output' + i + '.pdf')
pdfWriter.createPDFCopyingContext(pdfReader).appendPDFPageFromPDF(i);
pdfWriter.end();
}
Thankyou so much for this code saved my day! God Bless
Most helpful comment
```js
var fs = require('fs');
/*
PDFRStreamForBuffer is an implementation of a read stream using a supplied array
*/
/**
 * Read stream over an in-memory buffer, meant to be used the same way as
 * PDFRStreamForFile.
 *
 * @param {Buffer|Uint8Array} buffer - bytes to read from; its byteLength is
 *   taken as the stream size.
 */
function PDFRStreamForBuffer(buffer){
    this.innerBuffer = buffer;
    this.rposition = 0;
    this.fileSize = buffer.byteLength;
}

/**
 * Reads up to inAmount bytes from the current position and advances past them.
 * The read is clamped to the buffer bounds, so a request that runs past the
 * end yields only the bytes that exist (possibly none).
 *
 * @param {number} inAmount - maximum number of bytes to read.
 * @returns {number[]} plain array holding the byte values that were read.
 */
PDFRStreamForBuffer.prototype.read = function(inAmount){
    var start = Math.min(Math.max(this.rposition, 0), this.fileSize);
    var end = Math.min(start + Math.max(inAmount, 0), this.fileSize);
    this.rposition = end;
    // go through Array.prototype so the result is a real Array, not a buffer view
    return Array.prototype.slice.call(this.innerBuffer, start, end);
};

/**
 * @returns {boolean} true while the read position is before the end of the buffer.
 */
PDFRStreamForBuffer.prototype.notEnded = function(){
    return this.rposition < this.fileSize;
};

/**
 * Moves the read position to an absolute offset from the start.
 *
 * @param {number} inPosition - offset from the beginning of the buffer.
 */
PDFRStreamForBuffer.prototype.setPosition = function(inPosition){
    this.rposition = inPosition;
};

/**
 * Moves the read position to an offset counted back from the end.
 *
 * @param {number} inPosition - number of bytes before the end of the buffer.
 */
PDFRStreamForBuffer.prototype.setPositionFromEnd = function(inPosition){
    this.rposition = this.fileSize-inPosition;
};

/**
 * Advances the read position without returning any data.
 *
 * @param {number} inAmount - number of bytes to skip.
 */
PDFRStreamForBuffer.prototype.skip = function(inAmount){
    this.rposition += inAmount;
};

/**
 * @returns {number} current read position, as an offset from the start.
 */
PDFRStreamForBuffer.prototype.getCurrentPosition = function(){
    return this.rposition;
};
module.exports = PDFRStreamForBuffer;
```
Insert this code in a file and use it like a PDFRStreamForFile