Hi,
I have an ETL job to parse & load a large text file and insert the data into mysql database. When script parses the file, it stores the data in a large array. (https://github.com/skilbjo/costs.js/blob/master/src/Vantiv_Import_Stream.js)
I often receive the error: [Error: ER_NET_PACKET_TOO_LARGE: Got a packet bigger than 'max_allowed_packet' bytes] because the data is so large.
In the docs you mentioned:
Additionally you may be interested to know that it is currently not possible
to stream individual row columns, they will always be buffered up entirely.
If you have a good use case for streaming large fields to and from MySQL,
I'd love to get your thoughts and contributions on this.
I think this would be a good use case!
Here is my code, but I still get the same ER_NET_PACKET_TOO_LARGE error:
var data = ['large 100mb+ array']
var sql = 'insert into ' + table +
'(idVantiv, Month, Merchant_Id, Merchant_Descriptor, Network, Qualification_Code, Transaction_Type, '+
' Issuer_Type, Card_Type, Txn_Count, Txn_Amount, ' +
'Interchange) values ?';
query.connection.query(sql, [data])
.on('error', function(err){
console.log('Error: ', err);
})
.on('fields', function(fields){
console.log(fields);
})
.on('result', function(row){
console.log(row);
})
.on('end', function() {
h.connection.end();
});
hi.
maybe you can seperate data into smaller parts and use it with async.
normally a big array
my_big_array = [
[a,b,c],
[d,e,f],
[g,h,j],
[k,l,p]
]
parsed_big_array = [
[[a,b,c],[d,e,f]],
[[g,h,j],[k,l,p]]
]
async.times(parsed_big_array.length, function(n, next){
var data = parsed_big_array [n];
connection.query('INSERT INTO table SET ?', {c1:data }, function(err, result) {
next(null);
});
}, function(err) {
next(err, d2)
});`
@synanhero I could kiss you
It makes sense... instead of trying 1x time to insert 30,000 rows; I'll try 300x times to insert 1,000 rows. A bit more overhead, but scaleable for n number of rows.
rl.on('close', function() {
h.connection.connect();
reduceArray(large_array, function(small_array){
db(small_array);
});
});
var reduceArray = function(arr, cb){
var size = 1000,
small_array = []
;
for (var i=0; i<arr.length; i+=size){
var chunk = arr.slice(i,i+size);
small_array.push(chunk);
};
cb(small_array);
};
/* Insert into SQL */
var db = function(small_array){
async.times(small_array.length, function(n, next){
var data = small_array[n];
h.connection.query(sql, [data], function(err,result){
next(err, result);
});
}, function(err){
console.log(err);
});
}
I've done something similar. I was decoding 1,000,000 rows of data from an Avro file and streaming that out in batches of 1000. This works well, with the exception that the Avro file decodes a LOT faster than the SQL statements can run, meaning I was left with almost the entire result set in memory.
Running 33 insert statements isn't so bad, but I was running 1000 insert statements.
It worked absolutely fine, but I was a little worried I could potentially run out of memory if it got any bigger.
Another option I adopted was to write my data to a csv file, then use the SQL
LOAD DATA INFILE "my/file/path.csv"
INTO TABLE my_table
CHARACTER SET utf8
FIELDS TERMINATED BY ","
LINES TERMINATED BY "\n"
You might find this book I just wrote helpful