Cheerio: Large strings not shown properly

Created on 28 May 2019 · 6 Comments · Source: cheeriojs/cheerio

When I look up the src attribute of an image that contains a base64 string, I expect to get back a very long string (up to 100 KB), but instead cheerio returns a shortened version, which looks something like this:

data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==

I think that's intentional, which I can understand as a way to prevent damage, but I'm aware that I'll get back a very large string. How can I get the unshortened version?

Most helpful comment

Call it boredom, but I tried to make what I suggested earlier.

const cheerio = require('cheerio');
const got = require('got');
const fs = require('fs').promises;

// Desktop Chrome user agent, sent only when a fresh copy of the page is downloaded.
const USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36';

// true: download the page again and overwrite dump.html before parsing.
// false: parse the dump.html saved by an earlier run.
const REFRESH_DUMP = false;
// true: print every image payload found in the script tags.
const LIST_SCRIPT_IMAGES = false;
// true: copy the payloads into the matching <img> tags and drop the script tags.
const REPLACE_PLACEHOLDERS = true;

/**
 * Reads the saved search page, moves the image data that the page keeps in
 * `_setImgSrc(id, data)` script calls into the `<img data-iid="id">` tags,
 * and writes the result to 'dump (images added).html'.
 */
(async () => {
    try {
        // Shopping search variant: 'https://www.google.com/search?tbm=shop&hl=de-de&tbs=vw:l&q=xbox'
        const url = 'https://www.google.com/search?q=xbox&hl=de-de&tbm=isch&sa=X&biw=2560&bih=1311';

        if (REFRESH_DUMP) {
            const response = await got(url, { headers: { 'User-Agent': USER_AGENT } });
            await fs.writeFile('dump.html', response.body, 'utf8');
            console.log('The file was saved!');
        }

        const body = await fs.readFile('dump.html', 'utf8');

        const $ = cheerio.load(body);

        // SECURITY: both modes below eval() script text taken from the downloaded
        // page, so that text runs with full access to this process. Only run this
        // against pages you trust.

        // just get images from script tags
        if (LIST_SCRIPT_IMAGES) {
            // Prototype-less object: ids come from the page, and non-numeric ids
            // must be listed as well.
            const images = Object.create(null);
            $('script:contains("setImgSrc\\(")').each((ix, val) => {
                // The evaluated script calls this function by name, so the
                // identifier has to stay `_setImgSrc`.
                const _setImgSrc = (a, c) => {
                    images[a] = c;
                };
                eval($(val).html());
            });
            Object.values(images).forEach((c) => console.log(c));
        }

        // replace place holders
        if (REPLACE_PLACEHOLDERS) {
            // Prototype-less object so an id such as "constructor" cannot match
            // an inherited property.
            const deferred = Object.create(null);
            // search and keep defer images
            $('img[data-iid]').each((ix, val) => {
                deferred[$(val).attr('data-iid')] = val;
            });

            // evaluate script
            $('script:contains("setImgSrc\\(")').each((ix, val) => {
                // Called by name from the evaluated script text (see above).
                const _setImgSrc = (a, c) => {
                    const target = deferred[a];
                    if (target) {
                        $(target).removeAttr('data-iid').attr('src', c);
                    }
                };
                // Take the script tag out of the document, then run its content.
                eval($(val).remove().html());
            });
        }

        // turn it all into html again
        const html = $.html();
        await fs.writeFile('dump (images added).html', html, 'utf8');
        console.log('The file was saved!');
    } catch (error) {
        // Report on stderr and signal the failure to the calling shell.
        console.error(error);
        process.exitCode = 1;
    }
})();

All 6 comments

I want to get article images from Google Shopping. For this I am using cheerio and Node.js. If I browse to a search for an Xbox, for example (https://www.google.com/search?tbm=shop&hl=de-de&tbs=vw:l&q=xbox), and inspect the image in the developer tools, I get a very long base64 string. But when doing the same in cheerio I get a string that looks something like this:

data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==

Furthermore, when opening it in Chrome it gets encoded as WebP.

I already tried changing my User-Agent with the following code:

// Create a request instance via request.defaults() that is configured with a
// desktop Chrome User-Agent header.
var customHeaderRequest = request.defaults({
    headers: {'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'},
});

But I still get shortened image data. What else could I try to get the whole image string with cheerio?

Edit: How to reproduce the error:

First you have to create a custom header request with the given user agent. Then you pass it the URL mentioned above (Xbox Google Shopping); this should return a body variable containing the HTML code, which you can pass on to cheerio (I also built in a little code to save a dump of the HTML to see how the server responds in contrast to how cheerio processes it). It should then look for the given classes, which include the image with the base64 data. On the console you should now see the malformed data.

const cheerio = require('cheerio');
const request = require('request');
const fs = require('fs');


// Request instance configured with a desktop Chrome User-Agent header.
const customHeaderRequest = request.defaults({
    headers: {
        'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    },
});

const url = 'https://www.google.com/search?tbm=shop&hl=de-de&tbs=vw:l&q=xbox';

// Download the search page, save a copy to dump.html and print the src
// attribute of every result image that cheerio finds in it.
customHeaderRequest.get(url, (err, resp, body) => {
    // Without a response there is nothing to dump or parse.
    if (err) {
        console.error(err);
        return;
    }

    // write the downloaded body to a file for further inspection
    fs.writeFile("dump.html", body, (writeErr) => {
        if (writeErr) {
            // Report the failure only; the success message belongs below.
            console.error(writeErr);
            return;
        }

        console.log("The file was saved!");
    });

    const $ = cheerio.load(body);

    $('.sh-dlr__list-result .sh-dlr__content').each((index, value) => {
        // image
        $(value).find('.TL92Hc').each((idx, ele) => {
            console.log($(ele).attr('src'));
        });
    });
});

If I execute this script it gives me the following output:

C:\mydir>node reproduce_error.js
data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==
data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==
data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==
data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==
data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==
data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==
data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==
data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==
data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==
data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==
data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==
data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==
data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==
data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==

@maxkraft7 I'm having the same issue, did you find a solution?

It would be great to have some markup that has this issue. Right now reproducing the issue is tough.

I loaded some source from Google and discovered it is probably not cheerio's fault. dump.html

Google uses a little trick there: when the page is loaded from their site, the image tags are filled with those placeholders (seen above). Later, when the browser has finished rendering the page, it uses JavaScript to replace those placeholders with the real images. This actually helps render pages quicker.

It may be confusing: when you look at the source in the browser you see big pictures, but that is because the browser has already replaced the images. Sadly, cheerio can only read the values that are in the image tags at load time.

Interestingly, the image data is loaded with the page, but it is stored in script tags, like so:

<script nonce="6yvz7b/n1cGMHtmd5Ot8FA">_setImgSrc('0','data:image\/jpeg;base64,\/9j\/4AAQSkZJRgABA
...
AAAAAAAAAAoAAAAAAAAAAAAAAAP\/\/Z');</script>

You could actually find those script tags and extract the data from them.

@5saviahv Thanks for digging deeper!

Closing the ticket as out of scope.

Call it boredom, but I tried to make what I suggested earlier.

const cheerio = require('cheerio');
const got = require('got');
const fs = require('fs').promises;

// Desktop Chrome user agent, sent only when a fresh copy of the page is downloaded.
const USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36';

// true: download the page again and overwrite dump.html before parsing.
// false: parse the dump.html saved by an earlier run.
const REFRESH_DUMP = false;
// true: print every image payload found in the script tags.
const LIST_SCRIPT_IMAGES = false;
// true: copy the payloads into the matching <img> tags and drop the script tags.
const REPLACE_PLACEHOLDERS = true;

/**
 * Reads the saved search page, moves the image data that the page keeps in
 * `_setImgSrc(id, data)` script calls into the `<img data-iid="id">` tags,
 * and writes the result to 'dump (images added).html'.
 */
(async () => {
    try {
        // Shopping search variant: 'https://www.google.com/search?tbm=shop&hl=de-de&tbs=vw:l&q=xbox'
        const url = 'https://www.google.com/search?q=xbox&hl=de-de&tbm=isch&sa=X&biw=2560&bih=1311';

        if (REFRESH_DUMP) {
            const response = await got(url, { headers: { 'User-Agent': USER_AGENT } });
            await fs.writeFile('dump.html', response.body, 'utf8');
            console.log('The file was saved!');
        }

        const body = await fs.readFile('dump.html', 'utf8');

        const $ = cheerio.load(body);

        // SECURITY: both modes below eval() script text taken from the downloaded
        // page, so that text runs with full access to this process. Only run this
        // against pages you trust.

        // just get images from script tags
        if (LIST_SCRIPT_IMAGES) {
            // Prototype-less object: ids come from the page, and non-numeric ids
            // must be listed as well.
            const images = Object.create(null);
            $('script:contains("setImgSrc\\(")').each((ix, val) => {
                // The evaluated script calls this function by name, so the
                // identifier has to stay `_setImgSrc`.
                const _setImgSrc = (a, c) => {
                    images[a] = c;
                };
                eval($(val).html());
            });
            Object.values(images).forEach((c) => console.log(c));
        }

        // replace place holders
        if (REPLACE_PLACEHOLDERS) {
            // Prototype-less object so an id such as "constructor" cannot match
            // an inherited property.
            const deferred = Object.create(null);
            // search and keep defer images
            $('img[data-iid]').each((ix, val) => {
                deferred[$(val).attr('data-iid')] = val;
            });

            // evaluate script
            $('script:contains("setImgSrc\\(")').each((ix, val) => {
                // Called by name from the evaluated script text (see above).
                const _setImgSrc = (a, c) => {
                    const target = deferred[a];
                    if (target) {
                        $(target).removeAttr('data-iid').attr('src', c);
                    }
                };
                // Take the script tag out of the document, then run its content.
                eval($(val).remove().html());
            });
        }

        // turn it all into html again
        const html = $.html();
        await fs.writeFile('dump (images added).html', html, 'utf8');
        console.log('The file was saved!');
    } catch (error) {
        // Report on stderr and signal the failure to the calling shell.
        console.error(error);
        process.exitCode = 1;
    }
})();

Was this page helpful?
0 / 5 - 0 ratings

Related issues

rajkumarpb picture rajkumarpb · 3 Comments

chenweiyj picture chenweiyj · 5 Comments

dandv picture dandv · 5 Comments

collegepinger picture collegepinger · 3 Comments

becush picture becush · 3 Comments