Json: How to solve large json file?

Created on 21 Jan 2018  ·  12 Comments  ·  Source: nlohmann/json

My JSON file contains an array of approximately ten million elements.
ucrtbased.dll!0f5160d0() Unknown
[Frames below may be incorrect and/or missing, no symbols loaded for ucrtbased.dll]
[External Code]

WenKuInfoProcess.exe!std::_Allocate(unsigned int _Count, unsigned int _Sz, bool _Try_aligned_allocation) Line 87 C++
WenKuInfoProcess.exe!std::allocator::allocate(unsigned int _Count) Line 828 C++
WenKuInfoProcess.exe!std::_Wrap_alloc >::allocate(unsigned int _Count) Line 1078 C++
WenKuInfoProcess.exe!std::_String_alloc > >::_Alloc_proxy() Line 1776 C++
WenKuInfoProcess.exe!std::_String_alloc > >::_String_alloc > > >,void>(std::_Wrap_alloc > && _Al) Line 1731 C++
WenKuInfoProcess.exe!std::basic_string,std::allocator >::basic_string,std::allocator >(std::basic_string,std::allocator > && _Right) Line 2055 C++
WenKuInfoProcess.exe!std::pair,std::allocator > const ,nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> >::pair,std::allocator > const ,nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> >,std::allocator >,nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer>,void,0>(std::basic_string,std::allocator > && _Val1, nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> && _Val2) Line 188 C++
WenKuInfoProcess.exe!std::allocator,std::allocator > const ,nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> >,void *> >::construct,std::allocator > const ,nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> >,std::basic_string,std::allocator >,nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> >(std::pair,std::allocator > const ,nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> > * _Ptr, std::basic_string,std::allocator > && <_Args_0>, nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> && <_Args_1>) Line 840 C++
WenKuInfoProcess.exe!std::allocator_traits,std::allocator > const ,nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> >,void *> > >::construct,std::allocator > const ,nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> >,std::basic_string,std::allocator >,nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> >(std::allocator,std::allocator > const ,nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> >,void *> > & _Al, std::pair,std::allocator > const ,nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> > * _Ptr, std::basic_string,std::allocator > && <_Args_0>, nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> && <_Args_1>) Line 960 C++
WenKuInfoProcess.exe!std::_Wrap_alloc,std::allocator > const ,nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> >,void *> > >::construct,std::allocator > const ,nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> >,std::basic_string,std::allocator >,nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> >(std::pair,std::allocator > const ,nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> > * _Ptr, std::basic_string,std::allocator > && <_Args_0>, nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> && <_Args_1>) Line 1096 C++
WenKuInfoProcess.exe!std::_Tree_comp_alloc,std::allocator >,nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer>,std::less,std::allocator > >,std::allocator,std::allocator > const ,nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> > >,0> >::_Buynode,std::allocator >,nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> >(std::basic_string,std::allocator > && <_Val_0>, nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> && <_Val_1>) Line 902 C++
WenKuInfoProcess.exe!std::_Tree,std::allocator >,nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer>,std::less,std::allocator > >,std::allocator,std::allocator > const ,nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> > >,0> >::emplace,std::allocator >,nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> >(std::basic_string,std::allocator > && <_Val_0>, nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> && <_Val_1>) Line 1084 C++
WenKuInfoProcess.exe!nlohmann::detail::parser,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> >::parse_internal(bool keep, nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> & result) Line 3083 C++
WenKuInfoProcess.exe!nlohmann::detail::parser,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> >::parse_internal(bool keep, nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> & result) Line 3150 C++
WenKuInfoProcess.exe!nlohmann::detail::parser,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> >::parse_internal(bool keep, nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> & result) Line 3076 C++
WenKuInfoProcess.exe!nlohmann::detail::parser,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> >::parse(const bool strict, nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> & result) Line 2941 C++
WenKuInfoProcess.exe!nlohmann::operator>>(std::basic_istream > & i, nlohmann::basic_json,std::allocator >,bool,__int64,unsigned __int64,double,std::allocator,nlohmann::adl_serializer> & j) Line 13150 C++
WenKuInfoProcess.exe!main(int argc, char * * argv) Line 22 C++
[External Code]

proposed fix

Most helpful comment

Example:

#include <iostream>
#include <fstream>
#include "json.hpp"

using json = nlohmann::json;

int main()
{
    std::size_t count = 0;

    auto x = [&count](int depth, json::parse_event_t event, json& parsed) {
        if (event == json::parse_event_t::object_end)
        {
            ++count;
            return false; // do not store the object value
        }
        else
        {
            return true;
        }
    };

    std::ifstream f("wk_file_list.json");
    json::parse(f, x);

    std::cerr << "file has " << count << " elements\n";
}

Output: file has 1992511 elements.

On my machine, this takes less than 3 MB of memory.

All 12 comments

Does this error message occur during parsing?

Yes. The code is simple: it reads the file with std::fstream and passes the data to a json object.

Right now, we only support DOM-like parsing to memory. There is an experimental SAX-like approach, but it has not been merged yet. With such an approach, you may parse and process the input without the need of converting each element to a JSON value and storing it.

Would it be possible to share your input so we could experiment whether the new approach would help?

OK, I will upload the data to Google Drive later and paste the link here. In the end, I chose sed and grep to parse my data.

Thanks!

Thanks. What did you want to do with the file after parsing? I am asking because it is possible to define a callback function to be called during parsing, see https://nlohmann.github.io/json/classnlohmann_1_1basic__json_ab4f78c5f9fd25172eeec84482e03f5b7.html#ab4f78c5f9fd25172eeec84482e03f5b7

With such a function, you can process the input during parsing without the need of actually storing all input data.

Example:

#include <iostream>
#include <fstream>
#include "json.hpp"

using json = nlohmann::json;

int main()
{
    std::size_t count = 0;

    auto x = [&count](int depth, json::parse_event_t event, json& parsed) {
        if (event == json::parse_event_t::object_end)
        {
            ++count;
            return false; // do not store the object value
        }
        else
        {
            return true;
        }
    };

    std::ifstream f("wk_file_list.json");
    json::parse(f, x);

    std::cerr << "file has " << count << " elements\n";
}

Output: file has 1992511 elements.

On my machine, this takes less than 3 MB of memory.

Hi @nlohmann ,
I think I have the same problem. I have to process a very big JSON file to store its content in a MongoDB database. I am talking about an 85 MB file on disk. Can I use that function to avoid memory problems?

The code above is just an example of how to cope with a single array of objects. You may need to adjust it to your needs. Please also have a look at the documentation.

This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions.

A small update on this (as it was mentioned in #971):

  • With the SAX parser, the syntax check takes 3 MB of RAM and about 7 seconds to complete.
  • The new DOM parser takes 12 seconds and 2.7 GB of RAM to read the complete file. In comparison, jq takes 25 seconds and roughly the same amount of memory.
Was this page helpful?
0 / 5 - 0 ratings

Related issues

jmlemetayer picture jmlemetayer  ·  3 Comments

alienzj picture alienzj  ·  4 Comments

moneroexamples picture moneroexamples  ·  4 Comments

Prati369 picture Prati369  ·  4 Comments

mlund picture mlund  ·  4 Comments