HDFStore is working much better with the latest release, but I am encountering a new error that I wanted to report:
df = get_joined_data(data_prefix, data_rev_prefix, date_prefix, store_filename)
File "XXXXXX", line 739, in get_joined_data
write_dataframe("joined_{0}".format(date_prefix), df, store)
File "XXXXXXX", line 55, in write_dataframe
store[name] = df
File "/Library/Python/2.7/site-packages/pandas/io/pytables.py", line 218, in setitem
self.put(key, value)
File "/Library/Python/2.7/site-packages/pandas/io/pytables.py", line 458, in put
self._write_to_group(key, value, table=table, append=append, *_kwargs)
File "/Library/Python/2.7/site-packages/pandas/io/pytables.py", line 788, in _write_to_group
s.write(obj = value, append=append, complib=complib, *_kwargs)
File "/Library/Python/2.7/site-packages/pandas/io/pytables.py", line 1837, in write
self.write_array('block%d_values' % i, blk.values)
File "/Library/Python/2.7/site-packages/pandas/io/pytables.py", line 1627, in write_array
vlarr.append(value)
File "/Library/Python/2.7/site-packages/tables-2.4.0-py2.7-macosx-10.8-intel.egg/tables/vlarray.py", line 480, in append
self._append(nparr, nobjects)
File "hdf5Extension.pyx", line 1499, in tables.hdf5Extension.VLArray._append (tables/hdf5Extension.c:13764)
OverflowError: value too large to convert to int
Not at all sure this is an actual pandas issue, but thought I would report it nonetheless.
Can you post the DataFrame summary (str(df)) as well as a sample of the data (df.head())?
Sure, it takes a while to build so I'll try and get it done in the next couple of days.
@jostheim is this still an issue?
Closing because I am not sure, and I posted a CSV-based way around this in our other thread.
Hi guys,
I'm having the same issue. Did anyone find a fix ?
Thanks
Well @z00b2008, without any detail it's impossible to know.
Show pd.show_versions(), df.info(), your code, and the error.
Good point, some details:
-- pd.show_versions():
commit: None
python: 2.7.6.final.0
python-bits: 64
OS: Linux
OS-release: 3.13.0-27-generic
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: en_US.UTF-8
LANG: en_US.UTF-8
pandas: 0.13.1
Cython: None
numpy: 1.9.0
scipy: 0.14.0
statsmodels: 0.5.0
IPython: 1.2.1
sphinx: None
patsy: 0.2.1
scikits.timeseries: None
dateutil: 1.5
pytz: 2012c
bottleneck: None
tables: 3.1.1
numexpr: 2.2.2
matplotlib: 1.3.1
openpyxl: 1.7.0
xlrd: 0.9.2
xlwt: 0.7.5
xlsxwriter: None
sqlalchemy: None
lxml: None
bs4: 4.2.1
html5lib: 0.999
bq: None
apiclient: None
-- df.info():
DatetimeIndex: 39461339 entries, 2013-04-08 09:24:30 to 2013-04-08 09:24:00
Data columns (total 22 columns):
start-time datetime64[ns]
end-time datetime64[ns]
duration float64
rtt float64
proto int64
sip object
sp int64
dip object
dp int64
iflags object
uflags object
riflags object
ruflags object
isn object
risn object
tag object
rtag object
pkt float64
oct float64
rpkt int64
roct int64
app int64
dtypes: datetime64[ns](2), float64(4), int64(6), object(10)
my code: data.to_hdf('/tmp/truc.h5', 'df')
-- error:
/usr/lib/python2.7/dist-packages/pandas/io/pytables.pyc in to_hdf(path_or_buf, key, value, mode, complevel, complib, append, **kwargs)
277 with get_store(path_or_buf, mode=mode, complevel=complevel,
278 complib=complib) as store:
--> 279 f(store)
280 else:
281 f(path_or_buf)
/usr/lib/python2.7/dist-packages/pandas/io/pytables.pyc in <lambda>(store)
272 f = lambda store: store.append(key, value, **kwargs)
273 else:
--> 274 f = lambda store: store.put(key, value, **kwargs)
275
276 if isinstance(path_or_buf, compat.string_types):
/usr/lib/python2.7/dist-packages/pandas/io/pytables.pyc in put(self, key, value, format, append, **kwargs)
819 format = get_option("io.hdf.default_format") or 'fixed'
820 kwargs = self._validate_format(format, kwargs)
--> 821 self._write_to_group(key, value, append=append, **kwargs)
822
823 def remove(self, key, where=None, start=None, stop=None):
/usr/lib/python2.7/dist-packages/pandas/io/pytables.pyc in _write_to_group(self, key, value, format, index, append, complib, encoding, **kwargs)
1271
1272 # write the object
-> 1273 s.write(obj=value, append=append, complib=complib, **kwargs)
1274
1275 if s.is_table and index:
/usr/lib/python2.7/dist-packages/pandas/io/pytables.pyc in write(self, obj, **kwargs)
2677 blk = data.blocks[i]
2678 # I have no idea why, but writing values before items fixed #2299
-> 2679 self.write_array('block%d_values' % i, blk.values, items=blk.items)
2680 self.write_index('block%d_items' % i, blk.items)
2681
/usr/lib/python2.7/dist-packages/pandas/io/pytables.pyc in write_array(self, key, value, items)
2448 vlarr = self._handle.createVLArray(self.group, key,
2449 _tables().ObjectAtom())
-> 2450 vlarr.append(value)
2451 else:
2452 if empty_array:
/usr/lib/python2.7/dist-packages/tables/vlarray.pyc in append(self, sequence)
534 nparr = None
535
--> 536 self._append(nparr, nobjects)
537 self.nrows += 1
538
/usr/lib/python2.7/dist-packages/tables/hdf5extension.so in tables.hdf5extension.VLArray._append (tables/hdf5extension.c:18234)()
OverflowError: value too large to convert to int
Notes:
works fine with only the following warning:
/usr/lib/python2.7/dist-packages/pandas/io/pytables.py:2446: PerformanceWarning:
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->values] [items->None]
warnings.warn(ws, PerformanceWarning)
/usr/lib/python2.7/dist-packages/pandas/io/pytables.py:2446: PerformanceWarning:
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block2_values] [items->['riflags', 'ruflags', 'isn', 'risn', 'tag', 'rtag']]
warnings.warn(ws, PerformanceWarning)
By the way, does this kind of warning suggest that I'm storing the values of these columns wrongly? They're supposed to be strings.
Some of your object columns actually contain integers AND strings. This is not allowed (and hence the error). Aside from being very inefficient, it probably confuses the writer, which expects uniform types (or strings), NOT actual objects; it ends up pickling them.
So clean/eliminate that data and you should be good.
You probably always want to store with format='table' if you actually want to query. See the docs: http://pandas.pydata.org/pandas-docs/stable/io.html#table-format
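A minimal sketch of that kind of cleanup (not from this thread): it assumes df is the frame being written, and coercing mixed object columns to str is just one possible choice.

def coerce_mixed_object_columns(df):
    # Find object columns holding more than one Python type (e.g. int and str)
    # and coerce them to plain strings so PyTables does not fall back to pickling.
    for col in df.columns:
        if df[col].dtype == object:
            kinds = df[col].dropna().map(type).unique()
            if len(kinds) > 1:
                print('mixed types in %s: %s' % (col, kinds))
                df[col] = df[col].astype(str)
    return df

# df = coerce_mixed_object_columns(df)
# df.to_hdf('data.h5', 'df', format='table')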
Hi !
Thanks for your answers.
I tried converting all my columns which might contain mixed types to strings by doing this:
for col in df.columns:
    if df[col].dtype == object:
        df[col] = df[col].astype(str)
It seems to work, as I can do something like:
df[df.columns[10:]].to_hdf('/tmp/truc.h5', 'df')
df[df.columns[:10]].to_hdf('/tmp/truc.h5', 'df')
without getting the warnings I was getting before.
However, I still cannot save the whole dataframe to one file.
I still get the same error (OverflowError: value too large to convert to int)
Any idea ?
so do something like this:
chunksize = 100
for i in range(0, len(df), chunksize):
    chunk = df.iloc[i:min(i + chunksize, len(df))]
    try:
        chunk.to_hdf('....', 'df', mode='w')
    except Exception as e:
        print e
        print chunk
        print chunk.info()
to figure out what is causing the error, then post it here
Hi !
I ran the following code (a slight modification of the one you gave me):
chunk_size = 100
start = 0
end = chunk_size - 1
while end < df.shape[0]:
    chunk = df.iloc[start:end]
    try:
        chunk.to_hdf('/tmp/truc.h5', 'df', mode='w')
    except Exception as e:
        print e
        print chunk
        print chunk.info()
    start += chunk_size
    end += chunk_size
This ran fine with no warning, no error.
Hi guys !
Any idea ?
Thanks a lot.
Hi, I also get the same error. Any news on this? Why is this issue closed?
This issue is closed because the OP and subsequent posters didn't provide enough info to even see if there was a problem. If you have an issue, provide pd.show_versions(), df.info(), your code, and the full error.
I believe I gave all the info you asked for except maybe the real data.
However, you have to realize that sharing potentially sensitive data online is not always an option.
Pandas is great, and it's only natural that it gets used in corporate software, with the constraints that implies for data privacy.
I believe three people have hit the same issue on completely different datasets, so it seems there is indeed an issue here.
If someone else can afford to provide the complete dataset on which the problem occurs, that would be great. I simply cannot.
Thanks anyway !
@z00b2008
last comment from you is it ran fine with no error
I still have no idea what your data looks like
if you cannot provide a sample then I cannot help you
I get that sometimes you cannot provide data and so must ask obliquely
but then you cannot always expect solutions
@z00b2008 and you didn't report whether using table format fixed the issue
with a large data set you generally should use table format
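For reference, a rough sketch of chunked writing with the table format; the file name and chunk size are placeholders, and df stands in for the large frame.

import pandas as pd

chunksize = 1000000  # placeholder chunk size
store = pd.HDFStore('big.h5', mode='w')  # placeholder file name
for start in range(0, len(df), chunksize):
    # HDFStore.append uses the table format, which supports incremental writes and querying
    store.append('df', df.iloc[start:start + chunksize])
store.close()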
Well, that's right. My last comment suggests that storing chunks of the data (having corrected the mixed types as you suggested) runs fine. But storing the whole dataset still gives the same error.
I can unfortunately not provide even a sample.
However, I could try to reproduce the error with artificial data of similar size, types and value range.
I believe I got the same issue with the table format, but it's been a while so I could be wrong. Let me check that too and get back to you.
Any solution for this issue so far?
I'm having the same problem with a 167M rows df.
Code resulting the error:
df.to_hdf('df_1M_sorted_with_datetime.h5','table')
In [52]: df.info()
Int64Index: 167788827 entries, 163244803 to 5459812
Data columns (total 5 columns):
user_id object
latitude float64
longitude float64
location_id int64
created_at datetime64[ns]
dtypes: datetime64[ns](1), float64(2), int64(1), object(1)
memory usage: 7.5+ GB
commit: None
python: 2.7.6.final.0
python-bits: 64
OS: Linux
OS-release: 3.13.0-44-generic
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8
pandas: 0.15.2
nose: 1.3.4
Cython: 0.21.2
numpy: 1.9.1
scipy: 0.13.3
statsmodels: None
IPython: 1.2.1
sphinx: None
patsy: None
dateutil: 2.4.0
pytz: 2014.10
bottleneck: None
tables: 3.1.1
numexpr: 2.4
matplotlib: 1.4.2
openpyxl: None
xlrd: None
xlwt: None
xlsxwriter: None
lxml: None
bs4: None
html5lib: 0.999
httplib2: None
apiclient: None
rpy2: None
sqlalchemy: None
pymysql: None
psycopg2: None
Thanks,
I am having the same problem as well on the most recent version of pandas. This is on a dataset from a Kaggle competition. It can be found at https://www.kaggle.com/c/avito-context-ad-clicks/download/AdsInfo.tsv.7z, but you need to join and agree to the competition rules in order to download it.
(BTW, I really like the following comment:
2812 # I have no idea why, but writing values before items fixed #2299 )
Here is a dump of the relevant info:
In [30]: pd.show_versions()
commit: None
python: 3.4.3.final.0
python-bits: 64
OS: Linux
OS-release: 3.13.0-53-generic
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8
pandas: 0.16.2
nose: 1.3.4
Cython: 0.22
numpy: 1.9.2
scipy: 0.15.1
statsmodels: 0.6.1
IPython: 3.0.0
sphinx: 1.2.3
patsy: 0.3.0
dateutil: 2.4.2
pytz: 2015.4
bottleneck: None
tables: 3.1.1
numexpr: 2.3.1
matplotlib: 1.4.3
openpyxl: 1.8.5
xlrd: 0.9.3
xlwt: None
xlsxwriter: 0.6.7
lxml: 3.4.2
bs4: 4.3.2
html5lib: None
httplib2: None
apiclient: None
sqlalchemy: 0.9.9
pymysql: None
psycopg2: None
In [32]: adsdf = pd.read_csv('AdsInfo.tsv', sep='\t')
In [33]: adsdf.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 36893298 entries, 0 to 36893297
Data columns (total 7 columns):
AdID int64
LocationID float64
CategoryID float64
Params object
Price float64
Title object
IsContext int64
dtypes: float64(3), int64(2), object(2)
memory usage: 2.2+ GB
In [39]: store = pd.HDFStore('Adinfo.hdf')
In [40]: store['Adinfo_df'] = adsdf
/home/ubuntu/anaconda3/lib/python3.4/site-packages/pandas/io/pytables.py:2577: PerformanceWarning:
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->['Params', 'Title']]
warnings.warn(ws, PerformanceWarning)
---------------------------------------------------------------------------
OverflowError Traceback (most recent call last)
<ipython-input-36-852b1be8b621> in <module>()
----> 1 store['AddInfo_df'] = adsdf
/home/ubuntu/anaconda3/lib/python3.4/site-packages/pandas/io/pytables.py in __setitem__(self, key, value)
419
420 def __setitem__(self, key, value):
--> 421 self.put(key, value)
422
423 def __delitem__(self, key):
/home/ubuntu/anaconda3/lib/python3.4/site-packages/pandas/io/pytables.py in put(self, key, value, format, append, **kwargs)
817 format = get_option("io.hdf.default_format") or 'fixed'
818 kwargs = self._validate_format(format, kwargs)
--> 819 self._write_to_group(key, value, append=append, **kwargs)
820
821 def remove(self, key, where=None, start=None, stop=None):
/home/ubuntu/anaconda3/lib/python3.4/site-packages/pandas/io/pytables.py in _write_to_group(self, key, value, format, index, append, complib, encoding, **kwargs)
1266
1267 # write the object
-> 1268 s.write(obj=value, append=append, complib=complib, **kwargs)
1269
1270 if s.is_table and index:
/home/ubuntu/anaconda3/lib/python3.4/site-packages/pandas/io/pytables.py in write(self, obj, **kwargs)
2812 # I have no idea why, but writing values before items fixed #2299
2813 blk_items = data.items.take(blk.mgr_locs)
-> 2814 self.write_array('block%d_values' % i, blk.values, items=blk_items)
2815 self.write_index('block%d_items' % i, blk_items)
2816
/home/ubuntu/anaconda3/lib/python3.4/site-packages/pandas/io/pytables.py in write_array(self, key, value, items)
2579 vlarr = self._handle.create_vlarray(self.group, key,
2580 _tables().ObjectAtom())
-> 2581 vlarr.append(value)
2582 else:
2583 if empty_array:
/home/ubuntu/anaconda3/lib/python3.4/site-packages/tables/vlarray.py in append(self, sequence)
534 nparr = None
535
--> 536 self._append(nparr, nobjects)
537 self.nrows += 1
538
hdf5extension.pyx in tables.hdf5extension.VLArray._append (tables/hdf5extension.c:17614)()
OverflowError: value too large to convert to int
@macd the PerformanceWarning is the clue. You have actual python objects in your data (in the specified columns), and NOT strings. These are pickled. This is overflowing and cannot work. You should NOT store objects in general like this.
Thanks. I tried to parse as strings (not this example), but was still getting objects. It wasn't clear (at least to me) from the docs how to accomplish this. I had tried 'Params':'string_', which didn't work.
No, strings are stored as object dtype. But that's NOT what the warning is about; you have actual python objects in there, e.g. something like:
In [1]: df = DataFrame({'A' : [1,'5', 0.0]})
In [2]: df
Out[2]:
A
0 1
1 5
2 0
In [3]: df.dtypes
Out[3]:
A object
dtype: object
In [4]: df.loc[0,'A']
Out[4]: 1
In [5]: df.loc[1,'A']
Out[5]: '5'
In [6]: df.loc[2,'A']
Out[6]: 0.0
try .convert_objects(convert_numeric=True) on that column
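(For readers on newer pandas: .convert_objects() was later deprecated. A roughly equivalent, hedged sketch using pd.to_numeric on the example column above:)

import pandas as pd

# Coerce the mixed object column 'A' to numeric where possible; values that cannot
# be parsed (e.g. genuine strings) become NaN.
df['A'] = pd.to_numeric(df['A'], errors='coerce')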
Thanks for the tip, but it didn't work. There are two columns that I want to be strings. One is dict-like, and that is the one that is problematic. If I delete it, I can write out OK (but I still get the performance warning for the other column) and it reads back in OK too. I understand strings can have a negative performance impact, but it doesn't look like .convert_objects() can serialize a compound object back into a string once it has already been parsed that way.
Strings are just fine. You are trying to store an object. This simply won't work, and is in general not recommended inside a DataFrame. Use base types (floats, ints, datetimes, strings). Storing a dict should be done in another way, e.g. in another table.
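A sketch of one way to follow that advice for the data above: serialize the dict-like Params column to JSON strings, or drop it from the HDF5 file entirely. The file and key names are placeholders.

import json

# Option 1: turn the dict-like values in 'Params' into JSON strings so the column holds only str.
adsdf['Params'] = adsdf['Params'].apply(
    lambda v: json.dumps(v) if isinstance(v, dict) else v)

# Option 2: leave the complex column out of the HDF5 file and store the rest as a queryable table.
adsdf.drop('Params', axis=1).to_hdf('adsinfo.h5', 'ads', format='table')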
Hey guys,
I've got the same problem. I am thinking this might be caused by my data having missing values in some columns; a missing value can turn a column into 'object' dtype (and this may depend on your data source: SQL, csv, etc.). Did anyone try specifying dtypes for all columns? It might work - at least it seems to work in my case.
Thanks.
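A minimal sketch of that suggestion for CSV input, reusing the column names from the earlier 167M-row example; the file name and the exact dtypes are guesses.

import pandas as pd

# Declare a dtype for every column up front so missing values do not silently push
# numeric columns into object dtype.
df = pd.read_csv('checkins.csv',
                 dtype={'user_id': str, 'latitude': 'float64',
                        'longitude': 'float64', 'location_id': 'int64'},
                 parse_dates=['created_at'])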
I also just ran into this issue today trying to save a df I'd just fetched from Redshift. I'd run this same load_data_from_redshift query quite a few times, but this is the first time it's thrown this error. I just re-ran, and encountered the same error again.
I was able to find the column causing the issue! It's a column with JSON objects, in my case.
Sorry I can't give too many more details; this is for my job, using the company's proprietary data. But if there's any info I can provide that helps with debugging, I'm happy to provide as much as I can.
To start, here are the results of pd.show_versions()
INSTALLED VERSIONS
------------------
commit: None
python: 2.7.12.final.0
python-bits: 64
OS: Linux
OS-release: 4.4.0-57-generic
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8
LOCALE: None.None
pandas: 0.19.2
nose: None
pip: 9.0.1
setuptools: 20.7.0
Cython: None
numpy: 1.12.0
scipy: 0.18.1
statsmodels: 0.6.1
xarray: None
IPython: 5.2.1
sphinx: None
patsy: 0.4.1
dateutil: 2.6.0
pytz: 2016.10
blosc: None
bottleneck: None
tables: 3.3.0
numexpr: 2.6.1
matplotlib: None
openpyxl: None
xlrd: None
xlwt: None
xlsxwriter: None
lxml: None
bs4: None
html5lib: 0.9999999
httplib2: None
apiclient: None
sqlalchemy: 1.1.4
pymysql: None
psycopg2: 2.6.2 (dt dec pq3 ext lo64)
jinja2: 2.9.5
boto: None
pandas_datareader: None
@ClimbsRocks you will get one of two effects, as seen below. If it's a fixed format, your data will be pickled. If it's a table format, it will raise.
In [1]: df = DataFrame({'A': [1,2,3], 'B': [{'foo':1, 'bar': {'a':1, 'b':2}},np.nan,np.nan]})
In [2]: df
Out[2]:
A B
0 1 {'bar': {'a': 1, 'b': 2}, 'foo': 1}
1 2 NaN
2 3 NaN
In [3]: df.dtypes
Out[3]:
A int64
B object
dtype: object
In [4]: df.to_hdf('test.h5','df')
/Users/jreback/pandas/pandas/core/generic.py:1216: PerformanceWarning:
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->['B']]
return pytables.to_hdf(path_or_buf, key, self, **kwargs)
In [6]: df.to_hdf('test3.h5','df', format='table')
TypeError: Cannot serialize the column [B] because
its data contents are [mixed] object dtype
Thanks for the comment @jreback !
The odd part is, I _am_ using format='fixed'. And, much like the OP, the error that's thrown is not a TypeError, but OverflowError: value too large to convert to int.
Here's the full stack trace:
Traceback (most recent call last):
File "train_model.py", line 51, in <module>
results = utils.load_data_from_redshift()
File "/home/preston/project_name/utils.py", line 64, in load_data_from_redshift
results.to_hdf(file_name, key='results', format='fixed', mode='w', append=False)
File "/usr/local/lib/python2.7/dist-packages/pandas/core/generic.py", line 1138, in to_hdf
return pytables.to_hdf(path_or_buf, key, self, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/pandas/io/pytables.py", line 271, in to_hdf
f(store)
File "/usr/local/lib/python2.7/dist-packages/pandas/io/pytables.py", line 265, in <lambda>
f = lambda store: store.put(key, value, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/pandas/io/pytables.py", line 874, in put
self._write_to_group(key, value, append=append, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/pandas/io/pytables.py", line 1316, in _write_to_group
s.write(obj=value, append=append, complib=complib, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/pandas/io/pytables.py", line 2878, in write
self.write_array('block%d_values' % i, blk.values, items=blk_items)
File "/usr/local/lib/python2.7/dist-packages/pandas/io/pytables.py", line 2646, in write_array
vlarr.append(value)
File "/home/preston/.local/lib/python2.7/site-packages/tables/vlarray.py", line 537, in append
self._append(nparr, nobjects)
File "tables/hdf5extension.pyx", line 1975, in tables.hdf5extension.VLArray._append (tables/hdf5extension.c:21848)
OverflowError: value too large to convert to int
@ClimbsRocks you are in charge of your dtypes. Not really sure what your issue is.
I tried just deleting columns until it eventually wrote to file, and found that the final column I deleted before it stopped throwing that OverflowError was a column of json objects. I'm digging in now to try to figure out how to save the data, so if you have any thoughts, I would love to hear them!
To follow up here: I do still think this is a bug in pandas. The column is marked as dtype object (automatically determined by pandas as a result of the sql query). The way I was eventually able to write this to file was to split this column out into its own df, and separately save that df to file. Note that I made no modifications to the data, simply saving that column into its own df.
This could be caused by having quite a few dtype object columns. The one I ended up saving as its own df was simply the largest of all the dtype object columns, so it might be something as simple as just trying to save too many characters at a time. It's also worth noting for anyone trying to debug this in the future that the column I eventually saved as its own df is a json object.
Best of luck for anyone trying to fix this!
For anyone else facing this issue, it's likely caused by a json field. I just ran into this issue again, and fixed it by removing a different json field.
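A sketch of that workaround: pull the JSON column into its own frame and write the two pieces to separate files. Here 'results' is the frame from the Redshift query; 'json_col' and the file names are placeholders.

# Split the problematic JSON column into its own DataFrame and write the pieces separately.
json_df = results[['json_col']].copy()
rest_df = results.drop('json_col', axis=1)

rest_df.to_hdf('results.h5', 'results', format='fixed', mode='w')
json_df.to_hdf('results_json.h5', 'results_json', format='fixed', mode='w')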
I have an example of this problem which is definitely NOT coming from writing a complex python object that can't be matched up with a ctype (which is what Jeff Reback was saying is the root of the problems above)
I have a large dataset which includes a categorical variable where every level of that variable is a string. Removing that categorical variable allows .to_hdf() to function as expected; when the variable is present, it results in an error that's consistent with the various examples shown above.
pd.show_versions()
allData.info()

# problem seems to be GroupDayofWeek, a categorical variable combining two separate levels in the data
print('The problem variable is composed of str type only!')
for abit in allData['GroupDayofWeek'].unique():
    print abit, type(abit)

jj = allData.copy()
del jj['GroupDayofWeek']
print('writing without GroupDayofWeek')
jj.to_hdf('testme.hdf', 'allData')
print('Everything OK! Now writing with GroupDayofWeek')
jj['GroupDayofWeek'] = allData['GroupDayofWeek']
jj.to_hdf('testme.hdf', 'allData')
INSTALLED VERSIONS
------------------
commit: None
python: 2.7.9.final.0
python-bits: 64
OS: Linux
OS-release: 3.16.0-4-amd64
machine: x86_64
processor:
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8
LOCALE: None.None
pandas: 0.19.2
nose: 1.3.4
pip: 9.0.1
setuptools: 36.6.0
Cython: 0.21.1
numpy: 1.13.3
scipy: 1.0.0
statsmodels: 0.4.2
xarray: None
IPython: 5.3.0
sphinx: 1.5.5
patsy: 0.3.0
dateutil: 2.6.1
pytz: 2017.3
blosc: None
bottleneck: None
tables: 3.1.1
numexpr: 2.4
matplotlib: 2.1.0
openpyxl: 2.4.7
xlrd: 0.9.2
xlwt: 0.7.5
xlsxwriter: None
lxml: 3.4.0
bs4: None
html5lib: 0.999999999
httplib2: 0.9
apiclient: None
sqlalchemy: 0.9.8
pymysql: None
psycopg2: None
jinja2: 2.9.6
boto: 2.45.0
pandas_datareader: None
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 17952869 entries, 2017-04-14 12:44:00 to 2016-06-02 14:25:00
Data columns (total 16 columns):
Off-Wrist Status int64
Activity float64
Marker float64
White Light float64
Red Light float64
Green Light float64
Blue Light float64
Sleep/Wake float64
Interval Status object
UID object
Group object
DayofWeek int32
Date object
GroupDayofWeek object
GroupDayType object
Weekend/Holiday bool
dtypes: bool(1), float64(7), int32(1), int64(1), object(6)
memory usage: 2.1+ GB
The problem variable is composed of str type only!
Franklin 2017 Fri <type 'str'>
Franklin 2017 Sat <type 'str'>
Franklin 2017 Sun <type 'str'>
Franklin 2017 Mon <type 'str'>
Franklin 2017 Tue <type 'str'>
Franklin 2017 Wed <type 'str'>
Franklin 2017 Thu <type 'str'>
Roosevelt 2017 Wed <type 'str'>
Roosevelt 2017 Thu <type 'str'>
Roosevelt 2017 Fri <type 'str'>
Roosevelt 2017 Sat <type 'str'>
Roosevelt 2017 Sun <type 'str'>
Roosevelt 2017 Mon <type 'str'>
Roosevelt 2017 Tue <type 'str'>
Roosevelt 2016 Wed <type 'str'>
Roosevelt 2016 Thu <type 'str'>
Roosevelt 2016 Fri <type 'str'>
Roosevelt 2016 Sat <type 'str'>
Roosevelt 2016 Sun <type 'str'>
Roosevelt 2016 Mon <type 'str'>
Roosevelt 2016 Tue <type 'str'>
Franklin 2016 Mon <type 'str'>
Franklin 2016 Tue <type 'str'>
Franklin 2016 Wed <type 'str'>
Franklin 2016 Thu <type 'str'>
Franklin 2016 Fri <type 'str'>
Franklin 2016 Sat <type 'str'>
Franklin 2016 Sun <type 'str'>
writing without GroupDayofWeek
Everything OK! Now writing with GroupDayofWeek
/usr/local/lib/python2.7/dist-packages/pandas/core/generic.py:1138: PerformanceWarning:
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block4_values] [items->['Interval Status', 'UID', 'Group', 'Date', 'GroupDayType', 'GroupDayofWeek']]
return pytables.to_hdf(path_or_buf, key, self, **kwargs)
---------------------------------------------------------------------------
OverflowError Traceback (most recent call last)
<ipython-input-33-cafe787c9e79> in <module>()
10 print('Everything OK! Now writing with GroupDayofWeek')
11 jj['GroupDayofWeek']=allData['GroupDayofWeek']
---> 12 jj.to_hdf('testme.hdf','allData')
/usr/local/lib/python2.7/dist-packages/pandas/core/generic.pyc in to_hdf(self, path_or_buf, key, **kwargs)
1136
1137 from pandas.io import pytables
-> 1138 return pytables.to_hdf(path_or_buf, key, self, **kwargs)
1139
1140 def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):
/usr/local/lib/python2.7/dist-packages/pandas/io/pytables.pyc in to_hdf(path_or_buf, key, value, mode, complevel, complib, append, **kwargs)
268 with HDFStore(path_or_buf, mode=mode, complevel=complevel,
269 complib=complib) as store:
--> 270 f(store)
271 else:
272 f(path_or_buf)
/usr/local/lib/python2.7/dist-packages/pandas/io/pytables.pyc in <lambda>(store)
262 f = lambda store: store.append(key, value, **kwargs)
263 else:
--> 264 f = lambda store: store.put(key, value, **kwargs)
265
266 path_or_buf = _stringify_path(path_or_buf)
/usr/local/lib/python2.7/dist-packages/pandas/io/pytables.pyc in put(self, key, value, format, append, **kwargs)
871 format = get_option("io.hdf.default_format") or 'fixed'
872 kwargs = self._validate_format(format, kwargs)
--> 873 self._write_to_group(key, value, append=append, **kwargs)
874
875 def remove(self, key, where=None, start=None, stop=None):
/usr/local/lib/python2.7/dist-packages/pandas/io/pytables.pyc in _write_to_group(self, key, value, format, index, append, complib, encoding, **kwargs)
1313
1314 # write the object
-> 1315 s.write(obj=value, append=append, complib=complib, **kwargs)
1316
1317 if s.is_table and index:
/usr/local/lib/python2.7/dist-packages/pandas/io/pytables.pyc in write(self, obj, **kwargs)
2875 # I have no idea why, but writing values before items fixed #2299
2876 blk_items = data.items.take(blk.mgr_locs)
-> 2877 self.write_array('block%d_values' % i, blk.values, items=blk_items)
2878 self.write_index('block%d_items' % i, blk_items)
2879
/usr/local/lib/python2.7/dist-packages/pandas/io/pytables.pyc in write_array(self, key, value, items)
2643 vlarr = self._handle.create_vlarray(self.group, key,
2644 _tables().ObjectAtom())
-> 2645 vlarr.append(value)
2646 else:
2647 if empty_array:
/usr/lib/python2.7/dist-packages/tables/vlarray.pyc in append(self, sequence)
534 nparr = None
535
--> 536 self._append(nparr, nobjects)
537 self.nrows += 1
538
hdf5extension.pyx in tables.hdf5extension.VLArray._append (tables/hdf5extension.c:18236)()
OverflowError: value too large to convert to int
(Side note: like the others I thought this originally had to do with too large a dataset because the error implies that. It's easy to ignore a (common) warning when you've got a seemingly very different error. When there's a python object that can't be translated into a ctype I think the code should probably be more informative when it fails... of course that doesn't seem to be what's happening in my particular case, since str should be translatable)
Thanks in advance for taking the time to look at this!
I have a large dataset which includes a categorical variable where every level of that variable is a string. Removing that categorical variable allows .to_hdf() to function as expected; when the variable is present it results in an error
and on a fixed store this is not supported, therefore this is a complex object. Try using format='table'.
Jeff, Thanks for the reply. But format='table' doesn't work, see below.
Also, when you say "on a fixed store this is not supported, therefore this is a complex object", I don't understand... _what_ isn't supported? Having a column with dtype object filled with str? Writing a dataframe where different columns have different dtypes?
To be clear: this isn't the only column of dtype object filled with str; it's just the only one that causes an error.
Thanks again!
allData.to_hdf(outfile+'raw.hdf','allData',format='table')
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-40-b9ae9ace3d83> in <module>()
----> 1 allData.to_hdf(outfile+'raw.hdf','allData',format='table')
/usr/local/lib/python2.7/dist-packages/pandas/core/generic.pyc in to_hdf(self, path_or_buf, key, **kwargs)
1136
1137 from pandas.io import pytables
-> 1138 return pytables.to_hdf(path_or_buf, key, self, **kwargs)
1139
1140 def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):
/usr/local/lib/python2.7/dist-packages/pandas/io/pytables.pyc in to_hdf(path_or_buf, key, value, mode, complevel, complib, append, **kwargs)
268 with HDFStore(path_or_buf, mode=mode, complevel=complevel,
269 complib=complib) as store:
--> 270 f(store)
271 else:
272 f(path_or_buf)
/usr/local/lib/python2.7/dist-packages/pandas/io/pytables.pyc in <lambda>(store)
262 f = lambda store: store.append(key, value, **kwargs)
263 else:
--> 264 f = lambda store: store.put(key, value, **kwargs)
265
266 path_or_buf = _stringify_path(path_or_buf)
/usr/local/lib/python2.7/dist-packages/pandas/io/pytables.pyc in put(self, key, value, format, append, **kwargs)
871 format = get_option("io.hdf.default_format") or 'fixed'
872 kwargs = self._validate_format(format, kwargs)
--> 873 self._write_to_group(key, value, append=append, **kwargs)
874
875 def remove(self, key, where=None, start=None, stop=None):
/usr/local/lib/python2.7/dist-packages/pandas/io/pytables.pyc in _write_to_group(self, key, value, format, index, append, complib, encoding, **kwargs)
1313
1314 # write the object
-> 1315 s.write(obj=value, append=append, complib=complib, **kwargs)
1316
1317 if s.is_table and index:
/usr/local/lib/python2.7/dist-packages/pandas/io/pytables.pyc in write(self, obj, axes, append, complib, complevel, fletcher32, min_itemsize, chunksize, expectedrows, dropna, **kwargs)
3850 self.create_axes(axes=axes, obj=obj, validate=append,
3851 min_itemsize=min_itemsize,
-> 3852 **kwargs)
3853
3854 for a in self.axes:
/usr/local/lib/python2.7/dist-packages/pandas/io/pytables.pyc in create_axes(self, axes, obj, validate, nan_rep, data_columns, min_itemsize, **kwargs)
3522 self.values_axes.append(col)
3523 except (NotImplementedError, ValueError, TypeError) as e:
-> 3524 raise e
3525 except Exception as detail:
3526 raise Exception(
TypeError: Cannot serialize the column [Date] because
its data contents are [date] object dtype
TypeError: Cannot serialize the column [Date] because
its data contents are [date] object dtype
datetime.date objects are generally unsupported in pandas itself. You cannot serialize these except by pickling, and they are slow. Simply use the fully supported datetime64[ns] dtype for those columns. You have to be circumspect about arbitrary objects that exist in an object column; these are really only meant to hold actual strings (in memory they can hold anything, but serialization is trickier).
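A minimal sketch of that conversion for the Date column in the example above (the other object columns may still need cleaning separately):

import pandas as pd

# Convert the column of datetime.date objects to the supported datetime64[ns] dtype
# before writing; errors='coerce' turns unparseable values into NaT.
allData['Date'] = pd.to_datetime(allData['Date'], errors='coerce')
allData.to_hdf('testme.hdf', 'allData', format='table')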