import pandas as pd
data = pd.read_excel('pandas_example.xlsx', sheet_name = 0)
data = data.astype({'A': 'int32', 'B': 'object'})
data.to_parquet('example.parquet')
to_parquet tries to convert an object column to int64. This happens when using either engine but is clearly seen when using data.to_parquet('example.parquet', engine='fastparquet'):
ValueError Traceback (most recent call last)
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\fastparquet\writer.py in write_column(f, data, selement, compression)
447 if selement.type == parquet_thrift.Type.INT64:
--> 448 data = data.astype(int)
449 elif selement.type == parquet_thrift.Type.BOOLEAN:
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
176 kwargs[new_arg_name] = new_arg_value
--> 177 return func(*args, **kwargs)
178 return wrapper
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors, **kwargs)
4996 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
-> 4997 **kwargs)
4998 return self._constructor(new_data).__finalize__(self)
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\core\internals.py in astype(self, dtype, **kwargs)
3713 def astype(self, dtype, **kwargs):
-> 3714 return self.apply('astype', dtype=dtype, **kwargs)
3715
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\core\internals.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
3580 kwargs['mgr'] = self
-> 3581 applied = getattr(b, f)(**kwargs)
3582 result_blocks = _extend_blocks(applied, result_blocks)
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\core\internals.py in astype(self, dtype, copy, errors, values, **kwargs)
574 return self._astype(dtype, copy=copy, errors=errors, values=values,
--> 575 **kwargs)
576
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\core\internals.py in _astype(self, dtype, copy, errors, values, klass, mgr, **kwargs)
663 # _astype_nansafe works fine with 1-d only
--> 664 values = astype_nansafe(values.ravel(), dtype, copy=True)
665 values = values.reshape(self.shape)
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\core\dtypes\cast.py in astype_nansafe(arr, dtype, copy)
708 if np.issubdtype(dtype.type, np.integer):
--> 709 return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)
710
pandas\_libs\lib.pyx in pandas._libs.lib.astype_intsafe()
pandas/_libs/src\util.pxd in util.set_value_at_unsafe()
ValueError: invalid literal for int() with base 10: 'Z31'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-17-6bc14a88da64> in <module>()
----> 1 data.to_parquet('example.parquet', engine='fastparquet')
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\core\frame.py in to_parquet(self, fname, engine, compression, **kwargs)
1940 from pandas.io.parquet import to_parquet
1941 to_parquet(self, fname, engine,
-> 1942 compression=compression, **kwargs)
1943
1944 @Substitution(header='Write out the column names. If a list of strings '
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\io\parquet.py in to_parquet(df, path, engine, compression, **kwargs)
255 """
256 impl = get_engine(engine)
--> 257 return impl.write(df, path, compression=compression, **kwargs)
258
259
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\io\parquet.py in write(self, df, path, compression, **kwargs)
216 with catch_warnings(record=True):
217 self.api.write(path, df,
--> 218 compression=compression, **kwargs)
219
220 def read(self, path, columns=None, **kwargs):
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\fastparquet\writer.py in write(filename, data, row_group_offsets, compression, file_scheme, open_with, mkdirs, has_nulls, write_index, partition_on, fixed_text, append, object_encoding, times)
846 if file_scheme == 'simple':
847 write_simple(filename, data, fmd, row_group_offsets,
--> 848 compression, open_with, has_nulls, append)
849 elif file_scheme in ['hive', 'drill']:
850 if append:
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\fastparquet\writer.py in write_simple(fn, data, fmd, row_group_offsets, compression, open_with, has_nulls, append)
715 else None)
716 rg = make_row_group(f, data[start:end], fmd.schema,
--> 717 compression=compression)
718 if rg is not None:
719 fmd.row_groups.append(rg)
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\fastparquet\writer.py in make_row_group(f, data, schema, compression)
612 comp = compression
613 chunk = write_column(f, data[column.name], column,
--> 614 compression=comp)
615 rg.columns.append(chunk)
616 rg.total_byte_size = sum([c.meta_data.total_uncompressed_size for c in
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\fastparquet\writer.py in write_column(f, data, selement, compression)
453 raise ValueError('Error converting column "%s" to primitive '
454 'type %s. Original error: '
--> 455 '%s' % (data.name, t, e))
456
457 else:
ValueError: Error converting column "B" to primitive type INT64. Original error: invalid literal for int() with base 10: 'Z31'
You can see that it is a mixed type column issue if you use to_csv and read_csv to load the data from a csv file instead - you get the following warning on import:
C:\Users\I347500\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\IPython\core\interactiveshell.py:2785: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
Specifying the dtype option solves the issue but it isn't convenient that there is no way to set column types after loading the data. It is also strange that to_parquet tries to infer column types instead of using the dtypes stated in .dtypes or .info().
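For context, this is what the frame from the example above reports after the astype call, so the intended types are available before writing:
print(data.dtypes)
# A     int32
# B    object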
to_parquet should write the parquet file using the dtypes as specified.
pd.show_versions()
commit: None
python: 3.6.5.final.0
python-bits: 64
OS: Windows
OS-release: 10
machine: AMD64
processor: Intel64 Family 6 Model 61 Stepping 4, GenuineIntel
byteorder: little
LC_ALL: None
LANG: None
LOCALE: None.None
pandas: 0.23.0
pytest: 3.5.1
pip: 10.0.1
setuptools: 39.1.0
Cython: None
numpy: 1.14.3
scipy: 1.1.0
pyarrow: 0.9.0
xarray: None
IPython: 6.4.0
sphinx: None
patsy: 0.5.0
dateutil: 2.7.3
pytz: 2018.4
blosc: None
bottleneck: None
tables: None
numexpr: None
feather: None
matplotlib: 2.2.2
openpyxl: None
xlrd: 1.1.0
xlwt: 1.3.0
xlsxwriter: 1.0.4
lxml: None
bs4: None
html5lib: 0.9999999
sqlalchemy: None
pymysql: None
psycopg2: None
jinja2: 2.10
s3fs: None
fastparquet: 0.1.5
pandas_gbq: None
pandas_datareader: None
Using the latest pyarrow master, this may already have been fixed. You could try to check if the problem still persists once you install pyarrow from the twosigma channel (conda install -c twosigma pyarrow).
Specifying dtype option solves the issue but it isn't convenient that there is no way to set column types after loading the data. It is also strange that to_parquet tries to infer column types instead of using dtypes as stated in .dtypes or .info()
The dtypes that are returned by Pandas are not as detailed as those supported and used by Parquet. For example, Pandas has the very generic type object. This cannot be saved to Parquet, as Parquet is language-agnostic, thus Python objects are not a valid type. Therefore, for object columns one must look at the actual data and infer a more specific type.
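As a small illustration (made-up frame), the dtype alone does not say what the objects actually are, so the writer has to inspect the values to pick a concrete Parquet type:
import pandas as pd

df = pd.DataFrame({'B': [1, 2, 'Z31']})
print(df['B'].dtype)               # object
print(df['B'].map(type).unique())  # [<class 'int'> <class 'str'>]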
Using the latest pyarrow master, this may already have been fixed. You could try to check if the problem still persists once you install pyarrow from the twosigma channel (conda install -c twosigma pyarrow).
It is available for Linux only. I'll try to experiment on a Linux server but it may take some time.
The dtypes that are returned by Pandas are not as detailed as those supported and used by Parquet. For example, Pandas has the very generic type object. This cannot be saved to Parquet, as Parquet is language-agnostic, thus Python objects are not a valid type. Therefore, for object columns one must look at the actual data and infer a more specific type.
Why does the following code work then?
import pandas as pd
data = pd.read_csv('pandas_example.csv', dtype = {'A': 'int32', 'B': 'object'})
data.to_parquet('example.parquet')
@xhochy
OK, finally got to experiment on a Linux server. With pyarrow-0.9.0+254,
import pandas as pd
data = pd.read_excel('pandas_example.xlsx', sheet_name = 0)
data.to_parquet('example.parquet')
still gives ArrowTypeError: an integer is required (got type str)
@Ingvar-Y Finally I had some time to look at the data. The problem here is that you have partly strings, partly integer values. What would be the expected type when writing this column? Note that Arrow and Pandas can only have columns of a single type.
@xhochy It is a string type column that unfortunately has a lot of integer-like values but the expected type is definitely string.
IMHO, there should be an option to write a column with a string type even if all the values inside are integers - for example, to maintain consistency of column types among multiple files. This is not the case for my example - column B can't have integer type.
re ' you have partly strings, partly integer values. What would be the expected type when writing this column?'
I would expect it to be a string.
We could have some mechanism to indicate "this column should have a string type in the final parquet file", like we have a dtype argument for to_sql (you can actually already do something like this manually by passing the schema argument).
However, the problem is that the arrow functions that convert numpy arrays to arrow arrays still give errors for mixed string / integer types, even if you indicate that it should be strings, eg:
In [7]: pyarrow.array(np.array(['a', 1, 'b'], dtype=object), type=pyarrow.string())
...
ArrowInvalid: Error converting from Python objects to String/UTF8: Got Python object of type int but can only handle these types: str, bytes
So unless that is something arrow would want to change (but personally I would not do that), this would not help for the specific example case in this issue.
We could of course still do a conversion on the pandas side, but that would need to be rather custom logic (and a user can do df.astype({'col': str}).to_parquet(..) themselves before writing to parquet).
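A minimal sketch of that user-side workaround, with a made-up frame where column 'B' holds both ints and strings and should be written as strings:
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [10, 'Z31', 20]})

# cast the mixed column to str on the pandas side, so the parquet engine
# only ever sees a plain string column
df.astype({'B': str}).to_parquet('example_fixed.parquet')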
So I think we can close this issue.
In my case, I had read in multiple csv's and done pandas.concat(). Some read in as float and others as string. pandas.concat() stuck them together without any warnings, and the problem became apparent when to_parquet() complained. So in that case at least, it may be more an issue with concat() than with to_parquet().
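A small sketch of that situation with made-up frames, showing how concat silently produces a mixed object column:
import pandas as pd

a = pd.DataFrame({'x': [1.0, 2.0]})        # parsed as float64
b = pd.DataFrame({'x': ['001', '002']})    # parsed as string (object)

merged = pd.concat([a, b], ignore_index=True)
print(merged['x'].dtype)  # object - floats and strings mixed, no warning raised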
IMHO we should close this since it's giving people the wrong impression that parquet "can't handle mixed type columns", e.g. "hey, they have an open issue with this title" (without a clear resolution at the end of the thread).
As @jorisvandenbossche mentioned, the OP's problem is type inference when doing pd.read_excel(). It has nothing to do with to_parquet, and as he pointed out, the user can always do df.astype({'col': str}).to_parquet(..) to manage and mix types as needed.
agree here - closing as a usage issue
I know this is a closed issue, but in case someone looks for a patch, here is what worked for me:
for c in df.columns:
    # did not work:
    # coltype = df[c].dtype
    # did not work either:
    # coltype = 'O'
    # this one works: get the type of the first valid value of the column
    try:
        coltype = type(df[c].dropna().iloc[0])
    except IndexError:
        # column is composed of only invalid values
        coltype = float
    df[c] = df[c].astype(coltype)
I needed this as I was dealing with a large dataframe (coming from openfoodfacts: https://world.openfoodfacts.org/data ), containing 1M lines and 177 columns of various types, and I simply could not manually cast each column.
@titsitits you might want to have a look at DataFrame.infer_objects to see if this helps converting object dtypes to proper dtypes (although it will not do any forced conversions, e.g. no string numbers to an actual numeric dtype).
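For illustration, a minimal sketch of what infer_objects does with a made-up frame; genuinely mixed columns like the one in this issue stay object:
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3]}, dtype=object)
print(df.dtypes)                  # A    object
print(df.infer_objects().dtypes)  # A    int64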
I realize that this has been closed for a while now, but as I'm revisiting this error, I wanted to share a possible hack around it (not that it's an ideal approach):
as @catawbasam mentioned:
re ' you have partly strings, partly integer values. What would be the expected type when writing this column?'
I would expect it to be a string.
I cast all my categorical columns into 'str' before writing as parquet (instead of specifying each column by name which can get cumbersome for 500 columns).
When I load it back into pandas, the type of the str column would be object again.
Edit: If you happen to hit an error with NA's being hardcoded into 'None' after you convert your object columns into str, make sure to convert these NA's into np.nan before converting into str (stackoverflow link)
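A hedged sketch of that cleanup with a made-up frame; the where/notna combination is one way to keep missing values as real NaN instead of the literal string 'None':
import numpy as np
import pandas as pd

df = pd.DataFrame({'B': ['Z31', 10, None]})

# cast object columns to str, then put NaN back where values were missing
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].astype(str).where(df[col].notna(), np.nan)

print(df['B'].tolist())  # ['Z31', '10', nan]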
I solved this by:
for col in df.columns:
    # flag rows whose value type differs from the type of the first row
    weird = (df[[col]].applymap(type) != df[[col]].iloc[0].apply(type)).any(axis=1)
    if len(df[weird]) > 0:
        print(col)
        df[col] = df[col].astype(str)
    # list-valued columns also need to be stringified
    if df[col].dtype == list:
        df[col] = df[col].astype(str)
First, find the mixed type columns and convert them to string. Then find the list type columns and convert them to string as well; if not, you may get pyarrow.lib.ArrowInvalid: Nested column branch had multiple children.
References:
https://stackoverflow.com/questions/29376026/whats-a-good-strategy-to-find-mixed-types-in-pandas-columns
https://stackoverflow.com/questions/50876505/does-any-python-library-support-writing-arrays-of-structs-to-parquet-files