import pandas as pd
data = pd.read_excel('pandas_example.xlsx', sheet_name = 0)
data = data.astype({'A': 'int32', 'B': 'object'})
data.to_parquet('example.parquet')
to_parquet tries to convert an object column to int64. This happens when using either engine but is clearly seen when using data.to_parquet('example.parquet', engine='fastparquet'):
ValueError Traceback (most recent call last)
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\fastparquet\writer.py in write_column(f, data, selement, compression)
447 if selement.type == parquet_thrift.Type.INT64:
--> 448 data = data.astype(int)
449 elif selement.type == parquet_thrift.Type.BOOLEAN:
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
176 kwargs[new_arg_name] = new_arg_value
--> 177 return func(*args, **kwargs)
178 return wrapper
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors, **kwargs)
4996 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
-> 4997 **kwargs)
4998 return self._constructor(new_data).__finalize__(self)
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\core\internals.py in astype(self, dtype, **kwargs)
3713 def astype(self, dtype, **kwargs):
-> 3714 return self.apply('astype', dtype=dtype, **kwargs)
3715
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\core\internals.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
3580 kwargs['mgr'] = self
-> 3581 applied = getattr(b, f)(**kwargs)
3582 result_blocks = _extend_blocks(applied, result_blocks)
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\core\internals.py in astype(self, dtype, copy, errors, values, **kwargs)
574 return self._astype(dtype, copy=copy, errors=errors, values=values,
--> 575 **kwargs)
576
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\core\internals.py in _astype(self, dtype, copy, errors, values, klass, mgr, **kwargs)
663 # _astype_nansafe works fine with 1-d only
--> 664 values = astype_nansafe(values.ravel(), dtype, copy=True)
665 values = values.reshape(self.shape)
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\core\dtypes\cast.py in astype_nansafe(arr, dtype, copy)
708 if np.issubdtype(dtype.type, np.integer):
--> 709 return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)
710
pandas\_libs\lib.pyx in pandas._libs.lib.astype_intsafe()
pandas/_libs/src\util.pxd in util.set_value_at_unsafe()
ValueError: invalid literal for int() with base 10: 'Z31'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-17-6bc14a88da64> in <module>()
----> 1 data.to_parquet('example.parquet', engine='fastparquet')
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\core\frame.py in to_parquet(self, fname, engine, compression, **kwargs)
1940 from pandas.io.parquet import to_parquet
1941 to_parquet(self, fname, engine,
-> 1942 compression=compression, **kwargs)
1943
1944 @Substitution(header='Write out the column names. If a list of strings '
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\io\parquet.py in to_parquet(df, path, engine, compression, **kwargs)
255 """
256 impl = get_engine(engine)
--> 257 return impl.write(df, path, compression=compression, **kwargs)
258
259
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\io\parquet.py in write(self, df, path, compression, **kwargs)
216 with catch_warnings(record=True):
217 self.api.write(path, df,
--> 218 compression=compression, **kwargs)
219
220 def read(self, path, columns=None, **kwargs):
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\fastparquet\writer.py in write(filename, data, row_group_offsets, compression, file_scheme, open_with, mkdirs, has_nulls, write_index, partition_on, fixed_text, append, object_encoding, times)
846 if file_scheme == 'simple':
847 write_simple(filename, data, fmd, row_group_offsets,
--> 848 compression, open_with, has_nulls, append)
849 elif file_scheme in ['hive', 'drill']:
850 if append:
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\fastparquet\writer.py in write_simple(fn, data, fmd, row_group_offsets, compression, open_with, has_nulls, append)
715 else None)
716 rg = make_row_group(f, data[start:end], fmd.schema,
--> 717 compression=compression)
718 if rg is not None:
719 fmd.row_groups.append(rg)
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\fastparquet\writer.py in make_row_group(f, data, schema, compression)
612 comp = compression
613 chunk = write_column(f, data[column.name], column,
--> 614 compression=comp)
615 rg.columns.append(chunk)
616 rg.total_byte_size = sum([c.meta_data.total_uncompressed_size for c in
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\fastparquet\writer.py in write_column(f, data, selement, compression)
453 raise ValueError('Error converting column "%s" to primitive '
454 'type %s. Original error: '
--> 455 '%s' % (data.name, t, e))
456
457 else:
ValueError: Error converting column "B" to primitive type INT64. Original error: invalid literal for int() with base 10: 'Z31'
You can see that it is a mixed type column issue if you use to_csv and read_csv to load the data from a csv file instead - you get the following warning on import:
C:\Users\I347500\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\IPython\core\interactiveshell.py:2785: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
Specifying the dtype option solves the issue but it isn't convenient that there is no way to set column types after loading the data. It is also strange that to_parquet tries to infer column types instead of using the dtypes stated in .dtypes or .info().
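For context, this is what the frame from the example above reports after the astype call, so the intended types are available before writing:
print(data.dtypes)
# A     int32
# B    object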
to_parquet should write the parquet file using the dtypes as specified.
pd.show_versions()
commit: None
python: 3.6.5.final.0
python-bits: 64
OS: Windows
OS-release: 10
machine: AMD64
processor: Intel64 Family 6 Model 61 Stepping 4, GenuineIntel
byteorder: little
LC_ALL: None
LANG: None
LOCALE: None.None
pandas: 0.23.0
pytest: 3.5.1
pip: 10.0.1
setuptools: 39.1.0
Cython: None
numpy: 1.14.3
scipy: 1.1.0
pyarrow: 0.9.0
xarray: None
IPython: 6.4.0
sphinx: None
patsy: 0.5.0
dateutil: 2.7.3
pytz: 2018.4
blosc: None
bottleneck: None
tables: None
numexpr: None
feather: None
matplotlib: 2.2.2
openpyxl: None
xlrd: 1.1.0
xlwt: 1.3.0
xlsxwriter: 1.0.4
lxml: None
bs4: None
html5lib: 0.9999999
sqlalchemy: None
pymysql: None
psycopg2: None
jinja2: 2.10
s3fs: None
fastparquet: 0.1.5
pandas_gbq: None
pandas_datareader: None
Using the latest pyarrow master, this may already have been fixed. You could try to check if the problem still persists once you install pyarrow from the twosigma channel (conda install -c twosigma pyarrow).
Specifying dtype option solves the issue but it isn't convenient that there is no way to set column types after loading the data. It is also strange that to_parquet tries to infer column types instead of using dtypes as stated in .dtypes or .info()
The dtypes that are returned by Pandas are not as detailed as those supported and used by Parquet. For example, Pandas has the very generic type object. This cannot be saved to Parquet, as Parquet is language-agnostic, thus Python objects are not a valid type. Therefore, for object columns one must look at the actual data and infer a more specific type.
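As a small illustration (made-up frame), the dtype alone does not say what the objects actually are, so the writer has to inspect the values to pick a concrete Parquet type:
import pandas as pd

df = pd.DataFrame({'B': [1, 2, 'Z31']})
print(df['B'].dtype)               # object
print(df['B'].map(type).unique())  # [<class 'int'> <class 'str'>]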
Using the latest pyarrow master, this may already have been fixed. You could try to check if the problem still persists once you install pyarrow from the twosigma channel (conda install -c twosigma pyarrow).
It is available for Linux only. I'll try to experiment on a Linux server but it may take some time.
The dtypes that are returned by Pandas are not as detailed as those supported and used by Parquet. For example, Pandas has the very generic type object. This cannot be saved to Parquet, as Parquet is language-agnostic, thus Python objects are not a valid type. Therefore, for object columns one must look at the actual data and infer a more specific type.
Why does the following code work then?
import pandas as pd
data = pd.read_csv('pandas_example.csv', dtype = {'A': 'int32', 'B': 'object'})
data.to_parquet('example.parquet')
@xhochy
OK, finally got to experiment on a Linux server. With pyarrow-0.9.0+254,
import pandas as pd
data = pd.read_excel('pandas_example.xlsx', sheet_name = 0)
data.to_parquet('example.parquet')
still gives ArrowTypeError: an integer is required (got type str)
@Ingvar-Y Finally I had some time to look at the data. The problem here is that you have partly strings, partly integer values. What would be the expected type when writing this column? Note that Arrow and Pandas can only have columns of a single type.
@xhochy It is a string type column that unfortunately has a lot of integer-like values but the expected type is definitely string.
IMHO, there should be an option to write a column with a string type even if all the values inside are integers - for example, to maintain consistency of column types among multiple files. This is not the case for my example - column B can't have integer type.
re ' you have partly strings, partly integer values. What would be the expected type when writing this column?'
I would expect it to be a string.
We could have some mechanism to indicate "this column should have a string type in the final parquet file", like we have a dtype argument for to_sql (you can actually already do something like this manually by passing the schema argument).
However, the problem is that the arrow functions that convert numpy arrays to arrow arrays still give errors for mixed string / integer types, even if you indicate that it should be strings, eg:
In [7]: pyarrow.array(np.array(['a', 1, 'b'], dtype=object), type=pyarrow.string())
...
ArrowInvalid: Error converting from Python objects to String/UTF8: Got Python object of type int but can only handle these types: str, bytes
So unless that is something arrow would want to change (but personally I would not do that), this would not help for the specific example case in this issue.
We could of course still do a conversion on the pandas side, but that would need to be rather custom logic (and a user can do df.astype({'col': str}).to_parquet(..) themselves before writing to parquet).
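A minimal sketch of that user-side workaround, with a made-up frame where column 'B' holds both ints and strings and should be written as strings:
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [10, 'Z31', 20]})

# cast the mixed column to str on the pandas side, so the parquet engine
# only ever sees a plain string column
df.astype({'B': str}).to_parquet('example_fixed.parquet')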
So I think we can close this issue.
In my case, I had read in multiple csv's and done pandas.concat(). Some read in as float and others as string. pandas.concat() stuck them together without any warnings, and the problem became apparent when to_parquet() complained. So in that case at least, it may be more an issue with concat() than with to_parquet().
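A small sketch of that situation with made-up frames, showing how concat silently produces a mixed object column:
import pandas as pd

a = pd.DataFrame({'x': [1.0, 2.0]})        # parsed as float64
b = pd.DataFrame({'x': ['001', '002']})    # parsed as string (object)

merged = pd.concat([a, b], ignore_index=True)
print(merged['x'].dtype)  # object - floats and strings mixed, no warning raised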
IMHO we should close this since it's giving people the wrong impression that parquet "can't handle mixed type columns", e.g. "hey, they have an open issue with this title" (without a clear resolution at the end of the thread).
As @jorisvandenbossche mentioned, the OP's problem is type inference when doing pd.read_excel(). It has nothing to do with to_parquet, and as he pointed out, the user can always do df.astype({'col': str}).to_parquet(..) to manage and mix types as needed.
agree here - closing as a usage issue
I know this is a closed issue, but in case someone looks for a patch, here is what worked for me:
for c in df.columns:
    # did not work:
    # coltype = df[c].dtype
    # did not work either:
    # coltype = 'O'
    # this one works: get the type of the first valid value of the column
    try:
        coltype = type(df[c].dropna().iloc[0])
    except IndexError:
        # column is composed of only invalid values
        coltype = float
    df[c] = df[c].astype(coltype)
I needed this as I was dealing with a large dataframe (coming from openfoodfacts: https://world.openfoodfacts.org/data ), containing 1M lines and 177 columns of various types, and I simply could not manually cast each column.
@titsitits you might want to have a look at DataFrame.infer_objects to see if this helps converting object dtypes to proper dtypes (although it will not do any forced conversions, e.g. no string numbers to an actual numeric dtype).
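For illustration, a minimal sketch of what infer_objects does with a made-up frame; genuinely mixed columns like the one in this issue stay object:
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3]}, dtype=object)
print(df.dtypes)                  # A    object
print(df.infer_objects().dtypes)  # A    int64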
I realize that this has been closed for a while now, but as I'm revisiting this error, I wanted to share a possible hack around it (not that it's an ideal approach):
as @catawbasam mentioned:
re ' you have partly strings, partly integer values. What would be the expected type when writing this column?'
I would expect it to be a string.
I cast all my categorical columns into 'str' before writing as parquet (instead of specifying each column by name which can get cumbersome for 500 columns).
When I load it back into pandas, the type of the str column would be object again.
Edit: If you happen to hit an error with NA's being hardcoded into 'None' after you convert your object columns into str, make sure to convert these NA's into np.nan before converting into str (stackoverflow link)
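A hedged sketch of that cleanup with a made-up frame; the where/notna combination is one way to keep missing values as real NaN instead of the literal string 'None':
import numpy as np
import pandas as pd

df = pd.DataFrame({'B': ['Z31', 10, None]})

# cast object columns to str, then put NaN back where values were missing
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].astype(str).where(df[col].notna(), np.nan)

print(df['B'].tolist())  # ['Z31', '10', nan]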
I solved this by:
for col in df.columns:
    # flag rows whose value type differs from the type of the first row
    weird = (df[[col]].applymap(type) != df[[col]].iloc[0].apply(type)).any(axis=1)
    if len(df[weird]) > 0:
        print(col)
        df[col] = df[col].astype(str)
    # list-valued columns also need to be stringified
    if df[col].dtype == list:
        df[col] = df[col].astype(str)
First, find the mixed type columns and convert them to string. Then find the list type columns and convert them to string as well; if not, you may get pyarrow.lib.ArrowInvalid: Nested column branch had multiple children.
References:
https://stackoverflow.com/questions/29376026/whats-a-good-strategy-to-find-mixed-types-in-pandas-columns
https://stackoverflow.com/questions/50876505/does-any-python-library-support-writing-arrays-of-structs-to-parquet-files