Operating System: macOS 10.13.6
CPU/GPU model:
C++/Python/R version: Python 3.7.3
LightGBM version or commit hash: 2.2.3
* ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in fields ORIGINAL_LANGUAGE_Arabic, ORIGINAL_LANGUAGE_English, ...
# Minimal reproduction: fitting LGBMClassifier on a pandas SparseDataFrame
# raises the ValueError quoted above.
# NOTE(review): this snippet assumes `import numpy as np` and
# `import pandas as pd` were executed beforehand — they are not shown here.
import lightgbm as lgb
X = pd.DataFrame(np.random.randint(0,100,size=(100, 4)), columns=list('ABCD'))  # 100x4 random ints
y = pd.Series(np.random.randint(0,100,size=(100)))  # random integer labels
sX = X.to_sparse()  # convert to SparseDataFrame — the trigger for the error
model_lgbm = lgb.LGBMClassifier()
model_lgbm.fit(sX, y)  # raises "DataFrame.dtypes for data must be int, float or bool."
Sparse[int64, nan], which is not included in PANDAS_DTYPE_MAPPER.

@drkarthi It seems your code runs successfully on my machine:

Hmm, it seems that this is a pandas version-specific issue:
# Expanded reproduction: print library versions and inspect the sparse column
# types before triggering the failure in LGBMClassifier.fit().
import numpy as np
import pandas as pd
import lightgbm as lgb
print(np.__version__)   # 1.16.2 in the output below
print(pd.__version__)   # 0.24.2
print(lgb.__version__)  # 2.2.3
X = pd.DataFrame(np.random.randint(0,100,size=(100, 4)), columns=list('ABCD'))  # 100x4 random ints
y = pd.Series(np.random.randint(0,100,size=(100)))  # random integer labels
sX = X.to_sparse()  # NOTE(review): to_sparse() was later deprecated/removed in pandas — confirm
print(type(sX))              # SparseDataFrame
print(type(sX['A']))         # SparseSeries
print(type(sX['A'].values))  # SparseArray — the 0.24 extension array behind the issue
print(sX['A'].values)        # shows values plus Fill/BlockIndex metadata
model_lgbm = lgb.LGBMClassifier()
model_lgbm.fit(sX, y)  # raises ValueError with pandas 0.24.2 (traceback below)
1.16.2
0.24.2
2.2.3
<class 'pandas.core.sparse.frame.SparseDataFrame'>
<class 'pandas.core.sparse.series.SparseSeries'>
<class 'pandas.core.arrays.sparse.SparseArray'>
[66, 54, 35, 27, 57, 48, 72, 29, 37, 19, 79, 17, 45, 43, 27, 48, 62, 10, 43, 40, 23, 31, 95, 39, 14, 57, 74, 65, 92, 23, 90, 59, 42, 11, 37, 96, 90, 13, 15, 12, 31, 39, 38, 10, 75, 27, 96, 58, 52, 34, 12, 24, 44, 67, 9, 7, 39, 21, 75, 52, 94, 77, 52, 68, 99, 45, 97, 33, 75, 28, 81, 66, 1, 32, 39, 17, 35, 39, 85, 42, 32, 48, 75, 21, 71, 85, 15, 46, 98, 45, 64, 26, 36, 67, 67, 99, 50, 64, 94, 22]
Fill: nan
BlockIndex
Block locations: array([0])
Block lengths: array([100])
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-1-fd257865e91a> in <module>
18
19 model_lgbm = lgb.LGBMClassifier()
---> 20 model_lgbm.fit(sX, y)
C:\Miniconda3\lib\site-packages\lightgbm\sklearn.py in fit(self, X, y, sample_weight, init_score, eval_set, eval_names, eval_sample_weight, eval_class_weight, eval_init_score, eval_metric, early_stopping_rounds, verbose, feature_name, categorical_feature, callbacks)
742 verbose=verbose, feature_name=feature_name,
743 categorical_feature=categorical_feature,
--> 744 callbacks=callbacks)
745 return self
746
C:\Miniconda3\lib\site-packages\lightgbm\sklearn.py in fit(self, X, y, sample_weight, init_score, group, eval_set, eval_names, eval_sample_weight, eval_class_weight, eval_init_score, eval_group, eval_metric, early_stopping_rounds, verbose, feature_name, categorical_feature, callbacks)
542 verbose_eval=verbose, feature_name=feature_name,
543 categorical_feature=categorical_feature,
--> 544 callbacks=callbacks)
545
546 if evals_result:
C:\Miniconda3\lib\site-packages\lightgbm\engine.py in train(params, train_set, num_boost_round, valid_sets, valid_names, fobj, feval, init_model, feature_name, categorical_feature, early_stopping_rounds, evals_result, verbose_eval, learning_rates, keep_training_booster, callbacks)
195 # construct booster
196 try:
--> 197 booster = Booster(params=params, train_set=train_set)
198 if is_valid_contain_train:
199 booster.set_train_data_name(train_data_name)
C:\Miniconda3\lib\site-packages\lightgbm\basic.py in __init__(self, params, train_set, model_file, silent)
1550 self.handle = ctypes.c_void_p()
1551 _safe_call(_LIB.LGBM_BoosterCreate(
-> 1552 train_set.construct().handle,
1553 c_str(params_str),
1554 ctypes.byref(self.handle)))
C:\Miniconda3\lib\site-packages\lightgbm\basic.py in construct(self)
999 init_score=self.init_score, predictor=self._predictor,
1000 silent=self.silent, feature_name=self.feature_name,
-> 1001 categorical_feature=self.categorical_feature, params=self.params)
1002 if self.free_raw_data:
1003 self.data = None
C:\Miniconda3\lib\site-packages\lightgbm\basic.py in _lazy_init(self, data, label, reference, weight, group, init_score, predictor, silent, feature_name, categorical_feature, params)
727 feature_name,
728 categorical_feature,
--> 729 self.pandas_categorical)
730 label = _label_from_pandas(label)
731 self.data_has_header = False
C:\Miniconda3\lib\site-packages\lightgbm\basic.py in _data_from_pandas(data, feature_name, categorical_feature, pandas_categorical)
275 msg = ("DataFrame.dtypes for data must be int, float or bool.\n"
276 "Did not expect the data types in fields ")
--> 277 raise ValueError(msg + ', '.join(bad_fields))
278 data = data.values.astype('float')
279 else:
ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in fields A, B, C, D
Looks like this was changed in pandas v0.24.0: https://github.com/pandas-dev/pandas/blob/v0.24.2/pandas/core/arrays/sparse.py#L505-L1770
The discussion around the breaking change in the implementation of SparseArray: https://github.com/pandas-dev/pandas/issues/21978#issuecomment-410471525
@drkarthi Thank you very much for useful info!
I see you have dug into the SparseArray implementation already. Would you mind creating a PR?
Sure!
@StrikerRUS could we close this issue?
@guolinke I think we can update the pandas dtype mapper:
Possible solutions:
1 - Look at the dtype.name or dtype.subtype depending on whether the name starts with "Sparse"
2 - Instead of enumerating the acceptable datatypes, check each column with pd.api.types.is_numeric_dtype() and .is_bool_dtype()
Most helpful comment
Sure!