Cannot serialise dataframe with FletcherArray columns
suvayu opened this issue · comments
It seems serialisation is not yet supported by FletcherArray
. Here's my DataFrame
:
>>> df_amd.dtypes
time datetime64[ns]
evt_name fletcher[string]
evt_value float64
evt_unit fletcher[string]
bus uint64
route fletcher[string]
stop_code fletcher[int64]
stop fletcher[string]
lat float64
lon float64
dtype: object
And here are the backtraces when I try to serialise to various formats (.to_csv(..)
works):
HDF5
>>> df_amd.to_hdf('data/road_safety.h5', 'AMD')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/home/jallad/.local/lib/python3.6/site-packages/pandas/core/generic.py", line 1996, in to_hdf
return pytables.to_hdf(path_or_buf, key, self, **kwargs)
File "/home/jallad/.local/lib/python3.6/site-packages/pandas/io/pytables.py", line 279, in to_hdf
f(store)
File "/home/jallad/.local/lib/python3.6/site-packages/pandas/io/pytables.py", line 273, in <lambda>
f = lambda store: store.put(key, value, **kwargs)
File "/home/jallad/.local/lib/python3.6/site-packages/pandas/io/pytables.py", line 890, in put
self._write_to_group(key, value, append=append, **kwargs)
File "/home/jallad/.local/lib/python3.6/site-packages/pandas/io/pytables.py", line 1367, in _write_to_group
s.write(obj=value, append=append, complib=complib, **kwargs)
File "/home/jallad/.local/lib/python3.6/site-packages/pandas/io/pytables.py", line 2963, in write
self.write_array('block%d_values' % i, blk.values, items=blk_items)
File "/home/jallad/.local/lib/python3.6/site-packages/pandas/io/pytables.py", line 2686, in write_array
value = value.T
AttributeError: 'FletcherArray' object has no attribute 'T'
Parquet
>>> df_amd.to_parquet('data/ahmedabad_event_report.parquet')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/home/jallad/.local/lib/python3.6/site-packages/pandas/core/frame.py", line 1945, in to_parquet
compression=compression, **kwargs)
File "/home/jallad/.local/lib/python3.6/site-packages/pandas/io/parquet.py", line 257, in to_parquet
return impl.write(df, path, compression=compression, **kwargs)
File "/home/jallad/.local/lib/python3.6/site-packages/pandas/io/parquet.py", line 118, in write
table = self.api.Table.from_pandas(df)
File "pyarrow/table.pxi", line 1136, in pyarrow.lib.Table.from_pandas
File "/home/jallad/.local/lib/python3.6/site-packages/pyarrow/pandas_compat.py", line 386, in dataframe_to_arrays
convert_types))
File "/usr/lib64/python3.6/concurrent/futures/_base.py", line 586, in result_iterator
yield fs.pop().result()
File "/usr/lib64/python3.6/concurrent/futures/_base.py", line 425, in result
return self.__get_result()
File "/usr/lib64/python3.6/concurrent/futures/_base.py", line 384, in __get_result
raise self._exception
File "/usr/lib64/python3.6/concurrent/futures/thread.py", line 56, in run
result = self.fn(*self.args, **self.kwargs)
File "/home/jallad/.local/lib/python3.6/site-packages/pyarrow/pandas_compat.py", line 375, in convert_column
raise e
File "/home/jallad/.local/lib/python3.6/site-packages/pyarrow/pandas_compat.py", line 369, in convert_column
return pa.array(col, from_pandas=True, type=ty)
File "pyarrow/array.pxi", line 182, in pyarrow.lib.array
File "pyarrow/array.pxi", line 76, in pyarrow.lib._ndarray_to_array
File "pyarrow/error.pxi", line 91, in pyarrow.lib.check_status
pyarrow.lib.ArrowTypeError: ('Did not pass numpy.dtype object', 'Conversion failed for column evt_name with type fletcher[string]')
Feather
>>> df = df_amd.reset_index()
>>> df.to_feather('data/ahmedabad_event_report.feather')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/home/jallad/.local/lib/python3.6/site-packages/pandas/core/frame.py", line 1892, in to_feather
to_feather(self, fname)
File "/home/jallad/.local/lib/python3.6/site-packages/pandas/io/feather_format.py", line 83, in to_feather
feather.write_dataframe(df, path)
File "/home/jallad/.local/lib/python3.6/site-packages/pyarrow/feather.py", line 181, in write_feather
writer.write(df)
File "/home/jallad/.local/lib/python3.6/site-packages/pyarrow/feather.py", line 93, in write
batch = RecordBatch.from_pandas(df, preserve_index=False)
File "pyarrow/table.pxi", line 901, in pyarrow.lib.RecordBatch.from_pandas
File "/home/jallad/.local/lib/python3.6/site-packages/pyarrow/pandas_compat.py", line 386, in dataframe_to_arrays
convert_types))
File "/usr/lib64/python3.6/concurrent/futures/_base.py", line 586, in result_iterator
yield fs.pop().result()
File "/usr/lib64/python3.6/concurrent/futures/_base.py", line 425, in result
return self.__get_result()
File "/usr/lib64/python3.6/concurrent/futures/_base.py", line 384, in __get_result
raise self._exception
File "/usr/lib64/python3.6/concurrent/futures/thread.py", line 56, in run
result = self.fn(*self.args, **self.kwargs)
File "/home/jallad/.local/lib/python3.6/site-packages/pyarrow/pandas_compat.py", line 375, in convert_column
raise e
File "/home/jallad/.local/lib/python3.6/site-packages/pyarrow/pandas_compat.py", line 369, in convert_column
return pa.array(col, from_pandas=True, type=ty)
File "pyarrow/array.pxi", line 182, in pyarrow.lib.array
File "pyarrow/array.pxi", line 76, in pyarrow.lib._ndarray_to_array
File "pyarrow/error.pxi", line 91, in pyarrow.lib.check_status
pyarrow.lib.ArrowTypeError: ('Did not pass numpy.dtype object', 'Conversion failed for column evt_name with type fletcher[string]')
Not sure if this is related, but for some DataFrames
.memory_usage()
(consequently also .info()
) triggers the following backtrace:
>>> df_amd.memory_usage()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/home/jallad/.local/lib/python3.6/site-packages/pandas/core/frame.py", line 2365, in memory_usage
for col, c in self.iteritems()], index=self.columns)
File "/home/jallad/.local/lib/python3.6/site-packages/pandas/core/frame.py", line 2365, in <listcomp>
for col, c in self.iteritems()], index=self.columns)
File "/home/jallad/.local/lib/python3.6/site-packages/pandas/core/series.py", line 3503, in memory_usage
v = super(Series, self).memory_usage(deep=deep)
File "/home/jallad/.local/lib/python3.6/site-packages/pandas/core/base.py", line 1143, in memory_usage
v = self.values.nbytes
File "/home/jallad/.local/lib/python3.6/site-packages/fletcher/base.py", line 410, in nbytes
size += buf.size
AttributeError: 'NoneType' object has no attribute 'size'
FWIW, serialization of extension arrays isn't really supported in general yet pandas-dev/pandas#20612
Thanks @TomAugspurger, I wasn't aware. I will follow the upstream issue.
With the latest pyarrow
releases this should now be supported for extension arrays at least for writing feather
, arrow
and parquet
files. We should add tests for these.
This project has been archived as development has ceased around 2021.
With the support of Apache Arrow-backed extension arrays in pandas
, the major goal of this project has been fulfilled.