The following code
import numpy as np
import pandas as pd
# One-row frame whose three columns all hold plain strings.
df1 = pd.DataFrame({'col1': ['a'], 'col2': ['b'], 'col3': ['15/12/2001']})

# Copy the columns one at a time into a fresh frame; col3 stays a string here.
df2 = pd.DataFrame()
df2['col1'] = df1.col1
df2['col2'] = df1.col2
df2['col3'] = df1.col3


def convert(r):
    """Return the fixed 2-tuple ``('a', 'b')``; the row ``r`` is ignored."""
    return 'a', 'b'


# Call convert once per row (axis=1).
df2.apply(convert, axis=1)
outputs:
0    (a, b)
dtype: object
In contrast, the following code
import numpy as np
import pandas as pd
# Same one-row frame as before: three string columns.
df1 = pd.DataFrame({'col1': ['a'], 'col2': ['b'], 'col3': ['15/12/2001']})

# Copy the columns into a fresh frame, but this time parse col3 as a date
# (dayfirst=True so that '15/12/2001' is read as 15 December 2001).
df2 = pd.DataFrame()
df2['col1'] = df1.col1
df2['col2'] = df1.col2
df2['col3'] = pd.to_datetime(df1['col3'], dayfirst=True)


def convert(r):
    """Return the fixed 2-tuple ``('a', 'b')``; the row ``r`` is ignored."""
    return 'a', 'b'


# Call convert once per row (axis=1). With Python 3.4.3 and pandas 0.18.0
# this is the call that raises the ValueError shown in the traceback below.
df2.apply(convert, axis=1)
raises an error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/home/michel/.virtualenvs/stsisi/lib/python3.4/site-packages/pandas/core/internals.py in create_block_manager_from_arrays(arrays, names, axes)
3928 blocks = form_blocks(arrays, names, axes)
-> 3929 mgr = BlockManager(blocks, axes)
3930 mgr._consolidate_inplace()
/home/michel/.virtualenvs/stsisi/lib/python3.4/site-packages/pandas/core/internals.py in __init__(self, blocks, axes, do_integrity_check, fastpath)
2536 if do_integrity_check:
-> 2537 self._verify_integrity()
2538
/home/michel/.virtualenvs/stsisi/lib/python3.4/site-packages/pandas/core/internals.py in _verify_integrity(self)
2746 if block._verify_integrity and block.shape[1:] != mgr_shape[1:]:
-> 2747 construction_error(tot_items, block.shape[1:], self.axes)
2748 if len(self.items) != tot_items:
/home/michel/.virtualenvs/stsisi/lib/python3.4/site-packages/pandas/core/internals.py in construction_error(tot_items, block_shape, axes, e)
3898 raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
-> 3899 passed, implied))
3900
ValueError: Shape of passed values is (1, 2), indices imply (1, 3)
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-93-e8fcbf9d6e00> in <module>()
12 return 'a', 'b'
13
---> 14 df2.apply(convert, axis=1)
/home/michel/.virtualenvs/stsisi/lib/python3.4/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
4040 if reduce is None:
4041 reduce = True
-> 4042 return self._apply_standard(f, axis, reduce=reduce)
4043 else:
4044 return self._apply_broadcast(f, axis)
/home/michel/.virtualenvs/stsisi/lib/python3.4/site-packages/pandas/core/frame.py in _apply_standard(self, func, axis, ignore_failures, reduce)
4153 index = None
4154
-> 4155 result = self._constructor(data=results, index=index)
4156 result.columns = res_index
4157
/home/michel/.virtualenvs/stsisi/lib/python3.4/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
221 dtype=dtype, copy=copy)
222 elif isinstance(data, dict):
--> 223 mgr = self._init_dict(data, index, columns, dtype=dtype)
224 elif isinstance(data, ma.MaskedArray):
225 import numpy.ma.mrecords as mrecords
/home/michel/.virtualenvs/stsisi/lib/python3.4/site-packages/pandas/core/frame.py in _init_dict(self, data, index, columns, dtype)
357 arrays = [data[k] for k in keys]
358
--> 359 return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
360
361 def _init_ndarray(self, values, index, columns, dtype=None, copy=False):
/home/michel/.virtualenvs/stsisi/lib/python3.4/site-packages/pandas/core/frame.py in _arrays_to_mgr(arrays, arr_names, index, columns, dtype)
5248 axes = [_ensure_index(columns), _ensure_index(index)]
5249
-> 5250 return create_block_manager_from_arrays(arrays, arr_names, axes)
5251
5252
/home/michel/.virtualenvs/stsisi/lib/python3.4/site-packages/pandas/core/internals.py in create_block_manager_from_arrays(arrays, names, axes)
3931 return mgr
3932 except ValueError as e:
-> 3933 construction_error(len(arrays), arrays[0].shape, axes, e)
3934
3935
/home/michel/.virtualenvs/stsisi/lib/python3.4/site-packages/pandas/core/internals.py in construction_error(tot_items, block_shape, axes, e)
3897 raise ValueError("Empty data passed with indices specified.")
3898 raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
-> 3899 passed, implied))
3900
3901
ValueError: Shape of passed values is (1, 2), indices imply (1, 3)
It looks to me like a bug involving `pandas.to_datetime()` and `pandas.DataFrame.apply()`. Am I missing something?
I ran it with Python 3.4.3 and pandas 0.18.0.
Answer
The error you're encountering happens because of a mismatch between the number of columns in your DataFrame and the structure of the values returned by the `convert` function when you use `apply()`.
In the second code snippet, you are applying the `convert` function to `df2`, which has three columns (`col1`, `col2`, `col3`), but the `convert` function is returning a tuple with only two elements (`'a', 'b'`). This causes pandas to throw a `ValueError`, because pandas expects the result of the `apply` function to have the same number of elements as the original DataFrame has columns, i.e., it expects a result with three columns, but it only receives a result with two elements.
Why This Happens:
- In the first case, when all columns in `df2` are strings, the result of `apply()` can be a simple tuple with two elements (`'a', 'b'`), which pandas can handle, because no type conversion or shape mismatch occurs.
- In the second case, when one of the columns (`col3`) is a datetime object (after using `pd.to_datetime()`), the `apply()` function tries to align the output of the `convert` function with the DataFrame. Since the `convert` function returns only two values (`'a'` and `'b'`), but `df2` expects three values (for `col1`, `col2`, and `col3`), pandas raises the `ValueError`.
Fixing the Issue:
To fix this, you need to ensure that the `convert` function always returns the same number of elements as there are columns in the DataFrame.
Here are two approaches you can take:
1. Modify the `convert` function to return three values (one for each column):
import numpy as np
import pandas as pd
# One-row frame with three string columns.
df1 = pd.DataFrame({'col1': ['a'], 'col2': ['b'], 'col3': ['15/12/2001']})

# Copy the columns into a fresh frame, parsing col3 as a day-first date.
df2 = pd.DataFrame()
df2['col1'] = df1.col1
df2['col2'] = df1.col2
df2['col3'] = pd.to_datetime(df1['col3'], dayfirst=True)


def convert(r):
    """Return a fixed 3-tuple, one value per column of ``df2``; ``r`` is ignored."""
    return 'a', 'b', 'c'  # Returning 3 values


# Call convert once per row (axis=1).
df2.apply(convert, axis=1)
This way, `convert` returns a tuple with three elements (`'a'`, `'b'`, and `'c'`), which matches the three columns in `df2`, and the code will work without throwing an error.
2. Use `pd.Series` to ensure the result matches the DataFrame's column count:
You can also return a `pandas.Series` from the `convert` function to ensure that the output aligns correctly with the columns of the DataFrame. This is useful if you want to return a variable number of values or if the function involves more complex operations.
import numpy as np
import pandas as pd
# One-row frame with three string columns.
df1 = pd.DataFrame({'col1': ['a'], 'col2': ['b'], 'col3': ['15/12/2001']})

# Copy the columns into a fresh frame, parsing col3 as a day-first date.
df2 = pd.DataFrame()
df2['col1'] = df1.col1
df2['col2'] = df1.col2
df2['col3'] = pd.to_datetime(df1['col3'], dayfirst=True)


def convert(r):
    """Return a fixed Series labelled with the columns of ``df2``; ``r`` is ignored.

    Note that the index is read from the module-level ``df2``, not from ``r``.
    """
    return pd.Series(['a', 'b', 'c'], index=df2.columns)  # Match the columns


# Call convert once per row (axis=1).
df2.apply(convert, axis=1)
This approach ensures that the returned `pd.Series` matches the columns of the DataFrame (`col1`, `col2`, `col3`), and pandas will align the values correctly.
Conclusion:
The core issue is that the output of your `convert` function doesn't match the number of columns in the DataFrame (`df2`). To fix the error, make sure that the `convert` function either returns the same number of values as there are columns in `df2` or returns a `pd.Series` that matches the column names.
Let me know if you need further clarification or additional help!