Closed
Description
-
I have checked that this issue has not already been reported.
-
I have confirmed this bug exists on the latest version of pandas.
-
I have confirmed this bug exists on the master branch of pandas.
Reproducible Example
import pandas as pd
df = pd.DataFrame( { 'element': [ 't_mean','t_mean','t_mean','r','r','r' ],
'time': [ 1,2,3,1,2,3],
'54511': [226,215,232,50,10,305],
'54514': [215,220,220,10,9,111],
} )
df2 = df.set_index( ['element','time'] )
df2.columns.set_names( 'station', inplace=True )
df3 = df2.unstack( 'element' )
print( df3 )
df4 = df3.stack( 'station' )
# everything is ok until now.
# will raise IndexError : index 6 is out of bounds for axis 0 with size 6
df32 = df3.astype( pd.Int64Dtype() ) ### here
df42 = df32.stack( 'station' )
Issue Description
bug exists in pandas/core/reshape/reshape.py :
def _stack_multi_columns(frame, level_num=-1, dropna=True):
def _convert_level_number(level_num, columns):
"""
Logic for converting the level number to something we can safely pass
to swaplevel.
If `level_num` matches a column name return the name from
position `level_num`, otherwise return `level_num`.
"""
if level_num in columns.names:
return columns.names[level_num]
return level_num
this = frame.copy()
# this makes life much simpler
if level_num != frame.columns.nlevels - 1:
# roll levels to put selected level at end
roll_columns = this.columns
for i in range(level_num, frame.columns.nlevels - 1):
# Need to check if the ints conflict with level names
lev1 = _convert_level_number(i, roll_columns)
lev2 = _convert_level_number(i + 1, roll_columns)
roll_columns = roll_columns.swaplevel(lev1, lev2)
this.columns = roll_columns
if not this.columns._is_lexsorted():
# Workaround the edge case where 0 is one of the column names,
# which interferes with trying to sort based on the first
# level
level_to_sort = _convert_level_number(0, this.columns)
this = this.sort_index(level=level_to_sort, axis=1)
new_columns = _stack_multi_column_index(this.columns)
# time to ravel the values
new_data = {}
level_vals = this.columns.levels[-1]
level_codes = sorted(set(this.columns.codes[-1]))
level_vals_nan = level_vals.insert(len(level_vals), None)
level_vals_used = np.take(level_vals_nan, level_codes)
levsize = len(level_codes)
drop_cols = []
for key in new_columns:
try:
loc = this.columns.get_loc(key)
except KeyError:
drop_cols.append(key)
continue
# can make more efficient?
# we almost always return a slice
# but if unsorted can get a boolean
# indexer
if not isinstance(loc, slice):
slice_len = len(loc)
else:
slice_len = loc.stop - loc.start
if slice_len != levsize:
chunk = this.loc[:, this.columns[loc]]
chunk.columns = level_vals_nan.take(chunk.columns.codes[-1])
value_slice = chunk.reindex(columns=level_vals_used).values
else:
if frame._is_homogeneous_type and is_extension_array_dtype(
frame.dtypes.iloc[0]
):
dtype = this[this.columns[loc]].dtypes.iloc[0]
subset = this[this.columns[loc]]
value_slice = dtype.construct_array_type()._concat_same_type(
[x._values for _, x in subset.items()]
)
N, K = this.shape #----------------> !!!! look at here, is it a bug?
#N, K = subset.shape #
idx = np.arange(N * K).reshape(K, N).T.ravel()
value_slice = value_slice.take(idx)
elif frame._is_mixed_type:
value_slice = this[this.columns[loc]].values
else:
value_slice = this.values[:, loc]
if value_slice.ndim > 1:
# i.e. not extension
value_slice = value_slice.ravel()
new_data[key] = value_slice
if len(drop_cols) > 0:
new_columns = new_columns.difference(drop_cols)
N = len(this)
if isinstance(this.index, MultiIndex):
new_levels = list(this.index.levels)
new_names = list(this.index.names)
new_codes = [lab.repeat(levsize) for lab in this.index.codes]
else:
old_codes, old_levels = factorize_from_iterable(this.index)
new_levels = [old_levels]
new_codes = [old_codes.repeat(levsize)]
new_names = [this.index.name] # something better?
new_levels.append(level_vals)
new_codes.append(np.tile(level_codes, N))
new_names.append(frame.columns.names[level_num])
new_index = MultiIndex(
levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
)
result = frame._constructor(new_data, index=new_index, columns=new_columns)
# more efficient way to go about this? can do the whole masking biz but
# will only save a small amount of time...
if dropna:
result = result.dropna(axis=0, how="all")
return result
Expected Behavior
Don't throw exception
Installed Versions
pandas 1.3.3