Skip to content

BUG: DataFrame.stack() raises IndexError for extension-array dtypes (e.g. Int64); the bug is in _stack_multi_columns() #43561

Closed
@david-shu

Description

@david-shu

  • I have checked that this issue has not already been reported.

  • I have confirmed this bug exists on the latest version of pandas.

  • I have confirmed this bug exists on the master branch of pandas.

Reproducible Example

import pandas as pd

# Long-format observations: two elements ("t_mean", "r") at three time
# steps, with one value column per station id.
df = pd.DataFrame(
    {
        "element": ["t_mean"] * 3 + ["r"] * 3,
        "time": [1, 2, 3] * 2,
        "54511": [226, 215, 232, 50, 10, 305],
        "54514": [215, 220, 220, 10, 9, 111],
    }
)

# Index by (element, time) and name the column axis "station".
df2 = df.set_index(["element", "time"]).rename_axis(columns="station")

# Move "element" from the rows into the columns -> (station, element) columns.
df3 = df2.unstack("element")
print(df3)

# With the default int64 dtype, stacking "station" back into the rows works.
df4 = df3.stack("station")

# Same frame with the nullable Int64 extension dtype: on affected versions
# (reported against pandas 1.3.3) the stack below raises
# "IndexError: index 6 is out of bounds for axis 0 with size 6".
df32 = df3.astype("Int64")
df42 = df32.stack("station")

Issue Description

The bug is in pandas/core/reshape/reshape.py, in the extension-array branch of `_stack_multi_columns` (marked with a comment below):

def _stack_multi_columns(frame, level_num=-1, dropna=True):
    """
    Stack one level of a frame's multi-level columns into the row index.

    The selected column level is rolled to the innermost position, the
    columns are sorted if they are not lexsorted, and then, for every
    remaining column key, the values of the matching column group are
    interleaved row by row into a single 1-D array.

    Parameters
    ----------
    frame : DataFrame
        Frame whose columns have several levels (``frame.columns.nlevels``
        and ``frame.columns.names`` are read).
    level_num : int, default -1
        Position of the column level to move into the index.
        NOTE(review): the roll loop uses ``range(level_num, nlevels - 1)``,
        so callers presumably pass a non-negative position -- confirm.
    dropna : bool, default True
        If True, rows of the result whose values are all missing are dropped.

    Returns
    -------
    DataFrame
        Built with ``frame._constructor``; its index is the original index
        with the stacked level appended as the innermost level.
    """

    def _convert_level_number(level_num, columns):
        """
        Logic for converting the level number to something we can safely pass
        to swaplevel.

        If `level_num` matches a column name return the name from
        position `level_num`, otherwise return `level_num`.
        """
        if level_num in columns.names:
            return columns.names[level_num]

        return level_num

    this = frame.copy()

    # this makes life much simpler
    if level_num != frame.columns.nlevels - 1:
        # roll levels to put selected level at end
        roll_columns = this.columns
        for i in range(level_num, frame.columns.nlevels - 1):
            # Need to check if the ints conflict with level names
            lev1 = _convert_level_number(i, roll_columns)
            lev2 = _convert_level_number(i + 1, roll_columns)
            roll_columns = roll_columns.swaplevel(lev1, lev2)
        this.columns = roll_columns

    if not this.columns._is_lexsorted():
        # Workaround the edge case where 0 is one of the column names,
        # which interferes with trying to sort based on the first
        # level
        level_to_sort = _convert_level_number(0, this.columns)
        this = this.sort_index(level=level_to_sort, axis=1)

    new_columns = _stack_multi_column_index(this.columns)

    # time to ravel the values
    new_data = {}
    level_vals = this.columns.levels[-1]
    level_codes = sorted(set(this.columns.codes[-1]))
    # Extra trailing None entry appended to the level values.
    level_vals_nan = level_vals.insert(len(level_vals), None)

    level_vals_used = np.take(level_vals_nan, level_codes)
    levsize = len(level_codes)
    drop_cols = []
    for key in new_columns:
        try:
            loc = this.columns.get_loc(key)
        except KeyError:
            drop_cols.append(key)
            continue

        # can make more efficient?
        # we almost always return a slice
        # but if unsorted can get a boolean
        # indexer
        if not isinstance(loc, slice):
            slice_len = len(loc)
        else:
            slice_len = loc.stop - loc.start

        if slice_len != levsize:
            # This key does not cover every used value of the stacked level:
            # relabel the chunk by the innermost level and reindex so the
            # missing combinations are filled in.
            chunk = this.loc[:, this.columns[loc]]
            chunk.columns = level_vals_nan.take(chunk.columns.codes[-1])
            value_slice = chunk.reindex(columns=level_vals_used).values
        else:
            if frame._is_homogeneous_type and is_extension_array_dtype(
                frame.dtypes.iloc[0]
            ):
                subset = this[this.columns[loc]]
                dtype = subset.dtypes.iloc[0]
                # Concatenate the subset's columns one after another ...
                value_slice = dtype.construct_array_type()._concat_same_type(
                    [x._values for _, x in subset.items()]
                )
                # ... then reorder so values are interleaved row by row.
                # The shape must be that of ``subset`` (length N * K matches
                # the concatenated array). Using ``this.shape`` here made K
                # the total number of columns, so ``take`` raised IndexError
                # whenever the frame had more columns than this subset.
                N, K = subset.shape
                idx = np.arange(N * K).reshape(K, N).T.ravel()
                value_slice = value_slice.take(idx)

            elif frame._is_mixed_type:
                value_slice = this[this.columns[loc]].values
            else:
                value_slice = this.values[:, loc]

        if value_slice.ndim > 1:
            # i.e. not extension
            value_slice = value_slice.ravel()

        new_data[key] = value_slice

    if drop_cols:
        new_columns = new_columns.difference(drop_cols)

    N = len(this)

    # Each original row is repeated once per used value of the stacked level.
    if isinstance(this.index, MultiIndex):
        new_levels = list(this.index.levels)
        new_names = list(this.index.names)
        new_codes = [lab.repeat(levsize) for lab in this.index.codes]
    else:
        old_codes, old_levels = factorize_from_iterable(this.index)
        new_levels = [old_levels]
        new_codes = [old_codes.repeat(levsize)]
        new_names = [this.index.name]  # something better?

    # The stacked column level becomes the innermost index level.
    new_levels.append(level_vals)
    new_codes.append(np.tile(level_codes, N))
    new_names.append(frame.columns.names[level_num])

    new_index = MultiIndex(
        levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
    )

    result = frame._constructor(new_data, index=new_index, columns=new_columns)

    # more efficient way to go about this? can do the whole masking biz but
    # will only save a small amount of time...
    if dropna:
        result = result.dropna(axis=0, how="all")

    return result

Expected Behavior

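`df32.stack('station')` should not raise an exception; it should return the same result as `df3.stack('station')`, with the Int64 dtype preserved.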

Installed Versions

pandas 1.3.3

Metadata

Metadata

Assignees

No one assigned

    Labels

    Bug · ExtensionArray (Extending pandas with custom dtypes or arrays.) · Reshaping (Concat, Merge/Join, Stack/Unstack, Explode)

    Type

    No type

    Projects

    No projects

    Milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions