Skip to content

Commit b7492fe

Browse files
adgaudiojreback
authored andcommitted
ENH: Index(...) constructor creates a MultiIndex when appropriate.
- Series and DataFrame constructor autodetect when index/columns should be MultiIndex - prevents some seg faults in calls to cython funcs - add tupleize_cols kwarg and update tests to git PR comments - support name= xor names= in Index(tuples, ....) constructor - docs BUG: Index.identical(other) didn't compare type(other) to type(self)
1 parent f8c566c commit b7492fe

File tree

12 files changed

+133
-29
lines changed

12 files changed

+133
-29
lines changed

doc/source/basics.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -761,6 +761,7 @@ This is equivalent to the following
761761
762762
.. _basics.reindexing:
763763

764+
764765
Reindexing and altering labels
765766
------------------------------
766767

doc/source/indexing.rst

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1643,15 +1643,21 @@ can think of ``MultiIndex`` as an array of tuples where each tuple is unique. A
16431643
``MultiIndex`` can be created from a list of arrays (using
16441644
``MultiIndex.from_arrays``), an array of tuples (using
16451645
``MultiIndex.from_tuples``), or a crossed set of iterables (using
1646-
``MultiIndex.from_product``).
1646+
``MultiIndex.from_product``). The ``Index`` constructor will attempt to return
1647+
a ``MultiIndex`` when it is passed a list of tuples. The following examples
1648+
demonstrate different ways to initialize MultiIndexes.
1649+
16471650

16481651
.. ipython:: python
16491652
16501653
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
16511654
['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
16521655
tuples = list(zip(*arrays))
16531656
tuples
1654-
index = MultiIndex.from_tuples(tuples, names=['first', 'second'])
1657+
1658+
multi_index = MultiIndex.from_tuples(tuples, names=['first', 'second'])
1659+
multi_index
1660+
16551661
s = Series(randn(8), index=multi_index)
16561662
s
16571663

doc/source/release.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,9 @@ pandas 0.14.0
5353
New features
5454
~~~~~~~~~~~~
5555

56+
- ``Index`` returns a ``MultiIndex`` if passed a list of tuples.
57+
``DataFrame(dict)`` and ``Series(dict)`` create ``MultiIndex``
58+
columns and index where applicable (:issue:`4187`)
5659
- Hexagonal bin plots from ``DataFrame.plot`` with ``kind='hexbin'`` (:issue:`5478`)
5760
- Added the ``sym_diff`` method to ``Index`` (:issue:`5543`)
5861
- Added ``to_julian_date`` to ``TimeStamp`` and ``DatetimeIndex``. The Julian
@@ -264,6 +267,8 @@ Bug Fixes
264267
~~~~~~~~~
265268

266269
- Bug in Series ValueError when index doesn't match data (:issue:`6532`)
270+
- Prevent segfault due to MultiIndex not being supported in HDFStore table
271+
format (:issue:`1848`)
267272
- Bug in ``pd.DataFrame.sort_index`` where mergesort wasn't stable when ``ascending=False`` (:issue:`6399`)
268273
- Bug in ``pd.tseries.frequencies.to_offset`` when argument has leading zeroes (:issue:`6391`)
269274
- Bug in version string gen. for dev versions with shallow clones / install from tarball (:issue:`6127`)

doc/source/v0.14.0.txt

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,18 @@ Deprecations
405405
Enhancements
406406
~~~~~~~~~~~~
407407

408+
- ``DataFrame`` and ``Series`` will create a ``MultiIndex`` when their index or columns are given as tuples (for example, a dict keyed by tuples)
409+
410+
.. ipython:: python
411+
412+
Series({('a', 'b'): 1, ('a', 'a'): 0,
413+
('a', 'c'): 2, ('b', 'a'): 3, ('b', 'b'): 4})
414+
pandas.DataFrame({('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
415+
('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4},
416+
('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6},
417+
('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8},
418+
('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}})
419+
408420
- ``DataFrame.to_latex`` now takes a longtable keyword, which if True will return a table in a longtable environment. (:issue:`6617`)
409421
- ``pd.read_clipboard`` will, if 'sep' is unspecified, try to detect data copied from a spreadsheet
410422
and parse accordingly. (:issue:`6223`)

pandas/core/frame.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -317,9 +317,9 @@ def _init_dict(self, data, index, columns, dtype=None):
317317
else:
318318
keys = list(data.keys())
319319
if not isinstance(data, OrderedDict):
320-
keys = _try_sort(list(data.keys()))
320+
keys = _try_sort(keys)
321321
columns = data_names = Index(keys)
322-
arrays = [data[k] for k in columns]
322+
arrays = [data[k] for k in keys]
323323

324324
return _arrays_to_mgr(arrays, data_names, index, columns,
325325
dtype=dtype)
@@ -4496,7 +4496,7 @@ def extract_index(data):
44964496
index = None
44974497
if len(data) == 0:
44984498
index = Index([])
4499-
elif len(data) > 0 and index is None:
4499+
elif len(data) > 0:
45004500
raw_lengths = []
45014501
indexes = []
45024502

pandas/core/groupby.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1164,8 +1164,7 @@ def groups(self):
11641164
else:
11651165
to_groupby = lzip(*(ping.grouper for ping in self.groupings))
11661166
to_groupby = Index(to_groupby)
1167-
1168-
return self.axis.groupby(to_groupby)
1167+
return self.axis.groupby(to_groupby.values)
11691168

11701169
@cache_readonly
11711170
def group_info(self):

pandas/core/index.py

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@ class Index(IndexOpsMixin, FrozenNDArray):
7171
Make a copy of input ndarray
7272
name : object
7373
Name to be stored in the index
74+
tupleize_cols : bool (default: True)
75+
When True, attempt to create a MultiIndex if possible
7476
7577
Notes
7678
-----
@@ -99,7 +101,7 @@ class Index(IndexOpsMixin, FrozenNDArray):
99101
_engine_type = _index.ObjectEngine
100102

101103
def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False,
102-
**kwargs):
104+
tupleize_cols=True, **kwargs):
103105

104106
# no class inference!
105107
if fastpath:
@@ -139,8 +141,19 @@ def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False,
139141

140142
elif np.isscalar(data):
141143
cls._scalar_data_error(data)
142-
143144
else:
145+
if tupleize_cols and isinstance(data, list) and data:
146+
try:
147+
sorted(data)
148+
has_mixed_types = False
149+
except (TypeError, UnicodeDecodeError):
150+
has_mixed_types = True # python3 only
151+
if isinstance(data[0], tuple) and not has_mixed_types:
152+
try:
153+
return MultiIndex.from_tuples(
154+
data, names=name or kwargs.get('names'))
155+
except (TypeError, KeyError):
156+
pass # python2 - MultiIndex fails on mixed types
144157
# other iterable of some kind
145158
subarr = com._asarray_tuplesafe(data, dtype=object)
146159

@@ -808,7 +821,8 @@ def identical(self, other):
808821
"""
809822
return (self.equals(other) and
810823
all((getattr(self, c, None) == getattr(other, c, None)
811-
for c in self._comparables)))
824+
for c in self._comparables)) and
825+
type(self) == type(other))
812826

813827
def asof(self, label):
814828
"""
@@ -1743,11 +1757,11 @@ def insert(self, loc, item):
17431757
-------
17441758
new_index : Index
17451759
"""
1746-
index = np.asarray(self)
1747-
# because numpy is fussy with tuples
1748-
item_idx = Index([item], dtype=index.dtype)
1749-
new_index = np.concatenate((index[:loc], item_idx, index[loc:]))
1750-
return Index(new_index, name=self.name)
1760+
_self = np.asarray(self)
1761+
item_idx = Index([item], dtype=self.dtype).values
1762+
idx = np.concatenate(
1763+
(_self[:loc], item_idx, _self[loc:]))
1764+
return Index(idx, name=self.name)
17511765

17521766
def drop(self, labels):
17531767
"""

pandas/core/series.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@
2222
_values_from_object,
2323
_possibly_cast_to_datetime, _possibly_castable,
2424
_possibly_convert_platform,
25+
_try_sort,
2526
ABCSparseArray, _maybe_match_name,
2627
_ensure_object, SettingWithCopyError)
27-
2828
from pandas.core.index import (Index, MultiIndex, InvalidIndexError,
2929
_ensure_index)
3030
from pandas.core.indexing import (
@@ -180,7 +180,7 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
180180
if isinstance(data, OrderedDict):
181181
index = Index(data)
182182
else:
183-
index = Index(sorted(data))
183+
index = Index(_try_sort(data))
184184
try:
185185
if isinstance(index, DatetimeIndex):
186186
# coerce back to datetime objects for lookup

pandas/src/inference.pyx

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@ def infer_dtype(object _values):
5858
_values = list(_values)
5959
values = list_to_object_array(_values)
6060

61+
values = getattr(values, 'values', values)
62+
6163
val_kind = values.dtype.type
6264
if val_kind in _TYPE_MAP:
6365
return _TYPE_MAP[val_kind]
@@ -1029,6 +1031,8 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan):
10291031
# kludge, for Series
10301032
return np.empty(0, dtype='f8')
10311033

1034+
keys = getattr(keys, 'values', keys)
1035+
10321036
for i in range(n):
10331037
val = util.get_value_1d(keys, i)
10341038
if val in mapping:

pandas/tests/test_frame.py

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -181,12 +181,12 @@ def test_getitem_list(self):
181181
# tuples
182182
df = DataFrame(randn(8, 3),
183183
columns=Index([('foo', 'bar'), ('baz', 'qux'),
184-
('peek', 'aboo')], name='sth'))
184+
('peek', 'aboo')], name=['sth', 'sth2']))
185185

186186
result = df[[('foo', 'bar'), ('baz', 'qux')]]
187187
expected = df.ix[:, :2]
188188
assert_frame_equal(result, expected)
189-
self.assertEqual(result.columns.name, 'sth')
189+
self.assertEqual(result.columns.names, ['sth', 'sth2'])
190190

191191
def test_setitem_list(self):
192192

@@ -2499,6 +2499,31 @@ def test_constructor_dict_of_tuples(self):
24992499
expected = DataFrame(dict((k, list(v)) for k, v in compat.iteritems(data)))
25002500
assert_frame_equal(result, expected, check_dtype=False)
25012501

2502+
def test_constructor_dict_multiindex(self):
2503+
check = lambda result, expected: tm.assert_frame_equal(
2504+
result, expected, check_dtype=True, check_index_type=True,
2505+
check_column_type=True, check_names=True)
2506+
d = {('a', 'a'): {('i', 'i'): 0, ('i', 'j'): 1, ('j', 'i'): 2},
2507+
('b', 'a'): {('i', 'i'): 6, ('i', 'j'): 5, ('j', 'i'): 4},
2508+
('b', 'c'): {('i', 'i'): 7, ('i', 'j'): 8, ('j', 'i'): 9}}
2509+
_d = sorted(d.items())
2510+
df = DataFrame(d)
2511+
expected = DataFrame(
2512+
[x[1] for x in _d],
2513+
index=MultiIndex.from_tuples([x[0] for x in _d])).T
2514+
expected.index = MultiIndex.from_tuples(expected.index)
2515+
check(df, expected)
2516+
2517+
d['z'] = {'y': 123., ('i', 'i'): 111, ('i', 'j'): 111, ('j', 'i'): 111}
2518+
_d.insert(0, ('z', d['z']))
2519+
expected = DataFrame(
2520+
[x[1] for x in _d],
2521+
index=Index([x[0] for x in _d], tupleize_cols=False)).T
2522+
expected.index = Index(expected.index, tupleize_cols=False)
2523+
df = DataFrame(d)
2524+
df = df.reindex(columns=expected.columns, index=expected.index)
2525+
check(df, expected)
2526+
25022527
def _check_basic_constructor(self, empty):
25032528
"mat: 2d matrix with shpae (3, 2) to input. empty - makes sized objects"
25042529
mat = empty((2, 3), dtype=float)
@@ -2922,8 +2947,8 @@ class CustomDict(dict):
29222947
def test_constructor_ragged(self):
29232948
data = {'A': randn(10),
29242949
'B': randn(8)}
2925-
assertRaisesRegexp(ValueError, 'arrays must all be same length',
2926-
DataFrame, data)
2950+
with assertRaisesRegexp(ValueError, 'arrays must all be same length'):
2951+
DataFrame(data)
29272952

29282953
def test_constructor_scalar(self):
29292954
idx = Index(lrange(3))
@@ -12105,7 +12130,8 @@ def test_index_namedtuple(self):
1210512130
IndexType = namedtuple("IndexType", ["a", "b"])
1210612131
idx1 = IndexType("foo", "bar")
1210712132
idx2 = IndexType("baz", "bof")
12108-
index = Index([idx1, idx2], name="composite_index")
12133+
index = Index([idx1, idx2],
12134+
name="composite_index", tupleize_cols=False)
1210912135
df = DataFrame([(1, 2), (3, 4)], index=index, columns=["A", "B"])
1211012136
self.assertEqual(df.ix[IndexType("foo", "bar")]["A"], 1)
1211112137

0 commit comments

Comments
 (0)