Skip to content

Commit 0cc5616

Browse files
committed
ENH: parser API changes, added parse_dates options, address GH #225, #226
1 parent 99a4400 commit 0cc5616

File tree

7 files changed

+182
-103
lines changed

7 files changed

+182
-103
lines changed

RELEASE.rst

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,32 @@ pandas 0.4.4
1010

1111
**Release date:** not yet released
1212

13+
**New features / modules**
14+
15+
- Added `parse_dates` option to `read_csv` and `read_table` methods to
16+
optionally try to parse dates in the index columns
17+
- Added ability to join on multiple columns in `DataFrame.join` (GH #214)
18+
19+
**API Changes**
20+
21+
- `read_table`, `read_csv`, and `ExcelFile.parse` default argument for
22+
`index_col` is now None. To use one or more of the columns as the resulting
23+
DataFrame's index, these must be explicitly specified now
24+
- Parsing functions no longer parse dates by default (GH #225)
25+
1326
**Improvements to existing features**
1427

1528
- Refactored merging / joining code into a tidy class and disabled unnecessary
1629
computations in the float/object case, thus getting about 10% better
1730
performance
31+
- Improved speed of `DataFrame.xs` on mixed-type DataFrame objects by about
32+
5x, regression from 0.3.0
33+
34+
**Bug fixes**
35+
36+
- Worked around matplotlib "bug" in which series[:, np.newaxis] fails. Should
37+
be reported upstream to matplotlib (GH #224)
38+
1839

1940
pandas 0.4.3
2041
============

pandas/core/common.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -518,6 +518,9 @@ def _asarray_tuplesafe(values, dtype=None):
518518
if not isinstance(values, (list, tuple, np.ndarray)):
519519
values = list(values)
520520

521+
if isinstance(values, list) and dtype == np.object_:
522+
return lib.list_to_object_array(values)
523+
521524
result = np.asarray(values, dtype=dtype)
522525

523526
if issubclass(result.dtype.type, basestring):

pandas/core/frame.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -430,7 +430,8 @@ def to_records(self, index=True):
430430
return np.rec.fromarrays(arrays, names=names)
431431

432432
@classmethod
433-
def from_csv(cls, path, header=0, delimiter=',', index_col=0):
433+
def from_csv(cls, path, header=0, delimiter=',', index_col=0,
434+
parse_dates=True):
434435
"""
435436
Read delimited file into DataFrame
436437
@@ -447,16 +448,15 @@ def from_csv(cls, path, header=0, delimiter=',', index_col=0):
447448
Notes
448449
-----
449450
Will attempt to convert index to datetimes for time series
450-
data. Use read_csv for more options
451+
data. Use read_table for more options
451452
452453
Returns
453454
-------
454455
y : DataFrame
455456
"""
456457
from pandas.io.parsers import read_table
457-
df = read_table(path, header=header, sep=delimiter,
458-
index_col=index_col)
459-
return df
458+
return read_table(path, header=header, sep=delimiter,
459+
parse_dates=parse_dates, index_col=index_col)
460460

461461
def to_sparse(self, fill_value=None, kind='block'):
462462
"""

pandas/io/parsers.py

Lines changed: 107 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -9,36 +9,10 @@
99
from pandas.core.index import Index, MultiIndex
1010
from pandas.core.frame import DataFrame
1111

12-
def read_csv(filepath_or_buffer, sep=None, header=0, skiprows=None, index_col=0,
13-
na_values=None, date_parser=None, names=None):
14-
"""
15-
Read CSV file into DataFrame
1612

17-
Parameters
18-
----------
19-
filepath_or_buffer : string or file handle / StringIO
20-
sep : string, default None
21-
Delimiter to use. By default will try to automatically determine
22-
this
23-
header : int, default 0
24-
Row to use for the column labels of the parsed DataFrame
25-
skiprows : list-like
26-
Row numbers to skip (0-indexed)
27-
index_col : int or sequence., default 0
28-
Column to use as the row labels of the DataFrame. Pass None if there is
29-
no such column. If a sequence is given, a MultiIndex is used.
30-
na_values : list-like, default None
31-
List of additional strings to recognize as NA/NaN
32-
date_parser : function
33-
Function to use for converting dates to strings. Defaults to
34-
dateutil.parser
35-
names : array-like
36-
List of column names
37-
38-
Returns
39-
-------
40-
parsed : DataFrame
41-
"""
13+
def read_csv(filepath_or_buffer, sep=None, header=0, index_col=None, names=None,
14+
skiprows=None, na_values=None, parse_dates=False,
15+
date_parser=None):
4216
import csv
4317

4418
if hasattr(filepath_or_buffer, 'read'):
@@ -71,43 +45,77 @@ def read_csv(filepath_or_buffer, sep=None, header=0, skiprows=None, index_col=0,
7145
else:
7246
lines = [l for l in reader]
7347
f.close()
74-
return _simple_parser(lines, header=header, indexCol=index_col,
75-
colNames=names, na_values=na_values,
76-
date_parser=date_parser)
7748

78-
def read_table(filepath_or_buffer, sep='\t', header=0, skiprows=None,
79-
index_col=0, na_values=None, date_parser=None, names=None):
80-
"""
81-
Read delimited file into DataFrame
49+
if date_parser is not None:
50+
parse_dates = True
8251

83-
Parameters
84-
----------
85-
filepath_or_buffer : string or file handle
86-
sep : string, default '\t'
87-
Delimiter to use
88-
header : int, default 0
89-
Row to use for the column labels of the parsed DataFrame
90-
skiprows : list-like
91-
Row numbers to skip (0-indexed)
92-
index_col : int or sequence, default 0
93-
Column to use as the row labels of the DataFrame. Pass None if there is
94-
no such column. If a sequence is given, a MultiIndex is used.
95-
na_values : list-like, default None
96-
List of additional strings to recognize as NA/NaN
97-
date_parser : function
98-
Function to use for converting dates to strings. Defaults to
99-
dateutil.parser
100-
names : array-like
101-
List of column names
102-
103-
Returns
104-
-------
105-
parsed : DataFrame
106-
"""
107-
return read_csv(filepath_or_buffer, sep, header, skiprows,
108-
index_col, na_values, date_parser, names)
52+
return _simple_parser(lines,
53+
header=header,
54+
index_col=index_col,
55+
colNames=names,
56+
na_values=na_values,
57+
parse_dates=parse_dates,
58+
date_parser=date_parser)
10959

110-
def _simple_parser(lines, colNames=None, header=0, indexCol=0,
60+
61+
def read_table(filepath_or_buffer, sep='\t', header=0, index_col=None,
62+
names=None, skiprows=None, na_values=None, parse_dates=False,
63+
date_parser=None):
64+
return read_csv(filepath_or_buffer, sep=sep, header=header,
65+
skiprows=skiprows, index_col=index_col,
66+
na_values=na_values, date_parser=date_parser,
67+
names=names, parse_dates=parse_dates)
68+
69+
_parser_params = """Parameters
70+
----------
71+
filepath_or_buffer : string or file handle / StringIO
72+
%s
73+
header : int, default 0
74+
Row to use for the column labels of the parsed DataFrame
75+
skiprows : list-like
76+
Row numbers to skip (0-indexed)
77+
index_col : int or sequence, default None
78+
Column to use as the row labels of the DataFrame. If a sequence is
79+
given, a MultiIndex is used.
80+
na_values : list-like, default None
81+
List of additional strings to recognize as NA/NaN
82+
parse_dates : boolean, default False
83+
Attempt to parse dates in the index column(s)
84+
date_parser : function
85+
Function to use for converting strings to dates. Defaults to
86+
dateutil.parser
87+
names : array-like
88+
List of column names"""
89+
90+
_csv_sep = """sep : string, default None
91+
Delimiter to use. By default will try to automatically determine
92+
this"""
93+
94+
_table_sep = """sep : string, default \\t (tab-stop)
95+
Delimiter to use"""
96+
97+
read_csv.__doc__ = """
98+
Read CSV (comma-separated) file into DataFrame
99+
100+
%s
101+
102+
Returns
103+
-------
104+
parsed : DataFrame
105+
""" % (_parser_params % _csv_sep)
106+
107+
read_table.__doc__ = """
108+
Read delimited file into DataFrame
109+
110+
%s
111+
112+
Returns
113+
-------
114+
parsed : DataFrame
115+
""" % (_parser_params % _table_sep)
116+
117+
118+
def _simple_parser(lines, colNames=None, header=0, index_col=0,
111119
na_values=None, date_parser=None, parse_dates=True):
112120
"""
113121
Workhorse function for processing nested list into DataFrame
@@ -142,30 +150,48 @@ def _simple_parser(lines, colNames=None, header=0, indexCol=0,
142150
zipped_content = zip(*content)
143151

144152
if len(content) == 0: # pragma: no cover
145-
raise Exception('No content to parse')
153+
if index_col is not None:
154+
if np.isscalar(index_col):
155+
index = Index([], name=columns.pop(index_col))
156+
else:
157+
cp_cols = list(columns)
158+
names = []
159+
for i in index_col:
160+
name = cp_cols[i]
161+
columns.remove(name)
162+
names.append(name)
163+
index = MultiIndex.fromarrays([[]] * len(index_col),
164+
names=names)
165+
else:
166+
index = Index([])
167+
168+
return DataFrame(index=index, columns=columns)
169+
170+
if index_col is None and len(content[0]) == len(columns) + 1:
171+
index_col = 0
146172

147173
# no index column specified, so infer that's what is wanted
148-
if indexCol is not None:
149-
if np.isscalar(indexCol):
150-
if indexCol == 0 and len(content[0]) == len(columns) + 1:
174+
if index_col is not None:
175+
if np.isscalar(index_col):
176+
if index_col == 0 and len(content[0]) == len(columns) + 1:
151177
index = zipped_content[0]
152178
zipped_content = zipped_content[1:]
153179
else:
154-
index = zipped_content.pop(indexCol)
155-
columns.pop(indexCol)
180+
index = zipped_content.pop(index_col)
181+
columns.pop(index_col)
156182
else: # given a list of index
157183
idx_names = []
158184
index = []
159-
for idx in indexCol:
185+
for idx in index_col:
160186
idx_names.append(columns[idx])
161187
index.append(zipped_content[idx])
162188
#remove index items from content and columns, don't pop in loop
163-
for i in range(len(indexCol)):
189+
for i in range(len(index_col)):
164190
columns.remove(idx_names[i])
165191
zipped_content.remove(index[i])
166192

167193

168-
if np.isscalar(indexCol):
194+
if np.isscalar(index_col):
169195
if parse_dates:
170196
index = _try_parse_dates(index, parser=date_parser)
171197
index = Index(_maybe_convert_int(np.array(index, dtype=object)))
@@ -232,9 +258,6 @@ def _maybe_convert_int(arr):
232258
return arr
233259

234260
def _maybe_convert_int_mindex(index, parse_dates, date_parser):
235-
if len(index) == 0:
236-
return index
237-
238261
for i in range(len(index)):
239262
try:
240263
int(index[i][0])
@@ -298,8 +321,8 @@ def __init__(self, path):
298321
def __repr__(self):
299322
return object.__repr__(self)
300323

301-
def parse(self, sheetname, header=0, skiprows=None, index_col=0,
302-
na_values=None):
324+
def parse(self, sheetname, header=0, skiprows=None, index_col=None,
325+
parse_dates=False, date_parser=None, na_values=None):
303326
"""
304327
Read Excel table into DataFrame
305328
@@ -348,7 +371,8 @@ def parse(self, sheetname, header=0, skiprows=None, index_col=0,
348371
value = datetime(*dt)
349372
row.append(value)
350373
data.append(row)
351-
return _simple_parser(data, header=header, indexCol=index_col,
374+
return _simple_parser(data, header=header, index_col=index_col,
375+
parse_dates=parse_dates, date_parser=date_parser,
352376
na_values=na_values)
353377

354378
#-------------------------------------------------------------------------------
@@ -363,7 +387,8 @@ def parseCSV(filepath, header=0, skiprows=None, indexCol=0,
363387
"""
364388
warnings.warn("parseCSV is deprecated. Use read_csv instead", FutureWarning)
365389
return read_csv(filepath, header=header, skiprows=skiprows,
366-
index_col=indexCol, na_values=na_values)
390+
index_col=indexCol, na_values=na_values,
391+
parse_dates=True)
367392

368393
def parseText(filepath, sep='\t', header=0,
369394
indexCol=0, colNames=None): # pragma: no cover
@@ -374,7 +399,7 @@ def parseText(filepath, sep='\t', header=0,
374399
warnings.warn("parseText is deprecated. Use read_table instead",
375400
FutureWarning)
376401
return read_table(filepath, sep=sep, header=header, index_col=indexCol,
377-
names=colNames)
402+
names=colNames, parse_dates=True)
378403

379404

380405
def parseExcel(filepath, header=None, indexCol=0,
@@ -385,6 +410,7 @@ def parseExcel(filepath, header=None, indexCol=0,
385410
warnings.warn("parseExcel is deprecated. Use the ExcelFile class instead",
386411
FutureWarning)
387412
excel_file = ExcelFile(filepath)
388-
return excel_file.parse(sheetname, header=header, index_col=indexCol)
413+
return excel_file.parse(sheetname, header=header, index_col=indexCol,
414+
parse_dates=True)
389415

390416

0 commit comments

Comments
 (0)