diff --git a/.github/workflows/database.yml b/.github/workflows/database.yml index a2818c85ffa33..d9e78b197de7a 100644 --- a/.github/workflows/database.yml +++ b/.github/workflows/database.yml @@ -99,7 +99,7 @@ jobs: run: python ci/print_skipped.py - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1 + uses: codecov/codecov-action@v2 with: flags: unittests name: codecov-pandas diff --git a/.github/workflows/posix.yml b/.github/workflows/posix.yml index a132b1486ae12..36d410c3f5c4e 100644 --- a/.github/workflows/posix.yml +++ b/.github/workflows/posix.yml @@ -92,7 +92,7 @@ jobs: run: python ci/print_skipped.py - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1 + uses: codecov/codecov-action@v2 with: flags: unittests name: codecov-pandas diff --git a/.github/workflows/python-dev.yml b/.github/workflows/python-dev.yml index 85ef356e7fa14..94ab2b568b636 100644 --- a/.github/workflows/python-dev.yml +++ b/.github/workflows/python-dev.yml @@ -37,7 +37,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip setuptools wheel - pip install git+https://github.com/numpy/numpy.git + pip install -i https://pypi.anaconda.org/scipy-wheels-nightly/simple numpy pip install git+https://github.com/pytest-dev/pytest.git pip install git+https://github.com/nedbat/coveragepy.git pip install cython python-dateutil pytz hypothesis pytest-xdist pytest-cov @@ -74,7 +74,7 @@ jobs: coverage report -m - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1 + uses: codecov/codecov-action@v2 with: flags: unittests name: codecov-pandas diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d580fcf4fc545..e2e6fe51b5f8b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -110,7 +110,7 @@ repos: entry: python scripts/generate_pip_deps_from_conda.py files: ^(environment.yml|requirements-dev.txt)$ 
pass_filenames: false - additional_dependencies: [pyyaml] + additional_dependencies: [pyyaml, toml] - id: sync-flake8-versions name: Check flake8 version is synced across flake8, yesqa, and environment.yml language: python diff --git a/ci/deps/actions-37-db.yaml b/ci/deps/actions-37-db.yaml index cfdcf266236e6..9d680cb8338fd 100644 --- a/ci/deps/actions-37-db.yaml +++ b/ci/deps/actions-37-db.yaml @@ -15,7 +15,7 @@ dependencies: - beautifulsoup4 - botocore>=1.11 - dask - - fastparquet>=0.4.0, < 0.7.0 + - fastparquet>=0.4.0 - fsspec>=0.7.4, <2021.6.0 - gcsfs>=0.6.0 - geopandas @@ -25,7 +25,7 @@ dependencies: - flask - nomkl - numexpr - - numpy=1.17.* + - numpy=1.18.* - odfpy - openpyxl - pandas-gbq diff --git a/ci/deps/actions-38-locale.yaml b/ci/deps/actions-38-locale.yaml index 34a6860936550..dfed0df77a327 100644 --- a/ci/deps/actions-38-locale.yaml +++ b/ci/deps/actions-38-locale.yaml @@ -18,7 +18,7 @@ dependencies: - html5lib - ipython - jinja2 - - jedi<0.18.0 + - jedi - lxml - matplotlib<3.3.0 - moto diff --git a/ci/deps/actions-38-numpydev.yaml b/ci/deps/actions-38-numpydev.yaml index 6eed2daac0c3b..e943053f15600 100644 --- a/ci/deps/actions-38-numpydev.yaml +++ b/ci/deps/actions-38-numpydev.yaml @@ -11,11 +11,11 @@ dependencies: - hypothesis>=3.58.0 # pandas dependencies + - python-dateutil - pytz - pip - pip: - cython==0.29.21 # GH#34014 - - "git+git://github.com/dateutil/dateutil.git" - "--extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple" - "--pre" - "numpy" diff --git a/ci/deps/azure-windows-38.yaml b/ci/deps/azure-windows-38.yaml index 902daf102ccda..70aa46e8a5851 100644 --- a/ci/deps/azure-windows-38.yaml +++ b/ci/deps/azure-windows-38.yaml @@ -15,7 +15,7 @@ dependencies: # pandas dependencies - blosc - bottleneck - - fastparquet>=0.4.0, <0.7.0 + - fastparquet>=0.4.0 - flask - fsspec>=0.8.0, <2021.6.0 - matplotlib=3.1.3 diff --git a/doc/source/development/contributing_environment.rst 
b/doc/source/development/contributing_environment.rst index bc0a3556b9ac1..7b52bf760b601 100644 --- a/doc/source/development/contributing_environment.rst +++ b/doc/source/development/contributing_environment.rst @@ -189,11 +189,8 @@ Creating a Python environment (pip) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ If you aren't using conda for your development environment, follow these instructions. -You'll need to have at least the :ref:`minimum Python version <install.version>` that pandas supports. If your Python version -is 3.8.0 (or later), you might need to update your ``setuptools`` to version 42.0.0 (or later) -in your development environment before installing the build dependencies:: - - pip install --upgrade setuptools +You'll need to have at least the :ref:`minimum Python version <install.version>` that pandas supports. +You also need to have ``setuptools`` 51.0.0 or later to build pandas. **Unix**/**macOS with virtualenv** diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 08f46a53cf2f1..1978913da4073 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -16,6 +16,7 @@ Version 1.3 .. toctree:: :maxdepth: 2 + v1.3.2 v1.3.1 v1.3.0 diff --git a/doc/source/whatsnew/v1.3.1.rst b/doc/source/whatsnew/v1.3.1.rst index 0297aeecf01a6..a57995eb0db9a 100644 --- a/doc/source/whatsnew/v1.3.1.rst +++ b/doc/source/whatsnew/v1.3.1.rst @@ -48,4 +48,4 @@ Bug fixes Contributors ~~~~~~~~~~~~ -.. contributors:: v1.3.0..v1.3.1|HEAD +.. contributors:: v1.3.0..v1.3.1 diff --git a/doc/source/whatsnew/v1.3.2.rst b/doc/source/whatsnew/v1.3.2.rst new file mode 100644 index 0000000000000..97803856c3d94 --- /dev/null +++ b/doc/source/whatsnew/v1.3.2.rst @@ -0,0 +1,50 @@ +.. _whatsnew_132: + +What's new in 1.3.2 (August 15, 2021) +------------------------------------- + +These are the changes in pandas 1.3.2. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. 
--------------------------------------------------------------------------- + +.. _whatsnew_132.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- Performance regression in :meth:`DataFrame.isin` and :meth:`Series.isin` for nullable data types (:issue:`42714`) +- Regression in updating values of :class:`Series` using boolean index, created by using :meth:`DataFrame.pop` (:issue:`42530`) +- Regression in :meth:`DataFrame.from_records` with empty records (:issue:`42456`) +- Fixed regression in :meth:`DataFrame.shift` where ``TypeError`` occurred when shifting DataFrame created by concatenation of slices and fills with values (:issue:`42719`) +- Regression in :meth:`DataFrame.agg` when the ``func`` argument returned lists and ``axis=1`` (:issue:`42727`) +- Fixed regression in :meth:`DataFrame.drop` where nothing was dropped if the :class:`MultiIndex` has duplicates and the indexer is a tuple or list of tuples (:issue:`42771`) +- Fixed regression where :func:`read_csv` raised a ``ValueError`` when parameters ``names`` and ``prefix`` were both set to ``None`` (:issue:`42387`) +- Fixed regression in comparisons between :class:`Timestamp` object and ``datetime64`` objects outside the implementation bounds for nanosecond ``datetime64`` (:issue:`42794`) +- Fixed regression in :meth:`.Styler.highlight_min` and :meth:`.Styler.highlight_max` where ``pandas.NA`` was not successfully ignored (:issue:`42650`) +- Fixed regression in :func:`concat` where ``copy=False`` was not honored in ``axis=1`` Series concatenation (:issue:`42501`) +- Regression in :meth:`Series.nlargest` and :meth:`Series.nsmallest` with nullable integer or float dtype (:issue:`42816`) +- Fixed regression in :meth:`Series.quantile` with :class:`Int64Dtype` (:issue:`42626`) + +.. --------------------------------------------------------------------------- + +.. 
_whatsnew_132.bug_fixes: + +Bug fixes +~~~~~~~~~ +- Bug in :func:`read_excel` where the dtypes dictionary was modified when reading a file with duplicate columns (:issue:`42462`) +- 1D slices over extension types turn into N-dimensional slices over ExtensionArrays (:issue:`42430`) +- Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not calculating window bounds correctly for the first row when ``center=True`` and ``window`` is an offset that covers all the rows (:issue:`42753`) +- :meth:`.Styler.hide_columns` now hides the index name header row as well as column headers (:issue:`42101`) +- :meth:`.Styler.set_sticky` has amended CSS to control the column/index names and ensure the correct sticky positions (:issue:`42537`) +- Bug in de-serializing datetime indexes when Python runs in optimized mode (``-O`` or ``PYTHONOPTIMIZE``) (:issue:`42866`) + +.. --------------------------------------------------------------------------- + +.. _whatsnew_132.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v1.3.1..v1.3.2|HEAD diff --git a/environment.yml b/environment.yml index 500b8148a94d8..e75e56238205b 100644 --- a/environment.yml +++ b/environment.yml @@ -99,7 +99,7 @@ dependencies: - xlwt - odfpy - - fastparquet>=0.3.2, <0.7.0 # pandas.read_parquet, DataFrame.to_parquet + - fastparquet>=0.3.2 # pandas.read_parquet, DataFrame.to_parquet - pyarrow>=0.17.0 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather - python-snappy # required by pyarrow @@ -109,7 +109,7 @@ dependencies: - fsspec>=0.7.4, <2021.6.0 # for generic remote file operations - gcsfs>=0.6.0 # file IO when using 'gcs://...' 
path - sqlalchemy # pandas.read_sql, DataFrame.to_sql - - xarray # DataFrame.to_xarray + - xarray<0.19 # DataFrame.to_xarray - cftime # Needed for downstream xarray.CFTimeIndex test - pyreadstat # pandas.read_spss - tabulate>=0.8.3 # DataFrame.to_markdown diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 88a2706a35aa2..911cf07862acc 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -270,9 +270,9 @@ cdef class _Timestamp(ABCTimestamp): if op == Py_EQ: return False if op == Py_LE or op == Py_LT: - return other.year <= self.year + return self.year <= other.year if op == Py_GE or op == Py_GT: - return other.year >= self.year + return self.year >= other.year cdef bint _can_compare(self, datetime other): if self.tzinfo is not None: diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index d188770576e05..197345b3ce6ac 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -79,12 +79,11 @@ def calculate_variable_window_bounds( else: end[0] = 0 if center: - for j in range(0, num_values + 1): - if (index[j] == index[0] + index_growth_sign * window_size / 2 and - right_closed): + end_bound = index[0] + index_growth_sign * window_size / 2 + for j in range(0, num_values): + if (index[j] < end_bound) or (index[j] == end_bound and right_closed): end[0] = j + 1 - break - elif index[j] >= index[0] + index_growth_sign * window_size / 2: + elif index[j] >= end_bound: end[0] = j break diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index a9ca39b89360c..3ab0350f23c5a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -27,6 +27,7 @@ DtypeObj, FrameOrSeriesUnion, Scalar, + final, ) from pandas.util._decorators import doc @@ -1215,12 +1216,15 @@ def __init__(self, obj, n: int, keep: str): def compute(self, method: str) -> FrameOrSeriesUnion: raise NotImplementedError + @final def nlargest(self): return 
self.compute("nlargest") + @final def nsmallest(self): return self.compute("nsmallest") + @final @staticmethod def is_valid_dtype_n_method(dtype: DtypeObj) -> bool: """ @@ -1259,6 +1263,18 @@ def compute(self, method: str) -> Series: dropped = self.obj.dropna() + if is_extension_array_dtype(dropped.dtype): + # GH#41816 bc we have dropped NAs above, MaskedArrays can use the + # numpy logic. + from pandas.core.arrays import BaseMaskedArray + + arr = dropped._values + if isinstance(arr, BaseMaskedArray): + ser = type(dropped)(arr._data, index=dropped.index, name=dropped.name) + + result = type(self)(ser, n=self.n, keep=self.keep).compute(method) + return result.astype(arr.dtype) + # slow method if n >= len(self.obj): ascending = method == "nsmallest" diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 69e2650a15f16..aae3262893da2 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -691,21 +691,28 @@ def agg(self): obj = self.obj axis = self.axis + # TODO: Avoid having to change state + self.obj = self.obj if self.axis == 0 else self.obj.T + self.axis = 0 + + result = None + try: + result = super().agg() + except TypeError as err: + exc = TypeError( + "DataFrame constructor called with " + f"incompatible data and dtype: {err}" + ) + raise exc from err + finally: + self.obj = obj + self.axis = axis + if axis == 1: - result = FrameRowApply( - obj.T, - self.orig_f, - self.raw, - self.result_type, - self.args, - self.kwargs, - ).agg() result = result.T if result is not None else result - else: - result = super().agg() if result is None: - result = obj.apply(self.orig_f, axis, args=self.args, **self.kwargs) + result = self.obj.apply(self.orig_f, axis, args=self.args, **self.kwargs) return result diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index 32c50ed38eba0..c5e96f32e261f 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -181,5 +181,10 @@ def _quantile_ea_fallback( 
assert res.ndim == 2 assert res.shape[0] == 1 res = res[0] - out = type(values)._from_sequence(res, dtype=values.dtype) + try: + out = type(values)._from_sequence(res, dtype=values.dtype) + except TypeError: + # GH#42626: not able to safely cast Int64 + # for floating point output + out = np.atleast_2d(np.asarray(res, dtype=np.float64)) return out diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 3a152bd5889b7..1d78a74db98f0 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -417,7 +417,7 @@ def isin(self, values) -> BooleanArray: # type: ignore[override] # see https://github.com/pandas-dev/pandas/pull/38379 for some discussion result[self._mask] = values_have_NA - mask = np.zeros_like(self, dtype=bool) + mask = np.zeros(self._data.shape, dtype=bool) return BooleanArray(result, mask, copy=False) def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index da4feb9640626..b92cf9751e4cf 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -96,7 +96,10 @@ ABCDataFrame, ABCSeries, ) -from pandas.core.dtypes.inference import is_hashable +from pandas.core.dtypes.inference import ( + is_hashable, + is_nested_list_like, +) from pandas.core.dtypes.missing import ( isna, notna, @@ -4184,6 +4187,7 @@ def _drop_axis( # Case for non-unique axis else: + is_tuple_labels = is_nested_list_like(labels) or isinstance(labels, tuple) labels = ensure_object(com.index_labels_to_array(labels)) if level is not None: if not isinstance(axis, MultiIndex): @@ -4193,9 +4197,14 @@ def _drop_axis( # GH 18561 MultiIndex.drop should raise if label is absent if errors == "raise" and indexer.all(): raise KeyError(f"{labels} not found in axis") - elif isinstance(axis, MultiIndex) and labels.dtype == "object": + elif ( + isinstance(axis, MultiIndex) + and labels.dtype == "object" + and not is_tuple_labels + ): # Set level to zero in case of MultiIndex 
and label is string, # because isin can't handle strings for MultiIndexes GH#36293 + # In case of tuples we get dtype object but have to use isin GH#42771 indexer = ~axis.get_level_values(0).isin(labels) else: indexer = ~axis.isin(labels) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index fbfee9a1f524c..ff9a273267306 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -92,7 +92,8 @@ def _new_DatetimeIndex(cls, d): # These are already stored in our DatetimeArray; if they are # also in the pickle and don't match, we have a problem. if key in d: - assert d.pop(key) == getattr(dta, key) + assert d[key] == getattr(dta, key) + d.pop(key) result = cls._simple_new(dta, **d) else: with warnings.catch_warnings(): diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 6275fe39558a3..78ebfad4e3ae9 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1557,6 +1557,16 @@ def _slice(self, slicer): return self.values[slicer] + @final + def getitem_block_index(self, slicer: slice) -> ExtensionBlock: + """ + Perform __getitem__-like specialized to slicing along index. + """ + # GH#42787 in principle this is equivalent to values[..., slicer], but we don't + # require subclasses of ExtensionArray to support that form (for now). + new_values = self.values[slicer] + return type(self)(new_values, self._mgr_locs, ndim=self.ndim) + def fillna( self, value, limit=None, inplace: bool = False, downcast=None ) -> list[Block]: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 5e327adfb8905..8987dd5adf5d3 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -761,6 +761,14 @@ def to_arrays( # i.e. 
numpy structured array columns = ensure_index(data.dtype.names) arrays = [data[name] for name in columns] + + if len(data) == 0: + # GH#42456 the indexing above results in list of 2D ndarrays + # TODO: is that an issue with numpy? + for i, arr in enumerate(arrays): + if arr.ndim == 2: + arrays[i] = arr[:, 0] + return arrays, columns return [], ensure_index([]) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 1c4fb98ee16bb..ea22cdaa89299 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -386,10 +386,14 @@ def shift(self: T, periods: int, axis: int, fill_value) -> T: # We only get here with fill_value not-lib.no_default ncols = self.shape[0] if periods > 0: - indexer = [-1] * periods + list(range(ncols - periods)) + indexer = np.array( + [-1] * periods + list(range(ncols - periods)), dtype=np.intp + ) else: nper = abs(periods) - indexer = list(range(nper, ncols)) + [-1] * nper + indexer = np.array( + list(range(nper, ncols)) + [-1] * nper, dtype=np.intp + ) result = self.reindex_indexer( self.items, indexer, @@ -1184,8 +1188,8 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: warnings.warn( "DataFrame is highly fragmented. This is usually the result " "of calling `frame.insert` many times, which has poor performance. " - "Consider using pd.concat instead. To get a de-fragmented frame, " - "use `newframe = frame.copy()`", + "Consider joining all columns at once using pd.concat(axis=1) " + "instead. 
To get a de-fragmented frame, use `newframe = frame.copy()`", PerformanceWarning, stacklevel=5, ) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index b49622f4ac36a..3f524a2da0011 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -505,7 +505,7 @@ def get_result(self): cons = sample._constructor_expanddim index, columns = self.new_axes - df = cons(data, index=index) + df = cons(data, index=index, copy=self.copy) df.columns = columns return df.__finalize__(self, method="concat") diff --git a/pandas/core/series.py b/pandas/core/series.py index 2c191a2cc9a68..ddbf4350a63c1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1228,14 +1228,15 @@ def _maybe_update_cacher( # a copy if ref is None: del self._cacher + elif len(self) == len(ref) and self.name in ref.columns: + # GH#42530 self.name must be in ref.columns + # to ensure column still in dataframe + # otherwise, either self or ref has swapped in new arrays + ref._maybe_cache_changed(cacher[0], self) else: - if len(self) == len(ref): - # otherwise, either self or ref has swapped in new arrays - ref._maybe_cache_changed(cacher[0], self) - else: - # GH#33675 we have swapped in a new array, so parent - # reference to self is now invalid - ref._item_cache.pop(cacher[0], None) + # GH#33675 we have swapped in a new array, so parent + # reference to self is now invalid + ref._item_cache.pop(cacher[0], None) super()._maybe_update_cacher(clear=clear, verify_is_copy=verify_is_copy) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 660ca9c58a5b0..7ab2b9dec20c8 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -1473,24 +1473,24 @@ def set_sticky( may produce strange behaviour due to CSS controls with missing elements. 
""" if axis in [0, "index"]: - axis, obj, tag, pos = 0, self.data.index, "tbody", "left" + axis, obj = 0, self.data.index pixel_size = 75 if not pixel_size else pixel_size elif axis in [1, "columns"]: - axis, obj, tag, pos = 1, self.data.columns, "thead", "top" + axis, obj = 1, self.data.columns pixel_size = 25 if not pixel_size else pixel_size else: raise ValueError("`axis` must be one of {0, 1, 'index', 'columns'}") + props = "position:sticky; background-color:white;" if not isinstance(obj, pd.MultiIndex): # handling MultiIndexes requires different CSS - props = "position:sticky; background-color:white;" if axis == 1: # stick the first of and, if index names, the second # if self._hide_columns then no here will exist: no conflict styles: CSSStyles = [ { - "selector": "thead tr:first-child", + "selector": "thead tr:nth-child(1) th", "props": props + "top:0px; z-index:2;", } ] @@ -1500,7 +1500,7 @@ def set_sticky( ) styles.append( { - "selector": "thead tr:nth-child(2)", + "selector": "thead tr:nth-child(2) th", "props": props + f"top:{pixel_size}px; z-index:2; height:{pixel_size}px; ", } @@ -1511,34 +1511,67 @@ def set_sticky( # but will exist in : conflict with initial element styles = [ { - "selector": "tr th:first-child", + "selector": "thead tr th:nth-child(1)", + "props": props + "left:0px; z-index:3 !important;", + }, + { + "selector": "tbody tr th:nth-child(1)", "props": props + "left:0px; z-index:1;", - } + }, ] - return self.set_table_styles(styles, overwrite=False) - else: + # handle the MultiIndex case range_idx = list(range(obj.nlevels)) + levels = sorted(levels) if levels else range_idx - levels = sorted(levels) if levels else range_idx - for i, level in enumerate(levels): - self.set_table_styles( - [ - { - "selector": f"{tag} th.level{level}", - "props": f"position: sticky; " - f"{pos}: {i * pixel_size}px; " - f"{f'height: {pixel_size}px; ' if axis == 1 else ''}" - f"{f'min-width: {pixel_size}px; ' if axis == 0 else ''}" - f"{f'max-width: 
{pixel_size}px; ' if axis == 0 else ''}" - f"background-color: white;", - } - ], - overwrite=False, - ) + if axis == 1: + styles = [] + for i, level in enumerate(levels): + styles.append( + { + "selector": f"thead tr:nth-child({level+1}) th", + "props": props + + ( + f"top:{i * pixel_size}px; height:{pixel_size}px; " + "z-index:2;" + ), + } + ) + if not all(name is None for name in self.index.names): + styles.append( + { + "selector": f"thead tr:nth-child({obj.nlevels+1}) th", + "props": props + + ( + f"top:{(i+1) * pixel_size}px; height:{pixel_size}px; " + "z-index:2;" + ), + } + ) - return self + else: + styles = [] + for i, level in enumerate(levels): + props_ = props + ( + f"left:{i * pixel_size}px; " + f"min-width:{pixel_size}px; " + f"max-width:{pixel_size}px; " + ) + styles.extend( + [ + { + "selector": f"thead tr th:nth-child({level+1})", + "props": props_ + "z-index:3 !important;", + }, + { + "selector": f"tbody tr th.level{level}", + "props": props_ + "z-index:1;", + }, + ] + ) + + return self.set_table_styles(styles, overwrite=False) def set_table_styles( self, @@ -2312,15 +2345,15 @@ def highlight_max( Styler.highlight_quantile: Highlight values defined by a quantile with a style. """ - def f(data: FrameOrSeries, props: str) -> np.ndarray: - return np.where(data == np.nanmax(data.to_numpy()), props, "") - if props is None: props = f"background-color: {color};" # error: Argument 1 to "apply" of "Styler" has incompatible type # "Callable[[FrameOrSeries, str], ndarray]"; expected "Callable[..., Styler]" return self.apply( - f, axis=axis, subset=subset, props=props # type: ignore[arg-type] + partial(_highlight_value, op="max"), # type: ignore[arg-type] + axis=axis, + subset=subset, + props=props, ) def highlight_min( @@ -2363,15 +2396,15 @@ def highlight_min( Styler.highlight_quantile: Highlight values defined by a quantile with a style. 
""" - def f(data: FrameOrSeries, props: str) -> np.ndarray: - return np.where(data == np.nanmin(data.to_numpy()), props, "") - if props is None: props = f"background-color: {color};" # error: Argument 1 to "apply" of "Styler" has incompatible type # "Callable[[FrameOrSeries, str], ndarray]"; expected "Callable[..., Styler]" return self.apply( - f, axis=axis, subset=subset, props=props # type: ignore[arg-type] + partial(_highlight_value, op="min"), # type: ignore[arg-type] + axis=axis, + subset=subset, + props=props, ) def highlight_between( @@ -2870,3 +2903,13 @@ def _highlight_between( else np.full(data.shape, True, dtype=bool) ) return np.where(g_left & l_right, props, "") + + +def _highlight_value(data: FrameOrSeries, op: str, props: str) -> np.ndarray: + """ + Return an array of css strings based on the condition of values matching an op. + """ + value = getattr(data, op)(skipna=True) + if isinstance(data, DataFrame): # min/max must be done twice to return scalar + value = getattr(value, op)(skipna=True) + return np.where(data == value, props, "") diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index 616b89f9e519f..97a05329507c9 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -356,6 +356,7 @@ def _translate_header( self.data.index.names and com.any_not_none(*self.data.index.names) and not self.hide_index_ + and not self.hide_columns_ ): index_names = [ _element( diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index b7523fada07d0..5671cce1b966d 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -309,14 +309,17 @@ def write( def read( self, path, columns=None, storage_options: StorageOptions = None, **kwargs ): + parquet_kwargs: dict[str, Any] = {} use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False) + if Version(self.api.__version__) >= Version("0.7.0"): + # We are disabling nullable dtypes for fastparquet pending discussion + 
parquet_kwargs["pandas_nulls"] = False if use_nullable_dtypes: raise ValueError( "The 'use_nullable_dtypes' argument is not supported for the " "fastparquet engine" ) path = stringify_path(path) - parquet_kwargs = {} handles = None if is_fsspec_url(path): fsspec = import_optional_dependency("fsspec") @@ -337,6 +340,7 @@ def read( path, "rb", is_text=False, storage_options=storage_options ) path = handles.handle + parquet_file = self.api.ParquetFile(path, **parquet_kwargs) result = parquet_file.to_pandas(columns=columns, **kwargs) @@ -470,7 +474,8 @@ def read_parquet( use_nullable_dtypes : bool, default False If True, use dtypes that use ``pd.NA`` as missing value indicator - for the resulting DataFrame (only applicable for ``engine="pyarrow"``). + for the resulting DataFrame. (only applicable for the ``pyarrow`` + engine) As new dtypes are added that support ``pd.NA`` in the future, the output with this option will change to use those dtypes. Note: this is an experimental option, and behaviour (e.g. 
additional diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 13f2d62399418..42e9db75847eb 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -4,6 +4,7 @@ abc, defaultdict, ) +from copy import copy import csv from io import StringIO import re @@ -81,7 +82,7 @@ def __init__(self, f: FilePathOrBuffer | list, **kwds): self.verbose = kwds["verbose"] self.converters = kwds["converters"] - self.dtype = kwds["dtype"] + self.dtype = copy(kwds["dtype"]) self.thousands = kwds["thousands"] self.decimal = kwds["decimal"] @@ -432,7 +433,6 @@ def _infer_columns(self): and self.dtype.get(col) is None ): self.dtype.update({col: self.dtype.get(old_col)}) - this_columns[i] = col counts[col] = cur_count + 1 elif have_mi_columns: diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 06bdbe3054a15..c639a4a9d494e 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1302,7 +1302,12 @@ def _refine_defaults_read( if delimiter and (sep is not lib.no_default): raise ValueError("Specified a sep and a delimiter; you can only specify one.") - if names is not lib.no_default and prefix is not lib.no_default: + if ( + names is not None + and names is not lib.no_default + and prefix is not None + and prefix is not lib.no_default + ): raise ValueError("Specified named and prefix; you can only specify one.") kwds["names"] = None if names is lib.no_default else names diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index 9bfa24b6371ab..ebe336bd55c97 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -414,8 +414,12 @@ def handle_shared_axes( except IndexError: # if gridspec is used, ax.rowNum and ax.colNum may different # from layout shape. 
in this case, use last_row logic + if compat.mpl_ge_3_4_0(): + is_last_row = lambda x: x.get_subplotspec().is_last_row() + else: + is_last_row = lambda x: x.is_last_row() for ax in axarr: - if ax.is_last_row(): + if is_last_row(ax): continue if sharex or _has_externally_shared_axis(ax, "x"): _remove_labels_from_axis(ax.xaxis) diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 995f404dc49d3..c34447764b311 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -671,13 +671,14 @@ def test_apply_dup_names_multi_agg(): tm.assert_frame_equal(result, expected) -def test_apply_nested_result_axis_1(): +@pytest.mark.parametrize("op", ["apply", "agg"]) +def test_apply_nested_result_axis_1(op): # GH 13820 def apply_list(row): return [2 * row["A"], 2 * row["C"], 2 * row["B"]] df = DataFrame(np.zeros((4, 4)), columns=list("ABCD")) - result = df.apply(apply_list, axis=1) + result = getattr(df, op)(apply_list, axis=1) expected = Series( [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]] ) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 96833a2e49fa1..ac181af7875b5 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -425,3 +425,23 @@ def test_item(self, data): with pytest.raises(ValueError, match=msg): s.item() + + def test_ellipsis_index(self): + # GH42430 1D slices over extension types turn into N-dimensional slices over + # ExtensionArrays + class CapturingStringArray(pd.arrays.StringArray): + """Extend StringArray to capture arguments to __getitem__""" + + def __getitem__(self, item): + self.last_item_arg = item + return super().__getitem__(item) + + df = pd.DataFrame( + {"col1": CapturingStringArray(np.array(["hello", "world"], dtype=object))} + ) + _ = df.iloc[:1] + + # String comparison because there's no native way to compare slices. 
+ # Before the fix for GH42430, last_item_arg would get set to the 2D slice + # (Ellipsis, slice(None, 1, None)) + self.assert_equal(str(df["col1"].array.last_item_arg), "slice(None, 1, None)") diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 7a3f88d0d6c41..99d92a5bbf774 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -261,18 +261,7 @@ def test_dataframe_constructor_with_dtype(): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "frame", - [ - pytest.param( - True, - marks=pytest.mark.xfail( - reason="pd.concat call inside NDFrame.astype reverts the dtype" - ), - ), - False, - ], -) +@pytest.mark.parametrize("frame", [True, False]) def test_astype_dispatches(frame): # This is a dtype-specific test that ensures Series[decimal].astype # gets all the way through to ExtensionArray.astype diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 35ad9f3e9693b..2322d5f995964 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -457,3 +457,16 @@ def test_from_records_empty_with_nonempty_fields_gh3682(self): b = a[:0] df2 = DataFrame.from_records(b, index="id") tm.assert_frame_equal(df2, df.iloc[:0]) + + def test_from_records_empty2(self): + # GH#42456 + dtype = [("prop", int)] + shape = (0, len(dtype)) + arr = np.empty(shape, dtype=dtype) + + result = DataFrame.from_records(arr) + expected = DataFrame({"prop": np.array([], dtype=int)}) + tm.assert_frame_equal(result, expected) + + alt = DataFrame(arr) + tm.assert_frame_equal(alt, expected) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 1f1991214aad0..775a5a38768e6 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -23,6 
+23,7 @@ option_context, ) import pandas._testing as tm +from pandas.core.arrays.integer import coerce_to_array def _check_cast(df, v): @@ -726,3 +727,32 @@ def test_astype_categorical_to_string_missing(self): cat = df.astype("category") result = cat.astype(str) tm.assert_frame_equal(result, expected) + + +class IntegerArrayNoCopy(pd.core.arrays.IntegerArray): + # GH 42501 + + @classmethod + def _from_sequence(cls, scalars, *, dtype=None, copy=False): + values, mask = coerce_to_array(scalars, dtype=dtype, copy=copy) + return IntegerArrayNoCopy(values, mask) + + def copy(self): + assert False + + +class Int16DtypeNoCopy(pd.Int16Dtype): + # GH 42501 + + @classmethod + def construct_array_type(cls): + return IntegerArrayNoCopy + + +def test_frame_astype_no_copy(): + # GH 42501 + df = DataFrame({"a": [1, 4, None, 5], "b": [6, 7, 8, 9]}, dtype=object) + result = df.astype({"a": Int16DtypeNoCopy()}, copy=False) + + assert result.a.dtype == pd.Int16Dtype() + assert np.shares_memory(df.b.values, result.b.values) diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py index b3eeab9db4ad5..87139b3a1f8b9 100644 --- a/pandas/tests/frame/methods/test_drop.py +++ b/pandas/tests/frame/methods/test_drop.py @@ -456,6 +456,17 @@ def test_drop_with_non_unique_multiindex(self): expected = DataFrame([2], index=MultiIndex.from_arrays([["y"], ["j"]])) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("indexer", [("a", "a"), [("a", "a")]]) + def test_drop_tuple_with_non_unique_multiindex(self, indexer): + # GH#42771 + idx = MultiIndex.from_product([["a", "b"], ["a", "a"]]) + df = DataFrame({"x": range(len(idx))}, index=idx) + result = df.drop(index=[("a", "a")]) + expected = DataFrame( + {"x": [2, 3]}, index=MultiIndex.from_tuples([("b", "a"), ("b", "a")]) + ) + tm.assert_frame_equal(result, expected) + def test_drop_with_duplicate_columns(self): df = DataFrame( [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"] diff 
--git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 0474206aec06f..9df5f79aa7d19 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -183,6 +183,32 @@ def test_shift_axis1_multiple_blocks(self, using_array_manager): tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) axis=1 support + def test_shift_axis1_multiple_blocks_with_int_fill(self): + # GH#42719 + df1 = DataFrame(np.random.randint(1000, size=(5, 3))) + df2 = DataFrame(np.random.randint(1000, size=(5, 2))) + df3 = pd.concat([df1.iloc[:4, 1:3], df2.iloc[:4, :]], axis=1) + result = df3.shift(2, axis=1, fill_value=np.int_(0)) + assert len(df3._mgr.blocks) == 2 + + expected = df3.take([-1, -1, 0, 1], axis=1) + expected.iloc[:, :2] = np.int_(0) + expected.columns = df3.columns + + tm.assert_frame_equal(result, expected) + + # Case with periods < 0 + df3 = pd.concat([df1.iloc[:4, 1:3], df2.iloc[:4, :]], axis=1) + result = df3.shift(-2, axis=1, fill_value=np.int_(0)) + assert len(df3._mgr.blocks) == 2 + + expected = df3.take([2, 3, -1, -1], axis=1) + expected.iloc[:, -2:] = np.int_(0) + expected.columns = df3.columns + + tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings("ignore:tshift is deprecated:FutureWarning") def test_tshift(self, datetime_frame): # TODO: remove this test when tshift deprecation is enforced diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index d40fb3ce4a135..1f081ee2dbe33 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -576,8 +576,12 @@ def test_reader_dtype_str(self, read_ext, dtype, expected): def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value): # GH#35211 basename = "df_mangle_dup_col_dtypes" - result = pd.read_excel(basename + read_ext, dtype={"a": str, **dtypes}) + dtype_dict = {"a": str, **dtypes} + 
dtype_dict_copy = dtype_dict.copy() + # GH#42462 + result = pd.read_excel(basename + read_ext, dtype=dtype_dict) expected = DataFrame({"a": ["1"], "a.1": [exp_value]}) + assert dtype_dict == dtype_dict_copy, "dtype dict changed" tm.assert_frame_equal(result, expected) def test_reader_spaces(self, read_ext): diff --git a/pandas/tests/io/formats/style/test_highlight.py b/pandas/tests/io/formats/style/test_highlight.py index a681d7c65a190..1b579a43370a2 100644 --- a/pandas/tests/io/formats/style/test_highlight.py +++ b/pandas/tests/io/formats/style/test_highlight.py @@ -2,10 +2,10 @@ import pytest from pandas import ( + NA, DataFrame, IndexSlice, ) -import pandas._testing as tm pytest.importorskip("jinja2") @@ -55,9 +55,7 @@ def test_highlight_minmax_basic(df, f): } if f == "highlight_min": df = -df - with tm.assert_produces_warning(RuntimeWarning): - # All-NaN slice encountered - result = getattr(df.style, f)(axis=1, color="red")._compute().ctx + result = getattr(df.style, f)(axis=1, color="red")._compute().ctx assert result == expected @@ -78,6 +76,26 @@ def test_highlight_minmax_ext(df, f, kwargs): assert result == expected +@pytest.mark.parametrize("f", ["highlight_min", "highlight_max"]) +@pytest.mark.parametrize("axis", [None, 0, 1]) +def test_highlight_minmax_nulls(f, axis): + # GH 42750 + expected = { + (1, 0): [("background-color", "yellow")], + (1, 1): [("background-color", "yellow")], + } + if axis == 1: + expected.update({(2, 1): [("background-color", "yellow")]}) + + if f == "highlight_max": + df = DataFrame({"a": [NA, 1, None], "b": [np.nan, 1, -1]}) + else: + df = DataFrame({"a": [NA, -1, None], "b": [np.nan, -1, 1]}) + + result = getattr(df.style, f)(axis=axis)._compute().ctx + assert result == expected + + @pytest.mark.parametrize( "kwargs", [ diff --git a/pandas/tests/io/formats/style/test_html.py b/pandas/tests/io/formats/style/test_html.py index 4e71cb4c46626..4e1db69177534 100644 --- a/pandas/tests/io/formats/style/test_html.py +++ 
b/pandas/tests/io/formats/style/test_html.py @@ -281,26 +281,32 @@ def test_sticky_basic(styler, index, columns, index_name): if columns: styler.set_sticky(axis=1) - res = styler.set_uuid("").to_html() - - css_for_index = ( - "tr th:first-child {\n position: sticky;\n background-color: white;\n " - "left: 0px;\n z-index: 1;\n}" + left_css = ( + "#T_ {0} {{\n position: sticky;\n background-color: white;\n" + " left: 0px;\n z-index: {1};\n}}" ) - assert (css_for_index in res) is index - - css_for_cols_1 = ( - "thead tr:first-child {\n position: sticky;\n background-color: white;\n " - "top: 0px;\n z-index: 2;\n" + top_css = ( + "#T_ {0} {{\n position: sticky;\n background-color: white;\n" + " top: {1}px;\n z-index: {2};\n{3}}}" ) - css_for_cols_1 += " height: 25px;\n}" if index_name else "}" - assert (css_for_cols_1 in res) is columns - css_for_cols_2 = ( - "thead tr:nth-child(2) {\n position: sticky;\n background-color: white;\n " - "top: 25px;\n z-index: 2;\n height: 25px;\n}" + res = styler.set_uuid("").to_html() + + # test index stickys over thead and tbody + assert (left_css.format("thead tr th:nth-child(1)", "3 !important") in res) is index + assert (left_css.format("tbody tr th:nth-child(1)", "1") in res) is index + + # test column stickys including if name row + assert ( + top_css.format("thead tr:nth-child(1) th", "0", "2", " height: 25px;\n") in res + ) is (columns and index_name) + assert ( + top_css.format("thead tr:nth-child(2) th", "25", "2", " height: 25px;\n") + in res + ) is (columns and index_name) + assert (top_css.format("thead tr:nth-child(1) th", "0", "2", "") in res) is ( + columns and not index_name ) - assert (css_for_cols_2 in res) is (index_name and columns) @pytest.mark.parametrize("index", [False, True]) @@ -311,73 +317,30 @@ def test_sticky_mi(styler_mi, index, columns): if columns: styler_mi.set_sticky(axis=1) - res = styler_mi.set_uuid("").to_html() - assert ( - ( - dedent( - """\ - #T_ tbody th.level0 { - position: sticky; - left: 
0px; - min-width: 75px; - max-width: 75px; - background-color: white; - } - """ - ) - in res - ) - is index + left_css = ( + "#T_ {0} {{\n position: sticky;\n background-color: white;\n" + " left: {1}px;\n min-width: 75px;\n max-width: 75px;\n z-index: {2};\n}}" ) - assert ( - ( - dedent( - """\ - #T_ tbody th.level1 { - position: sticky; - left: 75px; - min-width: 75px; - max-width: 75px; - background-color: white; - } - """ - ) - in res - ) - is index + top_css = ( + "#T_ {0} {{\n position: sticky;\n background-color: white;\n" + " top: {1}px;\n height: 25px;\n z-index: {2};\n}}" ) + + res = styler_mi.set_uuid("").to_html() + + # test the index stickys for thead and tbody over both levels assert ( - ( - dedent( - """\ - #T_ thead th.level0 { - position: sticky; - top: 0px; - height: 25px; - background-color: white; - } - """ - ) - in res - ) - is columns - ) + left_css.format("thead tr th:nth-child(1)", "0", "3 !important") in res + ) is index + assert (left_css.format("tbody tr th.level0", "0", "1") in res) is index assert ( - ( - dedent( - """\ - #T_ thead th.level1 { - position: sticky; - top: 25px; - height: 25px; - background-color: white; - } - """ - ) - in res - ) - is columns - ) + left_css.format("thead tr th:nth-child(2)", "75", "3 !important") in res + ) is index + assert (left_css.format("tbody tr th.level1", "75", "1") in res) is index + + # test the column stickys for each level row + assert (top_css.format("thead tr:nth-child(1) th", "0", "2") in res) is columns + assert (top_css.format("thead tr:nth-child(2) th", "25", "2") in res) is columns @pytest.mark.parametrize("index", [False, True]) @@ -388,43 +351,29 @@ def test_sticky_levels(styler_mi, index, columns): if columns: styler_mi.set_sticky(axis=1, levels=[1]) - res = styler_mi.set_uuid("").to_html() - assert "#T_ tbody th.level0 {" not in res - assert "#T_ thead th.level0 {" not in res - assert ( - ( - dedent( - """\ - #T_ tbody th.level1 { - position: sticky; - left: 0px; - min-width: 75px; 
- max-width: 75px; - background-color: white; - } - """ - ) - in res - ) - is index + left_css = ( + "#T_ {0} {{\n position: sticky;\n background-color: white;\n" + " left: {1}px;\n min-width: 75px;\n max-width: 75px;\n z-index: {2};\n}}" ) - assert ( - ( - dedent( - """\ - #T_ thead th.level1 { - position: sticky; - top: 0px; - height: 25px; - background-color: white; - } - """ - ) - in res - ) - is columns + top_css = ( + "#T_ {0} {{\n position: sticky;\n background-color: white;\n" + " top: {1}px;\n height: 25px;\n z-index: {2};\n}}" ) + res = styler_mi.set_uuid("").to_html() + + # test no sticking of level0 + assert "#T_ thead tr th:nth-child(1)" not in res + assert "#T_ tbody tr th.level0" not in res + assert "#T_ thead tr:nth-child(1) th" not in res + + # test sticking level1 + assert ( + left_css.format("thead tr th:nth-child(2)", "0", "3 !important") in res + ) is index + assert (left_css.format("tbody tr th.level1", "0", "1") in res) is index + assert (top_css.format("thead tr:nth-child(2) th", "0", "2") in res) is columns + def test_sticky_raises(styler): with pytest.raises(ValueError, match="`axis` must be"): diff --git a/pandas/tests/io/formats/style/test_style.py b/pandas/tests/io/formats/style/test_style.py index f2c2f673909d4..99c725ed8df69 100644 --- a/pandas/tests/io/formats/style/test_style.py +++ b/pandas/tests/io/formats/style/test_style.py @@ -1109,7 +1109,7 @@ def test_hide_column_headers(self): self.df.index.name = "some_name" ctx = self.df.style.hide_columns()._translate(True, True) - assert len(ctx["head"]) == 1 # only a single row for index names: no col heads + assert len(ctx["head"]) == 0 # no header for index names, changed in #42101 def test_hide_single_index(self): # GH 14194 diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index a1c76e2740dbe..b2e528aa5f8d5 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ 
b/pandas/tests/io/parser/common/test_common_basic.py @@ -764,15 +764,24 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): @pytest.mark.parametrize("func", ["read_csv", "read_table"]) -@pytest.mark.parametrize("prefix", [None, "x"]) -@pytest.mark.parametrize("names", [None, ["a"]]) -def test_names_and_prefix_not_lib_no_default(all_parsers, names, prefix, func): +def test_names_and_prefix_not_None_raises(all_parsers, func): # GH#39123 f = StringIO("a,b\n1,2") parser = all_parsers msg = "Specified named and prefix; you can only specify one." with pytest.raises(ValueError, match=msg): - getattr(parser, func)(f, names=names, prefix=prefix) + getattr(parser, func)(f, names=["a", "b"], prefix="x") + + +@pytest.mark.parametrize("func", ["read_csv", "read_table"]) +@pytest.mark.parametrize("prefix, names", [(None, ["x0", "x1"]), ("x", None)]) +def test_names_and_prefix_explicit_None(all_parsers, names, prefix, func): + # GH42387 + f = StringIO("a,b\n1,2") + expected = DataFrame({"x0": ["a", "1"], "x1": ["b", "2"]}) + parser = all_parsers + result = getattr(parser, func)(f, names=names, sep=",", prefix=prefix, header=None) + tm.assert_frame_equal(result, expected) def test_dict_keys_as_names(all_parsers): diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index bc20f1d1eea5f..0b59e877cd9a1 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -245,8 +245,12 @@ def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value): # GH#35211 parser = all_parsers data = """a,a\n1,1""" - result = parser.read_csv(StringIO(data), dtype={"a": str, **dtypes}) + dtype_dict = {"a": str, **dtypes} + # GH#42462 + dtype_dict_copy = dtype_dict.copy() + result = parser.read_csv(StringIO(data), dtype=dtype_dict) expected = DataFrame({"a": ["1"], "a.1": [exp_value]}) + assert dtype_dict == dtype_dict_copy, "dtype dict changed" 
tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index d100c584b698a..914f68b90ad37 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -575,6 +575,47 @@ def test_write_column_index_nonstring(self, pa): msg = r"parquet must have string column names" self.check_error_on_write(df, engine, ValueError, msg) + def test_use_nullable_dtypes(self, engine): + import pyarrow.parquet as pq + + if engine == "fastparquet": + # We are manually disabling fastparquet's + # nullable dtype support pending discussion + pytest.skip("Fastparquet nullable dtype support is disabled") + + table = pyarrow.table( + { + "a": pyarrow.array([1, 2, 3, None], "int64"), + "b": pyarrow.array([1, 2, 3, None], "uint8"), + "c": pyarrow.array(["a", "b", "c", None]), + "d": pyarrow.array([True, False, True, None]), + # Test that nullable dtypes used even in absence of nulls + "e": pyarrow.array([1, 2, 3, 4], "int64"), + } + ) + with tm.ensure_clean() as path: + # write manually with pyarrow to write integers + pq.write_table(table, path) + result1 = read_parquet(path, engine=engine) + result2 = read_parquet(path, engine=engine, use_nullable_dtypes=True) + + assert result1["a"].dtype == np.dtype("float64") + expected = pd.DataFrame( + { + "a": pd.array([1, 2, 3, None], dtype="Int64"), + "b": pd.array([1, 2, 3, None], dtype="UInt8"), + "c": pd.array(["a", "b", "c", None], dtype="string"), + "d": pd.array([True, False, True, None], dtype="boolean"), + "e": pd.array([1, 2, 3, 4], dtype="Int64"), + } + ) + if engine == "fastparquet": + # Fastparquet doesn't support string columns yet + # Only int and boolean + result2 = result2.drop("c", axis=1) + expected = expected.drop("c", axis=1) + tm.assert_frame_equal(result2, expected) + @pytest.mark.filterwarnings("ignore:CategoricalBlock is deprecated:DeprecationWarning") class TestParquetPyArrow(Base): @@ -829,35 +870,6 @@ def 
test_additional_extension_types(self, pa): ) check_round_trip(df, pa) - @td.skip_if_no("pyarrow") - def test_use_nullable_dtypes(self, pa): - import pyarrow.parquet as pq - - table = pyarrow.table( - { - "a": pyarrow.array([1, 2, 3, None], "int64"), - "b": pyarrow.array([1, 2, 3, None], "uint8"), - "c": pyarrow.array(["a", "b", "c", None]), - "d": pyarrow.array([True, False, True, None]), - } - ) - with tm.ensure_clean() as path: - # write manually with pyarrow to write integers - pq.write_table(table, path) - result1 = read_parquet(path) - result2 = read_parquet(path, use_nullable_dtypes=True) - - assert result1["a"].dtype == np.dtype("float64") - expected = pd.DataFrame( - { - "a": pd.array([1, 2, 3, None], dtype="Int64"), - "b": pd.array([1, 2, 3, None], dtype="UInt8"), - "c": pd.array(["a", "b", "c", None], dtype="string"), - "d": pd.array([True, False, True, None], dtype="boolean"), - } - ) - tm.assert_frame_equal(result2, expected) - def test_timestamp_nanoseconds(self, pa): # with version 2.0, pyarrow defaults to writing the nanoseconds, so # this should work without error @@ -928,7 +940,9 @@ def test_duplicate_columns(self, fp): def test_bool_with_none(self, fp): df = pd.DataFrame({"a": [True, None, False]}) expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16") - check_round_trip(df, fp, expected=expected) + # Fastparquet bug in 0.7.1 makes it so that this dtype becomes + # float64 + check_round_trip(df, fp, expected=expected, check_dtype=False) def test_unsupported(self, fp): @@ -1049,7 +1063,7 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list): expected.index.name = "index" check_round_trip(df, fp, expected=expected) - def test_use_nullable_dtypes_not_supported(self, fp): + def test_use_nullable_dtypes_not_supported(self, monkeypatch, fp): df = pd.DataFrame({"a": [1, 2]}) with tm.ensure_clean() as path: diff --git a/pandas/tests/scalar/timestamp/test_comparisons.py b/pandas/tests/scalar/timestamp/test_comparisons.py index 
555067f2aba1a..ee36223eb2496 100644 --- a/pandas/tests/scalar/timestamp/test_comparisons.py +++ b/pandas/tests/scalar/timestamp/test_comparisons.py @@ -266,6 +266,19 @@ def test_timestamp_compare_oob_dt64(self): assert Timestamp.max < other + us # Note: numpy gets the reversed comparison wrong + # GH-42794 + other = datetime(9999, 9, 9) + assert Timestamp.min < other + assert other > Timestamp.min + assert Timestamp.max < other + assert other > Timestamp.max + + other = datetime(1, 1, 1) + assert Timestamp.max > other + assert other < Timestamp.max + assert Timestamp.min > other + assert other < Timestamp.min + def test_compare_zerodim_array(self): # GH#26916 ts = Timestamp.now() diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 13054062defb4..2ad3aee99d147 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -8,6 +8,7 @@ from pandas import ( Categorical, + DataFrame, DatetimeIndex, Index, MultiIndex, @@ -906,3 +907,17 @@ def val(self): def is_inplace(self, obj): # This is specific to the 4 cases currently implemented for this class. 
return obj.dtype.kind != "i" + + +def test_setitem_with_bool_indexer(): + # GH#42530 + + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + result = df.pop("b") + result[[True, False, False]] = 9 + expected = Series(data=[9, 5, 6], name="b") + tm.assert_series_equal(result, expected) + + df.loc[[True, False, False], "a"] = 10 + expected = DataFrame({"a": [10, 2, 3]}) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/series/methods/test_clip.py b/pandas/tests/series/methods/test_clip.py index e4803a9cd3038..620f529b522ae 100644 --- a/pandas/tests/series/methods/test_clip.py +++ b/pandas/tests/series/methods/test_clip.py @@ -1,3 +1,5 @@ +from datetime import datetime + import numpy as np import pytest @@ -128,6 +130,15 @@ def test_clip_with_datetimes(self): ) tm.assert_series_equal(result, expected) + def test_clip_with_timestamps_and_oob_datetimes(self): + # GH-42794 + ser = Series([datetime(1, 1, 1), datetime(9999, 9, 9)]) + + result = ser.clip(lower=Timestamp.min, upper=Timestamp.max) + expected = Series([Timestamp.min, Timestamp.max], dtype="object") + + tm.assert_series_equal(result, expected) + def test_clip_pos_args_deprecation(self): # https://github.com/pandas-dev/pandas/issues/41485 ser = Series([1, 2, 3]) diff --git a/pandas/tests/series/methods/test_nlargest.py b/pandas/tests/series/methods/test_nlargest.py index 3af06145b9fcd..0efb0663a0327 100644 --- a/pandas/tests/series/methods/test_nlargest.py +++ b/pandas/tests/series/methods/test_nlargest.py @@ -211,3 +211,19 @@ def test_nlargest_boolean(self, data, expected): result = ser.nlargest(1) expected = Series(expected) tm.assert_series_equal(result, expected) + + def test_nlargest_nullable(self, any_nullable_numeric_dtype): + # GH#42816 + dtype = any_nullable_numeric_dtype + arr = np.random.randn(10).astype(dtype.lower(), copy=False) + + ser = Series(arr.copy(), dtype=dtype) + ser[1] = pd.NA + result = ser.nlargest(5) + + expected = ( + Series(np.delete(arr, 1), 
index=ser.index.delete(1)) + .nlargest(5) + .astype(dtype) + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_quantile.py b/pandas/tests/series/methods/test_quantile.py index 461c81bc3b44f..84bfe8524634b 100644 --- a/pandas/tests/series/methods/test_quantile.py +++ b/pandas/tests/series/methods/test_quantile.py @@ -217,3 +217,9 @@ def test_quantile_empty(self): res = s.quantile([0.5]) exp = Series([pd.NaT], index=[0.5]) tm.assert_series_equal(res, exp) + + @pytest.mark.parametrize("dtype", [int, float, "Int64"]) + def test_quantile_dtypes(self, dtype): + result = Series([1, 2, 3], dtype=dtype).quantile(np.arange(0, 1, 0.25)) + expected = Series(np.arange(1, 3, 0.5), index=np.arange(0, 1, 0.25)) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index ea95f90d3a2cb..e34ca9e7f9e27 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -69,6 +69,21 @@ def test_oo_optimizable(): subprocess.check_call([sys.executable, "-OO", "-c", "import pandas"]) +def test_oo_optimized_datetime_index_unpickle(): + # GH 42866 + subprocess.check_call( + [ + sys.executable, + "-OO", + "-c", + ( + "import pandas as pd, pickle; " + "pickle.loads(pickle.dumps(pd.date_range('2021-01-01', periods=1)))" + ), + ] + ) + + @tm.network # Cython import warning @pytest.mark.filterwarnings("ignore:pandas.util.testing is deprecated") diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 77ca482936298..5bf2df0208ddc 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -220,6 +220,36 @@ def test_datetimelike_centered_selections( tm.assert_frame_equal(result, expected, check_dtype=False) +@pytest.mark.parametrize( + "window,closed,expected", + [ + ("3s", "right", [3.0, 3.0, 3.0]), + ("3s", "both", [3.0, 3.0, 3.0]), + ("3s", "left", [3.0, 3.0, 3.0]), + ("3s", "neither", [3.0, 3.0, 
3.0]), + ("2s", "right", [3.0, 2.0, 2.0]), + ("2s", "both", [3.0, 3.0, 3.0]), + ("2s", "left", [1.0, 3.0, 3.0]), + ("2s", "neither", [1.0, 2.0, 2.0]), + ], +) +def test_datetimelike_centered_offset_covers_all( + window, closed, expected, frame_or_series +): + # GH 42753 + + index = [ + Timestamp("20130101 09:00:01"), + Timestamp("20130101 09:00:02"), + Timestamp("20130101 09:00:02"), + ] + df = frame_or_series([1, 1, 1], index=index) + + result = df.rolling(window, closed=closed, center=True).sum() + expected = frame_or_series(expected, index=index) + tm.assert_equal(result, expected) + + def test_even_number_window_alignment(): # see discussion in GH 38780 s = Series(range(3), index=date_range(start="2020-01-01", freq="D", periods=3)) diff --git a/pyproject.toml b/pyproject.toml index 86b255ab6bf58..0d1aefb9d4b55 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ # Minimum requirements for the build system to execute. # See https://github.com/scipy/scipy/pull/12940 for the AIX issue. 
requires = [ - "setuptools>=38.6.0", + "setuptools>=51.0.0", "wheel", "Cython>=0.29.21,<3", # Note: sync with setup.py # Numpy requirements for different OS/architectures diff --git a/requirements-dev.txt b/requirements-dev.txt index 712c9a4b89273..3b40c9c300ace 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -64,7 +64,7 @@ xlrd xlsxwriter xlwt odfpy -fastparquet>=0.3.2, <0.7.0 +fastparquet>=0.3.2 pyarrow>=0.17.0 python-snappy pyqt5>=5.9.2 @@ -73,7 +73,7 @@ s3fs>=0.4.0 fsspec>=0.7.4, <2021.6.0 gcsfs>=0.6.0 sqlalchemy -xarray +xarray<0.19 cftime pyreadstat tabulate>=0.8.3 @@ -81,3 +81,4 @@ natsort git+https://github.com/pydata/pydata-sphinx-theme.git@master numpydoc < 1.2 pandas-dev-flaker==0.2.0 +setuptools>=51.0.0 diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py index 1ad9ec03925a0..ec4b6c81f764c 100755 --- a/scripts/generate_pip_deps_from_conda.py +++ b/scripts/generate_pip_deps_from_conda.py @@ -13,17 +13,18 @@ $ python scripts/generate_pip_deps_from_conda.py --compare """ import argparse -import os +import pathlib import re import sys +import toml import yaml EXCLUDE = {"python", "c-compiler", "cxx-compiler"} RENAME = {"pytables": "tables", "pyqt": "pyqt5", "dask-core": "dask"} -def conda_package_to_pip(package): +def conda_package_to_pip(package: str): """ Convert a conda package to its pip equivalent. 
@@ -36,17 +37,13 @@ def conda_package_to_pip(package): package = re.sub("(?<=[^<>])=", "==", package).strip() for compare in ("<=", ">=", "=="): - if compare not in package: - continue + if compare in package: + pkg, version = package.split(compare) + if pkg in EXCLUDE: + return - pkg, version = package.split(compare) - if pkg in EXCLUDE: - return - - if pkg in RENAME: - return "".join((RENAME[pkg], compare, version)) - - break + if pkg in RENAME: + return "".join((RENAME[pkg], compare, version)) if package in EXCLUDE: return @@ -57,16 +54,18 @@ def conda_package_to_pip(package): return package -def main(conda_fname, pip_fname, compare=False): +def generate_pip_from_conda( + conda_path: pathlib.Path, pip_path: pathlib.Path, compare: bool = False +) -> bool: """ Generate the pip dependencies file from the conda file, or compare that they are synchronized (``compare=True``). Parameters ---------- - conda_fname : str + conda_path : pathlib.Path Path to the conda file with dependencies (e.g. `environment.yml`). - pip_fname : str + pip_path : pathlib.Path Path to the pip file with dependencies (e.g. `requirements-dev.txt`). 
compare : bool, default False Whether to generate the pip file (``False``) or to compare if the @@ -78,8 +77,8 @@ def main(conda_fname, pip_fname, compare=False): bool True if the comparison fails, False otherwise """ - with open(conda_fname) as conda_fd: - deps = yaml.safe_load(conda_fd)["dependencies"] + with conda_path.open() as file: + deps = yaml.safe_load(file)["dependencies"] pip_deps = [] for dep in deps: @@ -88,24 +87,30 @@ def main(conda_fname, pip_fname, compare=False): if conda_dep: pip_deps.append(conda_dep) elif isinstance(dep, dict) and len(dep) == 1 and "pip" in dep: - pip_deps += dep["pip"] + pip_deps.extend(dep["pip"]) else: raise ValueError(f"Unexpected dependency {dep}") - fname = os.path.split(conda_fname)[1] header = ( - f"# This file is auto-generated from {fname}, do not modify.\n" + f"# This file is auto-generated from {conda_path.name}, do not modify.\n" "# See that file for comments about the need/usage of each dependency.\n\n" ) pip_content = header + "\n".join(pip_deps) + "\n" + # add setuptools to requirements-dev.txt + meta = toml.load(pathlib.Path(conda_path.parent, "pyproject.toml")) + for requirement in meta["build-system"]["requires"]: + if "setuptools" in requirement: + pip_content += requirement + pip_content += "\n" + if compare: - with open(pip_fname) as pip_fd: - return pip_content != pip_fd.read() - else: - with open(pip_fname, "w") as pip_fd: - pip_fd.write(pip_content) - return False + with pip_path.open() as file: + return pip_content != file.read() + + with pip_path.open("w") as file: + file.write(pip_content) + return False if __name__ == "__main__": @@ -117,25 +122,20 @@ def main(conda_fname, pip_fname, compare=False): action="store_true", help="compare whether the two files are equivalent", ) - argparser.add_argument( - "--azure", action="store_true", help="show the output in azure-pipelines format" - ) args = argparser.parse_args() - repo_path = os.path.dirname(os.path.abspath(os.path.dirname(__file__))) - res = 
main( - os.path.join(repo_path, "environment.yml"), - os.path.join(repo_path, "requirements-dev.txt"), + conda_fname = "environment.yml" + pip_fname = "requirements-dev.txt" + repo_path = pathlib.Path(__file__).parent.parent.absolute() + res = generate_pip_from_conda( + pathlib.Path(repo_path, conda_fname), + pathlib.Path(repo_path, pip_fname), compare=args.compare, ) if res: msg = ( - f"`requirements-dev.txt` has to be generated with `{sys.argv[0]}` after " - "`environment.yml` is modified.\n" + f"`{pip_fname}` has to be generated with `{__file__}` after " + f"`{conda_fname}` is modified.\n" ) - if args.azure: - msg = ( - f"##vso[task.logissue type=error;sourcepath=requirements-dev.txt]{msg}" - ) sys.stderr.write(msg) sys.exit(res)