diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 31c926233d5b6..ffe615a63b7e3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,7 +4,7 @@ repos: hooks: - id: black - repo: https://gitlab.com/pycqa/flake8 - rev: 3.8.4 + rev: 3.9.2 hooks: - id: flake8 additional_dependencies: [flake8-comprehensions>=3.1.0] diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml index 4cb4eaf95f6f5..2bb348a11655c 100644 --- a/ci/azure/posix.yml +++ b/ci/azure/posix.yml @@ -62,7 +62,7 @@ jobs: ENV_FILE: ci/deps/azure-38-numpydev.yaml CONDA_PY: "38" PATTERN: "not slow and not network" - TEST_ARGS: "-W error" + TEST_ARGS: "-W error -W \"ignore:Promotion of numbers and bools:FutureWarning\"" PANDAS_TESTING_MODE: "deprecate" EXTRA_APT: "xsel" diff --git a/ci/deps/actions-37-cov.yaml b/ci/deps/actions-37-cov.yaml index 5381caaa242cf..5d9ba70e37307 100644 --- a/ci/deps/actions-37-cov.yaml +++ b/ci/deps/actions-37-cov.yaml @@ -15,8 +15,8 @@ dependencies: - beautifulsoup4 - botocore>=1.11 - dask - - fastparquet>=0.4.0 - - fsspec>=0.7.4 + - fastparquet>=0.4.0, <=0.5.0 + - fsspec>=0.7.4, <2021.6.0 - gcsfs>=0.6.0 - geopandas - html5lib @@ -30,7 +30,7 @@ dependencies: - openpyxl - pandas-gbq - google-cloud-bigquery>=1.27.2 # GH 36436 - - psycopg2 + - psycopg2=2.8.6 - pyarrow>=0.15.0 - pymysql<0.10.0 # temporary pin, GH 36465 - pytables diff --git a/ci/deps/azure-37.yaml b/ci/deps/azure-37.yaml index 4fe3de161960c..13b46b4d28828 100644 --- a/ci/deps/azure-37.yaml +++ b/ci/deps/azure-37.yaml @@ -14,8 +14,8 @@ dependencies: # pandas dependencies - botocore>=1.11 - - fsspec>=0.7.4 - - numpy + - fsspec>=0.7.4, <2021.6.0 + - numpy=1.19 - python-dateutil - nomkl - pyarrow=0.15.1 diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index e7ac4c783b855..dd67d9a9c1177 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -15,7 +15,7 @@ dependencies: # pandas dependencies - beautifulsoup4 - 
bottleneck - - fsspec>=0.8.0 + - fsspec>=0.8.0, <2021.6.0 - gcsfs>=0.6.0 - html5lib - jinja2 diff --git a/ci/deps/azure-windows-38.yaml b/ci/deps/azure-windows-38.yaml index 661d8813d32d2..fdea34d573340 100644 --- a/ci/deps/azure-windows-38.yaml +++ b/ci/deps/azure-windows-38.yaml @@ -15,7 +15,7 @@ dependencies: # pandas dependencies - blosc - bottleneck - - fastparquet>=0.4.0 + - fastparquet>=0.4.0, <=0.5.0 - flask - fsspec>=0.8.0 - matplotlib=3.1.3 diff --git a/ci/deps/travis-37-arm64.yaml b/ci/deps/circle-37-arm64.yaml similarity index 100% rename from ci/deps/travis-37-arm64.yaml rename to ci/deps/circle-37-arm64.yaml diff --git a/ci/setup_env.sh b/ci/setup_env.sh index c36422884f2ec..e6bd9950331ca 100755 --- a/ci/setup_env.sh +++ b/ci/setup_env.sh @@ -12,41 +12,30 @@ if [[ "$(uname)" == "Linux" && -n "$LC_ALL" ]]; then echo fi -MINICONDA_DIR="$HOME/miniconda3" - - -if [ -d "$MINICONDA_DIR" ]; then - echo - echo "rm -rf "$MINICONDA_DIR"" - rm -rf "$MINICONDA_DIR" -fi echo "Install Miniconda" -UNAME_OS=$(uname) -if [[ "$UNAME_OS" == 'Linux' ]]; then +DEFAULT_CONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest" +if [[ "$(uname -m)" == 'aarch64' ]]; then + CONDA_URL="https://github.com/conda-forge/miniforge/releases/download/4.10.1-4/Miniforge3-4.10.1-4-Linux-aarch64.sh" +elif [[ "$(uname)" == 'Linux' ]]; then if [[ "$BITS32" == "yes" ]]; then - CONDA_OS="Linux-x86" + CONDA_URL="$DEFAULT_CONDA_URL-Linux-x86.sh" else - CONDA_OS="Linux-x86_64" + CONDA_URL="$DEFAULT_CONDA_URL-Linux-x86_64.sh" fi -elif [[ "$UNAME_OS" == 'Darwin' ]]; then - CONDA_OS="MacOSX-x86_64" +elif [[ "$(uname)" == 'Darwin' ]]; then + CONDA_URL="$DEFAULT_CONDA_URL-MacOSX-x86_64.sh" else - echo "OS $UNAME_OS not supported" + echo "OS $(uname) not supported" exit 1 fi - -if [ "${TRAVIS_CPU_ARCH}" == "arm64" ]; then - 
CONDA_URL="https://github.com/conda-forge/miniforge/releases/download/4.8.5-1/Miniforge3-4.8.5-1-Linux-aarch64.sh" -else - CONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest-$CONDA_OS.sh" -fi +echo "Downloading $CONDA_URL" wget -q $CONDA_URL -O miniconda.sh chmod +x miniconda.sh -# Installation path is required for ARM64 platform as miniforge script installs in path $HOME/miniforge3. +MINICONDA_DIR="$HOME/miniconda3" +rm -rf $MINICONDA_DIR ./miniconda.sh -b -p $MINICONDA_DIR - export PATH=$MINICONDA_DIR/bin:$PATH echo diff --git a/doc/_templates/sidebar-nav-bs.html b/doc/_templates/sidebar-nav-bs.html new file mode 100644 index 0000000000000..7e0043e771e72 --- /dev/null +++ b/doc/_templates/sidebar-nav-bs.html @@ -0,0 +1,9 @@ + diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 8739694c20e33..e546c8c8b80e3 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -16,6 +16,7 @@ Version 1.2 .. toctree:: :maxdepth: 2 + v1.2.5 v1.2.4 v1.2.3 v1.2.2 diff --git a/doc/source/whatsnew/v1.2.4.rst b/doc/source/whatsnew/v1.2.4.rst index dd74091b64014..433ee37508e66 100644 --- a/doc/source/whatsnew/v1.2.4.rst +++ b/doc/source/whatsnew/v1.2.4.rst @@ -30,4 +30,4 @@ Fixed regressions Contributors ~~~~~~~~~~~~ -.. contributors:: v1.2.3..v1.2.4|HEAD +.. contributors:: v1.2.3..v1.2.4 diff --git a/doc/source/whatsnew/v1.2.5.rst b/doc/source/whatsnew/v1.2.5.rst new file mode 100644 index 0000000000000..d3ceb2b919b5d --- /dev/null +++ b/doc/source/whatsnew/v1.2.5.rst @@ -0,0 +1,31 @@ +.. _whatsnew_125: + +What's new in 1.2.5 (June 22, 2021) +----------------------------------- + +These are the changes in pandas 1.2.5. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +.. 
_whatsnew_125.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- Fixed regression in :func:`concat` between two :class:`DataFrame` where one has an :class:`Index` that is all-None and the other is :class:`DatetimeIndex` incorrectly raising (:issue:`40841`) +- Fixed regression in :meth:`DataFrame.sum` and :meth:`DataFrame.prod` when ``min_count`` and ``numeric_only`` are both given (:issue:`41074`) +- Fixed regression in :func:`read_csv` when using ``memory_map=True`` with a non-UTF8 encoding (:issue:`40986`) +- Fixed regression in :meth:`DataFrame.replace` and :meth:`Series.replace` when the values to replace are a NumPy float array (:issue:`40371`) +- Fixed regression in :func:`ExcelFile` when a corrupt file is opened but not closed (:issue:`41778`) +- Fixed regression in :meth:`DataFrame.astype` with ``dtype=str`` failing to convert ``NaN`` in categorical columns (:issue:`41797`) + +.. --------------------------------------------------------------------------- + +.. _whatsnew_125.contributors: + +Contributors +~~~~~~~~~~~~ + +.. 
contributors:: v1.2.4..v1.2.5|HEAD diff --git a/environment.yml b/environment.yml index a369f656cb575..ec3c31a15ee02 100644 --- a/environment.yml +++ b/environment.yml @@ -4,7 +4,7 @@ channels: dependencies: # required - numpy>=1.16.5 - - python=3 + - python=3.8 - python-dateutil>=2.7.3 - pytz @@ -20,7 +20,7 @@ dependencies: # code checks - black=20.8b1 - cpplint - - flake8 + - flake8=3.9.2 - flake8-comprehensions>=3.1.0 # used by flake8, linting of unnecessary comprehensions - isort>=5.2.1 # check that imports are in the right order - mypy=0.782 @@ -77,7 +77,7 @@ dependencies: - bottleneck>=1.2.1 - ipykernel - ipython>=7.11.1 - - jinja2 # pandas.Styler + - jinja2<3.0.0 # pandas.Styler - matplotlib>=2.2.2 # pandas.plotting, Series.plot, DataFrame.plot - numexpr>=2.6.8 - scipy>=1.2 @@ -97,14 +97,14 @@ dependencies: - xlwt - odfpy - - fastparquet>=0.3.2 # pandas.read_parquet, DataFrame.to_parquet + - fastparquet>=0.3.2, <=0.5.0 # pandas.read_parquet, DataFrame.to_parquet - pyarrow>=0.15.0 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather - python-snappy # required by pyarrow - pyqt>=5.9.2 # pandas.read_clipboard - pytables>=3.5.1 # pandas.read_hdf, DataFrame.to_hdf - s3fs>=0.4.0 # file IO when using 's3://...' path - - fsspec>=0.7.4 # for generic remote file operations + - fsspec>=0.7.4, <2021.6.0 # for generic remote file operations - gcsfs>=0.6.0 # file IO when using 'gcs://...' path - sqlalchemy # pandas.read_sql, DataFrame.to_sql - xarray # DataFrame.to_xarray diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 2ac9b9e2c875c..9aa261fd745d5 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -77,6 +77,18 @@ def is_platform_mac() -> bool: return sys.platform == "darwin" +def is_platform_arm() -> bool: + """ + Checking if the running platform uses ARM architecture. + + Returns + ------- + bool + True if the running platform uses ARM architecture. 
+ """ + return platform.machine() in ("arm64", "aarch64") + + def import_lzma(): """ Importing the `lzma` module. diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index d5819697066ef..8b683d32f7b45 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -9,7 +9,7 @@ from pandas._config import get_option -from pandas._libs import NaT, algos as libalgos, hashtable as htable +from pandas._libs import NaT, algos as libalgos, hashtable as htable, lib from pandas._libs.lib import no_default from pandas._typing import ArrayLike, Dtype, Ordered, Scalar from pandas.compat.numpy import function as nv @@ -429,6 +429,7 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: try: new_cats = np.asarray(self.categories) new_cats = new_cats.astype(dtype=dtype, copy=copy) + fill_value = lib.item_from_zerodim(np.array(np.nan).astype(dtype)) except ( TypeError, # downstream error msg for CategoricalIndex is misleading ValueError, @@ -436,7 +437,11 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}" raise ValueError(msg) - result = take_1d(new_cats, libalgos.ensure_platform_int(self._codes)) + result = take_1d( + new_cats, + libalgos.ensure_platform_int(self._codes), + fill_value=fill_value, + ) return result diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4c156d7470364..92892ac0f26e0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8786,7 +8786,6 @@ def _reduce( **kwds, ): - min_count = kwds.get("min_count", 0) assert filter_type is None or filter_type == "bool", filter_type out_dtype = "bool" if filter_type == "bool" else None @@ -8831,7 +8830,7 @@ def _get_data() -> DataFrame: data = self._get_bool_data() return data - if (numeric_only is not None or axis == 0) and min_count == 0: + if numeric_only is not None or axis == 0: # For numeric_only non-None and axis non-None, we know # which blocks to 
use and no try/except is needed. # For numeric_only=None only the case with axis==0 and no object diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 11d191597d61e..25860d6a4ecb3 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2742,7 +2742,8 @@ def _union(self, other, sort): # worth making this faster? a very unusual case value_set = set(lvals) result.extend([x for x in rvals if x not in value_set]) - result = Index(result)._values # do type inference here + # If objects are unorderable, we must have object dtype. + return np.array(result, dtype=object) else: # find indexes of things in "other" that are not in "self" if self.is_unique: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index b6bca855a9f05..a38b7a19dc80a 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -391,7 +391,7 @@ def reduce(self, func, ignore_failures: bool = False) -> List["Block"]: return [] raise - if np.ndim(result) == 0: + if self.values.ndim == 1: # TODO(EA2D): special case not needed with 2D EAs res_values = np.array([[result]]) else: @@ -860,6 +860,15 @@ def _replace_list( """ See BlockManager._replace_list docstring. """ + + # https://github.com/pandas-dev/pandas/issues/40371 + # the following pairs check code caused a regression so we catch that case here + # until the issue is fixed properly in can_hold_element + + # error: "Iterable[Any]" has no attribute "tolist" + if hasattr(src_list, "tolist"): + src_list = src_list.tolist() # type: ignore[attr-defined] + # Exclude anything that we know we won't contain pairs = [ (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index edc1b1e96509e..d42497c3a8322 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1377,8 +1377,15 @@ def _maybe_null_out( Dtype The product of all elements on a given axis. 
( NaNs are treated as 1) """ - if mask is not None and axis is not None and getattr(result, "ndim", False): - null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0 + if axis is not None and isinstance(result, np.ndarray): + if mask is not None: + null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0 + else: + # we have no nulls, kept mask=None in _maybe_get_mask + below_count = shape[axis] - min_count < 0 + new_shape = shape[:axis] + shape[axis + 1 :] + null_mask = np.broadcast_to(below_count, new_shape) + if np.any(null_mask): if is_numeric_dtype(result): if np.iscomplexobj(result): diff --git a/pandas/io/common.py b/pandas/io/common.py index be353fefdd1ef..e6b6471294ac7 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -568,7 +568,12 @@ def get_handle( # memory mapping needs to be the first step handle, memory_map, handles = _maybe_memory_map( - handle, memory_map, ioargs.encoding, ioargs.mode, errors + handle, + memory_map, + ioargs.encoding, + ioargs.mode, + errors, + ioargs.compression["method"] not in _compression_to_extension, ) is_path = isinstance(handle, str) @@ -759,7 +764,18 @@ class _MMapWrapper(abc.Iterator): """ - def __init__(self, f: IO): + def __init__( + self, + f: IO, + encoding: str = "utf-8", + errors: str = "strict", + decode: bool = True, + ): + self.encoding = encoding + self.errors = errors + self.decoder = codecs.getincrementaldecoder(encoding)(errors=errors) + self.decode = decode + self.attributes = {} for attribute in ("seekable", "readable", "writeable"): if not hasattr(f, attribute): @@ -775,19 +791,31 @@ def __getattr__(self, name: str): def __iter__(self) -> "_MMapWrapper": return self + def read(self, size: int = -1) -> Union[str, bytes]: + # CSV c-engine uses read instead of iterating + content: bytes = self.mmap.read(size) + if self.decode: + errors = self.errors if self.errors is not None else "strict" + # memory mapping is applied before compression. 
Encoding should + # be applied to the de-compressed data. + return content.decode(self.encoding, errors=errors) + return content + def __next__(self) -> str: newbytes = self.mmap.readline() # readline returns bytes, not str, but Python's CSV reader # expects str, so convert the output to str before continuing - newline = newbytes.decode("utf-8") + newline = self.decoder.decode(newbytes) # mmap doesn't raise if reading past the allocated # data but instead returns an empty string, so raise # if that is returned if newline == "": raise StopIteration - return newline + + # IncrementalDecoder seems to push newline to the next line + return newline.lstrip("\n") def _maybe_memory_map( @@ -796,6 +824,7 @@ def _maybe_memory_map( encoding: str, mode: str, errors: Optional[str], + decode: bool, ) -> Tuple[FileOrBuffer, bool, List[Buffer]]: """Try to memory map file/buffer.""" handles: List[Buffer] = [] @@ -814,7 +843,10 @@ def _maybe_memory_map( handles.append(handle) try: - wrapped = cast(mmap.mmap, _MMapWrapper(handle)) # type: ignore[arg-type] + wrapped = cast( + mmap.mmap, + _MMapWrapper(handle, encoding, errors, decode), # type: ignore[arg-type] + ) handle.close() handles.remove(handle) handles.append(wrapped) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 850570fc743b7..11dee69651dad 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -388,7 +388,11 @@ def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): elif hasattr(self.handles.handle, "read"): # N.B. 
xlrd.Book has a read attribute too self.handles.handle.seek(0) - self.book = self.load_workbook(self.handles.handle) + try: + self.book = self.load_workbook(self.handles.handle) + except Exception: + self.close() + raise elif isinstance(self.handles.handle, bytes): self.book = self.load_workbook(BytesIO(self.handles.handle)) else: @@ -406,6 +410,11 @@ def load_workbook(self, filepath_or_buffer): pass def close(self): + if hasattr(self, "book") and hasattr(self.book, "close"): + # pyxlsb: opens a TemporaryFile + # openpyxl: https://stackoverflow.com/questions/31416842/ + # openpyxl-does-not-close-excel-workbook-in-read-only-mode + self.book.close() self.handles.close() @property diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index be1587dbc010c..cac6d8d1b8113 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -487,12 +487,6 @@ def load_workbook(self, filepath_or_buffer: FilePathOrBuffer): filepath_or_buffer, read_only=True, data_only=True, keep_links=False ) - def close(self): - # https://stackoverflow.com/questions/31416842/ - # openpyxl-does-not-close-excel-workbook-in-read-only-mode - self.book.close() - super().close() - @property def sheet_names(self) -> List[str]: return self.book.sheetnames diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8ad86fd0a0dce..bbff9dfe1ddd0 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1868,31 +1868,6 @@ def __init__(self, src: FilePathOrBuffer, **kwds): assert self.handles is not None for key in ("storage_options", "encoding", "memory_map", "compression"): kwds.pop(key, None) - if self.handles.is_mmap and hasattr(self.handles.handle, "mmap"): - # pandas\io\parsers.py:1861: error: Item "IO[Any]" of - # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, - # TextIOWrapper, mmap]" has no attribute "mmap" [union-attr] - - # pandas\io\parsers.py:1861: error: Item "RawIOBase" of - # "Union[IO[Any], 
RawIOBase, BufferedIOBase, TextIOBase, - # TextIOWrapper, mmap]" has no attribute "mmap" [union-attr] - - # pandas\io\parsers.py:1861: error: Item "BufferedIOBase" of - # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, - # TextIOWrapper, mmap]" has no attribute "mmap" [union-attr] - - # pandas\io\parsers.py:1861: error: Item "TextIOBase" of - # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, - # TextIOWrapper, mmap]" has no attribute "mmap" [union-attr] - - # pandas\io\parsers.py:1861: error: Item "TextIOWrapper" of - # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, - # TextIOWrapper, mmap]" has no attribute "mmap" [union-attr] - - # pandas\io\parsers.py:1861: error: Item "mmap" of "Union[IO[Any], - # RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap]" has - # no attribute "mmap" [union-attr] - self.handles.handle = self.handles.handle.mmap # type: ignore[union-attr] try: self._reader = parsers.TextReader(self.handles.handle, **kwds) diff --git a/pandas/tests/arithmetic/common.py b/pandas/tests/arithmetic/common.py index e26bb513838a5..d42da39ec8ff0 100644 --- a/pandas/tests/arithmetic/common.py +++ b/pandas/tests/arithmetic/common.py @@ -83,6 +83,7 @@ def xbox2(x): "Invalid comparison between", "Cannot compare type", "not supported between", + "could not be promoted", "invalid type promotion", ( # GH#36706 npdev 1.20.0 2020-09-28 diff --git a/pandas/tests/arithmetic/test_interval.py b/pandas/tests/arithmetic/test_interval.py index 6dc3b3b13dd0c..4473d86fa04a1 100644 --- a/pandas/tests/arithmetic/test_interval.py +++ b/pandas/tests/arithmetic/test_interval.py @@ -235,7 +235,7 @@ def test_compare_list_like_nan(self, op, array, nulls_fixture, request): Categorical(list("abab")), Categorical(date_range("2017-01-01", periods=4)), pd.array(list("abcd")), - pd.array(["foo", 3.14, None, object()]), + pd.array(["foo", 3.14, None, object()], dtype=object), ], ids=lambda x: str(x.dtype), ) diff --git 
a/pandas/tests/arrays/boolean/test_arithmetic.py b/pandas/tests/arrays/boolean/test_arithmetic.py index 9b854a81f2def..01de64568a011 100644 --- a/pandas/tests/arrays/boolean/test_arithmetic.py +++ b/pandas/tests/arrays/boolean/test_arithmetic.py @@ -66,10 +66,7 @@ def test_div(left_array, right_array): @pytest.mark.parametrize( "opname", [ - pytest.param( - "floordiv", - marks=pytest.mark.xfail(reason="NumpyDev GH#40874", strict=False), - ), + "floordiv", "mod", pytest.param( "pow", marks=pytest.mark.xfail(reason="TODO follow int8 behaviour? GH34686") diff --git a/pandas/tests/arrays/masked/test_arithmetic.py b/pandas/tests/arrays/masked/test_arithmetic.py index 34686f6052131..148b7092abb56 100644 --- a/pandas/tests/arrays/masked/test_arithmetic.py +++ b/pandas/tests/arrays/masked/test_arithmetic.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas.compat.numpy import is_numpy_dev - import pandas as pd import pandas._testing as tm from pandas.core.arrays import ExtensionArray @@ -51,8 +49,6 @@ def test_array_scalar_like_equivalence(data, all_arithmetic_operators): def test_array_NA(data, all_arithmetic_operators): if "truediv" in all_arithmetic_operators: pytest.skip("division with pd.NA raises") - if "floordiv" in all_arithmetic_operators and is_numpy_dev: - pytest.skip("NumpyDev behavior GH#40874") data, _ = data op = tm.get_op_from_name(all_arithmetic_operators) check_skip(data, all_arithmetic_operators) diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index d15c822f22c14..86a0bc9213256 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -16,8 +16,6 @@ import numpy as np import pytest -from pandas.compat.numpy import is_numpy_dev - import pandas as pd import pandas._testing as tm from pandas.core.arrays.boolean import BooleanDtype @@ -141,21 +139,6 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError): with pytest.raises(exc): op(s, other) - 
def test_arith_series_with_scalar(self, data, all_arithmetic_operators): - if "floordiv" in all_arithmetic_operators and is_numpy_dev: - pytest.skip("NumpyDev behavior GH#40874") - super().test_arith_series_with_scalar(data, all_arithmetic_operators) - - def test_arith_series_with_array(self, data, all_arithmetic_operators): - if "floordiv" in all_arithmetic_operators and is_numpy_dev: - pytest.skip("NumpyDev behavior GH#40874") - super().test_arith_series_with_scalar(data, all_arithmetic_operators) - - def test_divmod_series_array(self, data, data_for_twos): - if is_numpy_dev: - pytest.skip("NumpyDev behavior GH#40874") - super().test_divmod_series_array(data, data_for_twos) - def _check_divmod_op(self, s, op, other, exc=None): # override to not raise an error super()._check_divmod_op(s, op, other, None) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 5264d4b432d34..0d160966521d3 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -616,3 +616,11 @@ def test_astype_bytes(self): # GH#39474 result = DataFrame(["foo", "bar", "baz"]).astype(bytes) assert result.dtypes[0] == np.dtype("S3") + + def test_astype_categorical_to_string_missing(self): + # https://github.com/pandas-dev/pandas/issues/41797 + df = DataFrame(["a", "b", np.nan]) + expected = df.astype(str) + cat = df.astype("category") + result = cat.astype(str) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index c4f2e09911b34..0f85af6b26aa3 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1665,3 +1665,22 @@ def test_replace_bytes(self, frame_or_series): expected = obj.copy() obj = obj.replace({None: np.nan}) tm.assert_equal(obj, expected) + + @pytest.mark.parametrize( + "data, to_replace, value, expected", + [ + ([1], [1.0], 
[0], [0]), + ([1], [1], [0], [0]), + ([1.0], [1.0], [0], [0.0]), + ([1.0], [1], [0], [0.0]), + ], + ) + @pytest.mark.parametrize("box", [list, tuple, np.array]) + def test_replace_list_with_mixed_type( + self, data, to_replace, value, expected, box, frame_or_series + ): + # GH#40371 + obj = frame_or_series(data) + expected = frame_or_series(expected) + result = obj.replace(box(to_replace), value) + tm.assert_equal(result, expected) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index e5ec3c5641bd2..51420859dc1bd 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -79,6 +79,7 @@ def check(df, df2): msgs = [ r"Invalid comparison between dtype=datetime64\[ns\] and ndarray", "invalid type promotion", + "could not be promoted", ( # npdev 1.20.0 r"The DTypes and " diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index cb481613eb97f..b6eccc6999dec 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1,5 +1,6 @@ from datetime import timedelta from decimal import Decimal +import re from dateutil.tz import tzlocal import numpy as np @@ -783,34 +784,35 @@ def test_sum_corner(self): assert len(axis1) == 0 @pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)]) - def test_sum_prod_nanops(self, method, unit): + @pytest.mark.parametrize("numeric_only", [None, True, False]) + def test_sum_prod_nanops(self, method, unit, numeric_only): idx = ["a", "b", "c"] df = DataFrame({"a": [unit, unit], "b": [unit, np.nan], "c": [np.nan, np.nan]}) # The default - result = getattr(df, method) + result = getattr(df, method)(numeric_only=numeric_only) expected = Series([unit, unit, unit], index=idx, dtype="float64") # min_count=1 - result = getattr(df, method)(min_count=1) + result = getattr(df, method)(numeric_only=numeric_only, min_count=1) expected = Series([unit, unit, np.nan], index=idx) 
tm.assert_series_equal(result, expected) # min_count=0 - result = getattr(df, method)(min_count=0) + result = getattr(df, method)(numeric_only=numeric_only, min_count=0) expected = Series([unit, unit, unit], index=idx, dtype="float64") tm.assert_series_equal(result, expected) - result = getattr(df.iloc[1:], method)(min_count=1) + result = getattr(df.iloc[1:], method)(numeric_only=numeric_only, min_count=1) expected = Series([unit, np.nan, np.nan], index=idx) tm.assert_series_equal(result, expected) # min_count > 1 df = DataFrame({"A": [unit] * 10, "B": [unit] * 5 + [np.nan] * 5}) - result = getattr(df, method)(min_count=5) + result = getattr(df, method)(numeric_only=numeric_only, min_count=5) expected = Series(result, index=["A", "B"]) tm.assert_series_equal(result, expected) - result = getattr(df, method)(min_count=6) + result = getattr(df, method)(numeric_only=numeric_only, min_count=6) expected = Series(result, index=["A", "B"]) tm.assert_series_equal(result, expected) @@ -1491,3 +1493,16 @@ def test_minmax_extensionarray(method, numeric_only): [getattr(int64_info, method)], index=Index(["Int64"], dtype="object") ) tm.assert_series_equal(result, expected) + + +def test_prod_sum_min_count_mixed_object(): + # https://github.com/pandas-dev/pandas/issues/41074 + df = DataFrame([1, "a", True]) + + result = df.prod(axis=0, min_count=1, numeric_only=False) + expected = Series(["a"]) + tm.assert_series_equal(result, expected) + + msg = re.escape("unsupported operand type(s) for +: 'int' and 'str'") + with pytest.raises(TypeError, match=msg): + df.sum(axis=0, min_count=1, numeric_only=False) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index bd3bfa207c4b0..e4006d9068f2a 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1,6 +1,7 @@ from datetime import datetime, time from functools import partial import os +from pathlib import Path from urllib.error import 
URLError from zipfile import BadZipFile @@ -1273,3 +1274,21 @@ def test_read_datetime_multiindex(self, engine, read_ext): expected = DataFrame([], columns=expected_column_index) tm.assert_frame_equal(expected, actual) + + def test_corrupt_files_closed(self, request, engine, read_ext): + # GH41778 + errors = (BadZipFile, ValueError) + if engine is None: + pytest.skip() + elif engine == "xlrd": + import xlrd + + errors = (BadZipFile, ValueError, xlrd.biffh.XLRDError) + + with tm.ensure_clean(f"corrupt{read_ext}") as file: + Path(file).write_text("corrupt") + with tm.assert_produces_warning(False): + try: + pd.ExcelFile(file, engine=engine) + except errors: + pass diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index e74265da3e966..41e1964086dce 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -217,3 +217,20 @@ def test_parse_encoded_special_characters(encoding): expected = DataFrame(data=[[":foo", 0], ["bar", 1], ["baz", 2]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("encoding", ["utf-8", None, "utf-16", "cp1255", "latin-1"]) +def test_encoding_memory_map(all_parsers, encoding): + # GH40986 + parser = all_parsers + expected = DataFrame( + { + "name": ["Raphael", "Donatello", "Miguel Angel", "Leonardo"], + "mask": ["red", "purple", "orange", "blue"], + "weapon": ["sai", "bo staff", "nunchunk", "katana"], + } + ) + with tm.ensure_clean() as file: + expected.to_csv(file, index=False, encoding=encoding) + df = parser.read_csv(file, encoding=encoding, memory_map=True) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index cd58df4fc5da6..3b11a5dbda41c 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -572,3 +572,23 @@ def test_concat_repeated_keys(keys, integrity): tuples = 
list(zip(keys, ["a", "b", "c"])) expected = Series([1, 2, 3], index=MultiIndex.from_tuples(tuples)) tm.assert_series_equal(result, expected) + + +def test_concat_null_object_with_dti(): + # GH#40841 + dti = pd.DatetimeIndex( + ["2021-04-08 21:21:14+00:00"], dtype="datetime64[ns, UTC]", name="Time (UTC)" + ) + right = DataFrame(data={"C": [0.5274]}, index=dti) + + idx = Index([None], dtype="object", name="Maybe Time (UTC)") + left = DataFrame(data={"A": [None], "B": [np.nan]}, index=idx) + + result = concat([left, right], axis="columns") + + exp_index = Index([None, dti[0]], dtype=object) + expected = DataFrame( + {"A": [None, None], "B": [np.nan, np.nan], "C": [np.nan, 0.5274]}, + index=exp_index, + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index e2cdf76d038ec..218d247a25380 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -3,6 +3,7 @@ import numpy as np import pytest +from pandas.compat import is_platform_arm from pandas.errors import UnsupportedFunctionCall from pandas import ( @@ -891,6 +892,7 @@ def test_rolling_sem(frame_or_series): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(is_platform_arm(), reason="GH 41740") @pytest.mark.parametrize( ("func", "third_value", "values"), [ diff --git a/requirements-dev.txt b/requirements-dev.txt index 595b2ee537e63..78aca0913b006 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -8,7 +8,7 @@ asv cython>=0.29.21 black==20.8b1 cpplint -flake8 +flake8==3.9.2 flake8-comprehensions>=3.1.0 isort>=5.2.1 mypy==0.782 @@ -49,7 +49,7 @@ blosc bottleneck>=1.2.1 ipykernel ipython>=7.11.1 -jinja2 +jinja2<3.0.0 matplotlib>=2.2.2 numexpr>=2.6.8 scipy>=1.2 @@ -62,13 +62,13 @@ xlrd xlsxwriter xlwt odfpy -fastparquet>=0.3.2 +fastparquet>=0.3.2, <=0.5.0 pyarrow>=0.15.0 python-snappy pyqt5>=5.9.2 tables>=3.5.1 s3fs>=0.4.0 -fsspec>=0.7.4 +fsspec>=0.7.4, <2021.6.0 gcsfs>=0.6.0 
sqlalchemy xarray diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py index 7e4c68ddc183b..cbf3e84044d53 100644 --- a/scripts/tests/test_validate_docstrings.py +++ b/scripts/tests/test_validate_docstrings.py @@ -165,7 +165,7 @@ def test_bad_class(self, capsys): "indentation_is_not_a_multiple_of_four", # with flake8 3.9.0, the message ends with four spaces, # whereas in earlier versions, it ended with "four" - ("flake8 error: E111 indentation is not a multiple of ",), + ("flake8 error: E111 indentation is not a multiple of 4",), ), ( "BadDocstrings",