diff --git a/.circleci/config.yml b/.circleci/config.yml
index ac9db5f451bf3..90afb1ce29684 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -18,6 +18,29 @@ jobs:
PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH
LD_PRELOAD=$HOME/miniconda3/envs/pandas-dev/lib/libgomp.so.1:$LD_PRELOAD
ci/run_tests.sh
+ linux-musl:
+ docker:
+ - image: quay.io/pypa/musllinux_1_1_aarch64
+ resource_class: arm.large
+ steps:
+      # Install packages first so git is available in the image
+      # (needed for the checkout step)
+ - run: |
+ apk update
+ apk add git
+ apk add musl-locales
+ - checkout
+ - run: |
+ /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev
+ . ~/virtualenvs/pandas-dev/bin/activate
+ python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1
+ python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1
+ python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror"
+ python -m pip list --no-cache-dir
+ - run: |
+ . ~/virtualenvs/pandas-dev/bin/activate
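+        # PANDAS_CI=1 signals to the test suite that it is running in CI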
+ export PANDAS_CI=1
+ python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml
build-aarch64:
parameters:
cibw-build:
@@ -46,9 +69,15 @@ jobs:
fi
- run:
name: Build aarch64 wheels
+          no_output_timeout: 30m # The tests sometimes produce no output for a while; make sure the job isn't killed because of that
command: |
- pip3 install cibuildwheel==2.14.1
+ pip3 install cibuildwheel==2.15.0
+ # When this is a nightly wheel build, allow picking up NumPy 2.0 dev wheels:
+ if [[ "$IS_SCHEDULE_DISPATCH" == "true" || "$IS_PUSH" != 'true' ]]; then
+ export CIBW_ENVIRONMENT="PIP_EXTRA_INDEX_URL=https://pypi.anaconda.org/scientific-python-nightly-wheels/simple"
+ fi
cibuildwheel --prerelease-pythons --output-dir wheelhouse
+
environment:
CIBW_BUILD: << parameters.cibw-build >>
@@ -83,6 +112,13 @@ workflows:
equal: [ scheduled_pipeline, << pipeline.trigger_source >> ]
jobs:
- test-arm
+ test-musl:
+    # Don't trigger this one when the scheduled pipeline runs
+ when:
+ not:
+ equal: [ scheduled_pipeline, << pipeline.trigger_source >> ]
+ jobs:
+ - linux-musl
build-wheels:
jobs:
- build-aarch64:
@@ -91,5 +127,11 @@ workflows:
only: /^v.*/
matrix:
parameters:
- # TODO: Enable Python 3.12 wheels when numpy releases a version that supports Python 3.12
- cibw-build: ["cp39-manylinux_aarch64", "cp310-manylinux_aarch64", "cp311-manylinux_aarch64"]#, "cp312-manylinux_aarch64"]
+ cibw-build: ["cp39-manylinux_aarch64",
+ "cp310-manylinux_aarch64",
+ "cp311-manylinux_aarch64",
+ "cp312-manylinux_aarch64",
+ "cp39-musllinux_aarch64",
+ "cp310-musllinux_aarch64",
+ "cp311-musllinux_aarch64",
+ "cp312-musllinux_aarch64",]
diff --git a/.circleci/setup_env.sh b/.circleci/setup_env.sh
index 4f81acb6d2099..eef4db1191a9a 100755
--- a/.circleci/setup_env.sh
+++ b/.circleci/setup_env.sh
@@ -55,6 +55,6 @@ if pip show pandas 1>/dev/null; then
fi
echo "Install pandas"
-python -m pip install --no-build-isolation -ve .
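+# Pass meson's --werror so compiler warnings fail the build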
+python -m pip install --no-build-isolation -ve . --config-settings=setup-args="--werror"
echo "done"
diff --git a/.github/actions/build_pandas/action.yml b/.github/actions/build_pandas/action.yml
index 73d7723e2fb49..63f687324b0ae 100644
--- a/.github/actions/build_pandas/action.yml
+++ b/.github/actions/build_pandas/action.yml
@@ -4,6 +4,12 @@ inputs:
editable:
description: Whether to build pandas in editable mode (default true)
default: true
+ meson_args:
+ description: Extra flags to pass to meson
+ required: false
+ cflags_adds:
+ description: Items to append to the CFLAGS variable
+ required: false
runs:
using: composite
steps:
@@ -24,9 +30,12 @@ runs:
- name: Build Pandas
run: |
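+        # Append any extra flags supplied by the caller (e.g. sanitizer flags) to CFLAGS before building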
+ export CFLAGS="$CFLAGS ${{ inputs.cflags_adds }}"
if [[ ${{ inputs.editable }} == "true" ]]; then
- pip install -e . --no-build-isolation -v
+ pip install -e . --no-build-isolation -v --no-deps ${{ inputs.meson_args }} \
+ --config-settings=setup-args="--werror"
else
- pip install . --no-build-isolation -v
+ pip install . --no-build-isolation -v --no-deps ${{ inputs.meson_args }} \
+ --config-settings=setup-args="--werror"
fi
shell: bash -el {0}
diff --git a/.github/actions/run-tests/action.yml b/.github/actions/run-tests/action.yml
index fd7c3587f2254..b4778b74df335 100644
--- a/.github/actions/run-tests/action.yml
+++ b/.github/actions/run-tests/action.yml
@@ -1,9 +1,16 @@
name: Run tests and report results
+inputs:
+ preload:
+ description: Preload arguments for sanitizer
+ required: false
+ asan_options:
+ description: Arguments for Address Sanitizer (ASAN)
+ required: false
runs:
using: composite
steps:
- name: Test
- run: ci/run_tests.sh
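+      # asan_options and preload are per-command environment assignments
+      # (ASAN_OPTIONS=..., LD_PRELOAD=...) prepended to the test command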
+ run: ${{ inputs.asan_options }} ${{ inputs.preload }} ci/run_tests.sh
shell: bash -el {0}
- name: Publish test results
diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml
index 84e81b9a9297f..ceeebfcd1c90c 100644
--- a/.github/actions/setup-conda/action.yml
+++ b/.github/actions/setup-conda/action.yml
@@ -11,6 +11,6 @@ runs:
with:
environment-file: ${{ inputs.environment-file }}
environment-name: test
- condarc-file: ci/condarc.yml
+ condarc-file: ci/.condarc
cache-environment: true
cache-downloads: true
diff --git a/.github/workflows/broken-linkcheck.yml b/.github/workflows/broken-linkcheck.yml
new file mode 100644
index 0000000000000..191252cccf1c3
--- /dev/null
+++ b/.github/workflows/broken-linkcheck.yml
@@ -0,0 +1,39 @@
+name: Linkcheck
+on:
+ schedule:
+ # Run monthly on the 1st day of the month
+ - cron: '0 0 1 * *'
+ pull_request:
+ paths:
+ - ".github/workflows/broken-linkcheck.yml"
+ - "doc/make.py"
+jobs:
+ linkcheck:
+ if: false
+ runs-on: ubuntu-latest
+ defaults:
+ run:
+ shell: bash -el {0}
+
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
+
+ - name: Set up Conda
+ uses: ./.github/actions/setup-conda
+
+ - name: Build Pandas
+ uses: ./.github/actions/build_pandas
+
+ - name: Run linkcheck script
+ working-directory: ./doc
+ run: |
+ set -o pipefail
+ python make.py linkcheck | tee linkcheck.txt
+
+ - name: Display broken links
+ if: failure()
+ working-directory: ./doc
+ run: grep broken linkcheck.txt
diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml
index 0954bf7d9231c..8e29d56f47dcf 100644
--- a/.github/workflows/code-checks.yml
+++ b/.github/workflows/code-checks.yml
@@ -4,11 +4,11 @@ on:
push:
branches:
- main
- - 2.0.x
+ - 2.2.x
pull_request:
branches:
- main
- - 2.0.x
+ - 2.2.x
env:
ENV_FILE: environment.yml
@@ -33,7 +33,7 @@ jobs:
steps:
- name: Checkout
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
@@ -109,7 +109,7 @@ jobs:
steps:
- name: Checkout
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
@@ -124,7 +124,7 @@ jobs:
run: |
cd asv_bench
asv machine --yes
- asv run --quick --dry-run --strict --durations=30 --python=same
+ asv run --quick --dry-run --durations=30 --python=same --show-stderr
build_docker_dev_environment:
name: Build Docker Dev Environment
@@ -143,7 +143,7 @@ jobs:
run: docker image prune -f
- name: Checkout
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
@@ -164,13 +164,13 @@ jobs:
steps:
- name: Checkout
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Setup Python
id: setup_python
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v5
with:
python-version: '3.10'
cache: 'pip'
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 8715c5306a3b0..4d0066bc0b48d 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -27,9 +27,9 @@ jobs:
- python
steps:
- - uses: actions/checkout@v3
- - uses: github/codeql-action/init@v2
+ - uses: actions/checkout@v4
+ - uses: github/codeql-action/init@v3
with:
languages: ${{ matrix.language }}
- - uses: github/codeql-action/autobuild@v2
- - uses: github/codeql-action/analyze@v2
+ - uses: github/codeql-action/autobuild@v3
+ - uses: github/codeql-action/analyze@v3
diff --git a/.github/workflows/comment-commands.yml b/.github/workflows/comment-commands.yml
index 2550d4de34a45..425abef850184 100644
--- a/.github/workflows/comment-commands.yml
+++ b/.github/workflows/comment-commands.yml
@@ -51,7 +51,7 @@ jobs:
steps:
- name: Checkout
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
@@ -77,7 +77,7 @@ jobs:
echo 'EOF' >> $GITHUB_ENV
echo "REGEX=$REGEX" >> $GITHUB_ENV
- - uses: actions/github-script@v6
+ - uses: actions/github-script@v7
env:
BENCH_OUTPUT: ${{env.BENCH_OUTPUT}}
REGEX: ${{env.REGEX}}
diff --git a/.github/workflows/deprecation-tracking-bot.yml b/.github/workflows/deprecation-tracking-bot.yml
index b3f9bcd840c68..ec71daf6f84ab 100644
--- a/.github/workflows/deprecation-tracking-bot.yml
+++ b/.github/workflows/deprecation-tracking-bot.yml
@@ -21,7 +21,7 @@ jobs:
env:
DEPRECATION_TRACKER_ISSUE: 50578
steps:
- - uses: actions/github-script@v6
+ - uses: actions/github-script@v7
id: update-deprecation-issue
with:
script: |
diff --git a/.github/workflows/docbuild-and-upload.yml b/.github/workflows/docbuild-and-upload.yml
index 9840596357d26..73acd9acc129a 100644
--- a/.github/workflows/docbuild-and-upload.yml
+++ b/.github/workflows/docbuild-and-upload.yml
@@ -4,13 +4,13 @@ on:
push:
branches:
- main
- - 2.0.x
+ - 2.2.x
tags:
- '*'
pull_request:
branches:
- main
- - 2.0.x
+ - 2.2.x
env:
ENV_FILE: environment.yml
@@ -36,7 +36,7 @@ jobs:
steps:
- name: Checkout
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
@@ -46,6 +46,9 @@ jobs:
- name: Build Pandas
uses: ./.github/actions/build_pandas
+ - name: Test website
+ run: python -m pytest web/
+
- name: Build website
run: python web/pandas_web.py web/pandas --target-path=web/build
@@ -67,7 +70,7 @@ jobs:
run: cp doc/cheatsheet/Pandas_Cheat_Sheet* web/build/
- name: Upload web
- run: rsync -az --delete --exclude='pandas-docs' --exclude='docs' web/build/ web@${{ secrets.server_ip }}:/var/www/html
+ run: rsync -az --delete --exclude='pandas-docs' --exclude='docs' --exclude='benchmarks' web/build/ web@${{ secrets.server_ip }}:/var/www/html
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
- name: Upload dev docs
@@ -82,7 +85,7 @@ jobs:
run: mv doc/build/html web/build/docs
- name: Save website as an artifact
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: website
path: web/build
diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml
index 6c9ba89d82794..d59ddf272f705 100644
--- a/.github/workflows/package-checks.yml
+++ b/.github/workflows/package-checks.yml
@@ -4,11 +4,11 @@ on:
push:
branches:
- main
- - 2.0.x
+ - 2.2.x
pull_request:
branches:
- main
- - 2.0.x
+ - 2.2.x
types: [ labeled, opened, synchronize, reopened ]
permissions:
@@ -24,7 +24,7 @@ jobs:
runs-on: ubuntu-22.04
strategy:
matrix:
- extra: ["test", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output_formatting", "clipboard", "compression", "consortium-standard", "all"]
+ extra: ["test", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output-formatting", "clipboard", "compression", "consortium-standard", "all"]
fail-fast: false
name: Install Extras - ${{ matrix.extra }}
concurrency:
@@ -34,13 +34,13 @@ jobs:
steps:
- name: Checkout
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Setup Python
id: setup_python
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v5
with:
python-version: '3.10'
@@ -62,7 +62,7 @@ jobs:
cancel-in-progress: true
steps:
- name: Checkout
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
diff --git a/.github/workflows/stale-pr.yml b/.github/workflows/stale-pr.yml
index 11b81d11f7876..792afe8f4faf5 100644
--- a/.github/workflows/stale-pr.yml
+++ b/.github/workflows/stale-pr.yml
@@ -14,7 +14,7 @@ jobs:
if: github.repository_owner == 'pandas-dev'
runs-on: ubuntu-22.04
steps:
- - uses: actions/stale@v8
+ - uses: actions/stale@v9
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
stale-pr-message: "This pull request is stale because it has been open for thirty days with no activity. Please [update](https://pandas.pydata.org/pandas-docs/stable/development/contributing.html#updating-your-pull-request) and respond to this comment if you're still interested in working on this."
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index 1770d18d4eb41..a3cffb4b03b93 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -4,11 +4,11 @@ on:
push:
branches:
- main
- - 2.0.x
+ - 2.2.x
pull_request:
branches:
- main
- - 2.0.x
+ - 2.2.x
paths-ignore:
- "doc/**"
- "web/**"
@@ -23,10 +23,10 @@ defaults:
jobs:
ubuntu:
runs-on: ubuntu-22.04
- timeout-minutes: 180
+ timeout-minutes: 90
strategy:
matrix:
- env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml]
+ env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml]
# Prevent the include jobs from overriding other jobs
pattern: [""]
include:
@@ -69,6 +69,22 @@ jobs:
env_file: actions-311.yaml
pattern: "not slow and not network and not single_cpu"
pandas_copy_on_write: "1"
+ - name: "Copy-on-Write 3.12"
+ env_file: actions-312.yaml
+ pattern: "not slow and not network and not single_cpu"
+ pandas_copy_on_write: "1"
+ - name: "Copy-on-Write 3.11 (warnings)"
+ env_file: actions-311.yaml
+ pattern: "not slow and not network and not single_cpu"
+ pandas_copy_on_write: "warn"
+ - name: "Copy-on-Write 3.10 (warnings)"
+ env_file: actions-310.yaml
+ pattern: "not slow and not network and not single_cpu"
+ pandas_copy_on_write: "warn"
+ - name: "Copy-on-Write 3.9 (warnings)"
+ env_file: actions-39.yaml
+ pattern: "not slow and not network and not single_cpu"
+ pandas_copy_on_write: "warn"
- name: "Pypy"
env_file: actions-pypy-39.yaml
pattern: "not slow and not network and not single_cpu"
@@ -76,29 +92,38 @@ jobs:
- name: "Numpy Dev"
env_file: actions-311-numpydev.yaml
pattern: "not slow and not network and not single_cpu"
- test_args: "-W error::DeprecationWarning -W error::FutureWarning"
- # TODO(cython3): Re-enable once next-beta(after beta 1) comes out
- # There are some warnings failing the build with -werror
- pandas_ci: "0"
+          # Currently only warnings coming from numpy are turned into errors,
+          # since pyarrow isn't always compatible with numpy dev.
+          # TODO: work with pyarrow to revert this?
+ test_args: "-W error::DeprecationWarning:numpy -W error::FutureWarning:numpy"
- name: "Pyarrow Nightly"
env_file: actions-311-pyarrownightly.yaml
pattern: "not slow and not network and not single_cpu"
+ - name: "ASAN / UBSAN"
+ env_file: actions-311-sanitizers.yaml
+ pattern: "not slow and not network and not single_cpu and not skip_ubsan"
+ asan_options: "ASAN_OPTIONS=detect_leaks=0"
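+          # Preload libasan so the sanitizer runtime is loaded before the (uninstrumented) Python interpreter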
+ preload: LD_PRELOAD=$(gcc -print-file-name=libasan.so)
+ meson_args: --config-settings=setup-args="-Db_sanitize=address,undefined"
+ cflags_adds: -fno-sanitize-recover=all
+ pytest_workers: -1 # disable pytest-xdist as it swallows stderr from ASAN
fail-fast: false
name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }}
env:
- ENV_FILE: ci/deps/${{ matrix.env_file }}
PATTERN: ${{ matrix.pattern }}
- EXTRA_APT: ${{ matrix.extra_apt || '' }}
LANG: ${{ matrix.lang || 'C.UTF-8' }}
LC_ALL: ${{ matrix.lc_all || '' }}
PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }}
PANDAS_CI: ${{ matrix.pandas_ci || '1' }}
TEST_ARGS: ${{ matrix.test_args || '' }}
- PYTEST_WORKERS: 'auto'
+ PYTEST_WORKERS: ${{ matrix.pytest_workers || 'auto' }}
PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }}
+ NPY_PROMOTION_STATE: ${{ matrix.env_file == 'actions-311-numpydev.yaml' && 'weak' || 'legacy' }}
+ # Clipboard tests
+ QT_QPA_PLATFORM: offscreen
concurrency:
# https://github.community/t/concurrecy-not-work-for-push/183068/7
- group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}
+ group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_copy_on_write || '' }}
cancel-in-progress: true
services:
@@ -140,49 +165,57 @@ jobs:
steps:
- name: Checkout
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Extra installs
- # xsel for clipboard tests
- run: sudo apt-get update && sudo apt-get install -y xsel ${{ env.EXTRA_APT }}
+ run: sudo apt-get update && sudo apt-get install -y ${{ matrix.extra_apt }}
+ if: ${{ matrix.extra_apt }}
- name: Generate extra locales
# These extra locales will be available for locale.setlocale() calls in tests
- run: |
- sudo locale-gen ${{ matrix.extra_loc }}
+ run: sudo locale-gen ${{ matrix.extra_loc }}
if: ${{ matrix.extra_loc }}
- name: Set up Conda
uses: ./.github/actions/setup-conda
with:
- environment-file: ${{ env.ENV_FILE }}
+ environment-file: ci/deps/${{ matrix.env_file }}
- name: Build Pandas
id: build
uses: ./.github/actions/build_pandas
+ with:
+ meson_args: ${{ matrix.meson_args }}
+ cflags_adds: ${{ matrix.cflags_adds }}
- name: Test (not single_cpu)
uses: ./.github/actions/run-tests
if: ${{ matrix.name != 'Pypy' }}
+ with:
+ preload: ${{ matrix.preload }}
+ asan_options: ${{ matrix.asan_options }}
env:
# Set pattern to not single_cpu if not already set
PATTERN: ${{ env.PATTERN == '' && 'not single_cpu' || matrix.pattern }}
- name: Test (single_cpu)
uses: ./.github/actions/run-tests
+ with:
+ preload: ${{ matrix.preload }}
+ asan_options: ${{ matrix.asan_options }}
env:
PATTERN: 'single_cpu'
PYTEST_WORKERS: 0
if: ${{ matrix.pattern == '' && (always() && steps.build.outcome == 'success')}}
macos-windows:
- timeout-minutes: 180
+ timeout-minutes: 90
strategy:
matrix:
os: [macos-latest, windows-latest]
- env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml]
+ env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml]
fail-fast: false
runs-on: ${{ matrix.os }}
name: ${{ format('{0} {1}', matrix.os, matrix.env_file) }}
@@ -199,7 +232,7 @@ jobs:
steps:
- name: Checkout
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
@@ -235,12 +268,14 @@ jobs:
git -c user.email="you@example.com" merge --no-commit my_ref_name
fi
- name: Build environment and Run Tests
+ # https://github.com/numpy/numpy/issues/24703#issuecomment-1722379388
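+        # (numpy is installed from source below with BLAS disabled via -Dallow-noblas; see the issue linked above)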
run: |
/opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev
. ~/virtualenvs/pandas-dev/bin/activate
- python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.0.1 meson-python==0.13.1
- python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1
- python -m pip install --no-cache-dir --no-build-isolation -e .
+ python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1
+ python -m pip install numpy --config-settings=setup-args="-Dallow-noblas=true"
+ python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1
+ python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror"
python -m pip list --no-cache-dir
export PANDAS_CI=1
python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml
@@ -276,9 +311,9 @@ jobs:
run: |
/opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev
. ~/virtualenvs/pandas-dev/bin/activate
- python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.0.1
- python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1
- python -m pip install --no-cache-dir --no-build-isolation -e .
+ python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1
+ python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1
+ python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror"
python -m pip list --no-cache-dir
- name: Run Tests
@@ -311,18 +346,17 @@ jobs:
# To freeze this file, uncomment out the ``if: false`` condition, and migrate the jobs
# to the corresponding posix/windows-macos/sdist etc. workflows.
# Feel free to modify this comment as necessary.
- #if: false # Uncomment this to freeze the workflow, comment it to unfreeze
+ if: false # Uncomment this to freeze the workflow, comment it to unfreeze
+ defaults:
+ run:
+ shell: bash -eou pipefail {0}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
- # TODO: Disable macOS for now, Github Actions bug where python is not
- # symlinked correctly to 3.12
- # xref https://github.com/actions/setup-python/issues/701
- #os: [ubuntu-22.04, macOS-latest, windows-latest]
- os: [ubuntu-22.04, windows-latest]
+ os: [ubuntu-22.04, macOS-latest, windows-latest]
- timeout-minutes: 180
+ timeout-minutes: 90
concurrency:
#https://github.community/t/concurrecy-not-work-for-push/183068/7
@@ -336,31 +370,24 @@ jobs:
PYTEST_TARGET: pandas
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Set up Python Dev Version
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v5
with:
python-version: '3.12-dev'
- - name: Install dependencies
+ - name: Build Environment
run: |
python --version
- python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.0.1 meson-python==0.13.1
+ python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1
python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy
python -m pip install versioneer[toml]
- python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17
+ python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov
+ python -m pip install -ve . --no-build-isolation --no-index --no-deps --config-settings=setup-args="--werror"
python -m pip list
- - name: Build Pandas
- run: |
- python -m pip install -ve . --no-build-isolation --no-index
-
- - name: Build Version
- run: |
- python -c "import pandas; pandas.show_versions();"
-
- - name: Test
+ - name: Run Tests
uses: ./.github/actions/run-tests
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index f486a57bdb67d..841559c8e9799 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -48,12 +48,12 @@ jobs:
sdist_file: ${{ steps.save-path.outputs.sdist_name }}
steps:
- name: Checkout pandas
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Set up Python
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v5
with:
python-version: '3.11'
@@ -62,7 +62,7 @@ jobs:
python -m pip install build
python -m build --sdist
- - uses: actions/upload-artifact@v3
+ - uses: actions/upload-artifact@v4
with:
name: sdist
path: ./dist/*
@@ -97,14 +97,13 @@ jobs:
- [macos-12, macosx_*]
- [windows-2022, win_amd64]
# TODO: support PyPy?
- # TODO: Enable Python 3.12 wheels when numpy releases a version that supports Python 3.12
- python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"]]#, ["cp312", "3.12"]]
+ python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"]]
env:
IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }}
IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
steps:
- name: Checkout pandas
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
@@ -116,7 +115,7 @@ jobs:
# removes unnecessary files from the release
- name: Download sdist (not macOS)
#if: ${{ matrix.buildplat[1] != 'macosx_*' }}
- uses: actions/download-artifact@v3
+ uses: actions/download-artifact@v4
with:
name: sdist
path: ./dist
@@ -138,20 +137,35 @@ jobs:
shell: bash -el {0}
run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV"
- - name: Build wheels
- uses: pypa/cibuildwheel@v2.14.1
+ - name: Build normal wheels
+ if: ${{ (env.IS_SCHEDULE_DISPATCH != 'true' || env.IS_PUSH == 'true') }}
+ uses: pypa/cibuildwheel@v2.16.2
with:
package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }}
env:
CIBW_PRERELEASE_PYTHONS: True
CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }}
+ - name: Build nightly wheels (with NumPy pre-release)
+ if: ${{ (env.IS_SCHEDULE_DISPATCH == 'true' && env.IS_PUSH != 'true') }}
+ uses: pypa/cibuildwheel@v2.16.2
+ with:
+ package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }}
+ env:
+          # The nightly wheels should be built with the NumPy 2.0 pre-releases,
+          # which requires the additional index URL.
+ CIBW_ENVIRONMENT: PIP_EXTRA_INDEX_URL=https://pypi.anaconda.org/scientific-python-nightly-wheels/simple
+ CIBW_PRERELEASE_PYTHONS: True
+ CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }}
+
- name: Set up Python
uses: mamba-org/setup-micromamba@v1
with:
environment-name: wheel-env
+ # Use a fixed Python, since we might have an unreleased Python not
+ # yet present on conda-forge
create-args: >-
- python=${{ matrix.python[1] }}
+ python=3.11
anaconda-client
wheel
cache-downloads: true
@@ -167,14 +181,15 @@ jobs:
shell: pwsh
run: |
$TST_CMD = @"
- python -m pip install pytz six numpy python-dateutil tzdata>=2022.1 hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17;
- python -m pip install --find-links=pandas\wheelhouse --no-index pandas;
+ python -m pip install hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0;
+ python -m pip install `$(Get-Item pandas\wheelhouse\*.whl);
python -c `'import pandas as pd; pd.test(extra_args=[\"`\"--no-strict-data-files`\"\", \"`\"-m not clipboard and not single_cpu and not slow and not network and not db`\"\"])`';
"@
- docker pull python:${{ matrix.python[1] }}-windowsservercore
- docker run --env PANDAS_CI='1' -v ${PWD}:C:\pandas python:${{ matrix.python[1] }}-windowsservercore powershell -Command $TST_CMD
+ # add rc to the end of the image name if the Python version is unreleased
+ docker pull python:${{ matrix.python[1] == '3.12' && '3.12-rc' || format('{0}-windowsservercore', matrix.python[1]) }}
+ docker run --env PANDAS_CI='1' -v ${PWD}:C:\pandas python:${{ matrix.python[1] == '3.12' && '3.12-rc' || format('{0}-windowsservercore', matrix.python[1]) }} powershell -Command $TST_CMD
- - uses: actions/upload-artifact@v3
+ - uses: actions/upload-artifact@v4
with:
name: ${{ matrix.python[0] }}-${{ startsWith(matrix.buildplat[1], 'macosx') && 'macosx' || matrix.buildplat[1] }}
path: ./wheelhouse/*.whl
diff --git a/.gitignore b/.gitignore
index cd22c2bb8cb5b..a188e216d9f70 100644
--- a/.gitignore
+++ b/.gitignore
@@ -40,6 +40,7 @@
MANIFEST
compile_commands.json
debug
+.debug
# Python files #
################
@@ -104,10 +105,11 @@ scikits
# Generated Sources #
#####################
!skts.c
-!np_datetime.c
-!np_datetime_strings.c
*.c
*.cpp
+!pandas/_libs/src/**/*.c
+!pandas/_libs/src/**/*.h
+!pandas/_libs/include/**/*.h
# Unit / Performance Testing #
##############################
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 90627216a1354..4b02ad7cf886f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -20,11 +20,11 @@ ci:
repos:
- repo: https://github.com/hauntsaninja/black-pre-commit-mirror
# black compiled with mypyc
- rev: 23.7.0
+ rev: 23.11.0
hooks:
- id: black
- repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.0.282
+ rev: v0.1.6
hooks:
- id: ruff
args: [--exit-non-zero-on-fix]
@@ -32,27 +32,29 @@ repos:
# TODO: remove autofixe-only rules when they are checked by ruff
name: ruff-selected-autofixes
alias: ruff-selected-autofixes
- args: [--select, "ANN001,ANN204", --fix-only, --exit-non-zero-on-fix]
+ files: ^pandas
+ exclude: ^pandas/tests
+ args: [--select, "ANN001,ANN2", --fix-only, --exit-non-zero-on-fix]
- repo: https://github.com/jendrikseipp/vulture
- rev: 'v2.7'
+ rev: 'v2.10'
hooks:
- id: vulture
entry: python scripts/run_vulture.py
pass_filenames: true
require_serial: false
- repo: https://github.com/codespell-project/codespell
- rev: v2.2.5
+ rev: v2.2.6
hooks:
- id: codespell
types_or: [python, rst, markdown, cython, c]
additional_dependencies: [tomli]
- repo: https://github.com/MarcoGorelli/cython-lint
- rev: v0.15.0
+ rev: v0.16.0
hooks:
- id: cython-lint
- id: double-quote-cython-strings
- repo: https://github.com/pre-commit/pre-commit-hooks
- rev: v4.4.0
+ rev: v4.5.0
hooks:
- id: check-ast
- id: check-case-conflict
@@ -70,21 +72,8 @@ repos:
- id: fix-encoding-pragma
args: [--remove]
- id: trailing-whitespace
-- repo: https://github.com/cpplint/cpplint
- rev: 1.6.1
- hooks:
- - id: cpplint
- exclude: ^pandas/_libs/include/pandas/vendored/klib
- args: [
- --quiet,
- '--extensions=c,h',
- '--headers=h',
- --recursive,
- --linelength=88,
- '--filter=-readability/casting,-runtime/int,-build/include_subdir,-readability/fn_size'
- ]
- repo: https://github.com/pylint-dev/pylint
- rev: v3.0.0a6
+ rev: v3.0.1
hooks:
- id: pylint
stages: [manual]
@@ -107,7 +96,7 @@ repos:
hooks:
- id: isort
- repo: https://github.com/asottile/pyupgrade
- rev: v3.10.1
+ rev: v3.15.0
hooks:
- id: pyupgrade
args: [--py39-plus]
@@ -124,9 +113,16 @@ repos:
types: [text] # overwrite types: [rst]
types_or: [python, rst]
- repo: https://github.com/sphinx-contrib/sphinx-lint
- rev: v0.6.7
+ rev: v0.9.1
hooks:
- id: sphinx-lint
+- repo: https://github.com/pre-commit/mirrors-clang-format
+ rev: v17.0.6
+ hooks:
+ - id: clang-format
+ files: ^pandas/_libs/src|^pandas/_libs/include
+ args: [-i]
+ types_or: [c, c++]
- repo: local
hooks:
- id: pyright
@@ -138,11 +134,11 @@ repos:
types: [python]
stages: [manual]
additional_dependencies: &pyright_dependencies
- - pyright@1.1.318
+ - pyright@1.1.339
- id: pyright
# note: assumes python env is setup and activated
name: pyright reportGeneralTypeIssues
- entry: pyright --skipunannotated -p pyright_reportGeneralTypeIssues.json --level warning
+ entry: pyright -p pyright_reportGeneralTypeIssues.json --level warning
language: node
pass_filenames: false
types: [python]
@@ -246,8 +242,9 @@ repos:
# pytest raises without context
|\s\ pytest.raises
+ # TODO
# pytest.warns (use tm.assert_produces_warning instead)
- |pytest\.warns
+ # |pytest\.warns
# os.remove
|os\.remove
@@ -361,18 +358,6 @@ repos:
files: ^pandas/
exclude: ^(pandas/_libs/|pandas/tests/|pandas/errors/__init__.py$|pandas/_version.py)
types: [python]
- - id: future-annotations
- name: import annotations from __future__
- entry: 'from __future__ import annotations'
- language: pygrep
- args: [--negate]
- files: ^pandas/
- types: [python]
- exclude: |
- (?x)
- /(__init__\.py)|(api\.py)|(_version\.py)|(testing\.py)|(conftest\.py)$
- |/tests/
- |/_testing/
- id: check-test-naming
name: check that test names start with 'test'
entry: python -m scripts.check_test_naming
diff --git a/LICENSES/BOTTLENECK_LICENCE b/LICENSES/BOTTLENECK_LICENCE
new file mode 100644
index 0000000000000..f4bdbb1647ee6
--- /dev/null
+++ b/LICENSES/BOTTLENECK_LICENCE
@@ -0,0 +1,25 @@
+Copyright (c) 2010-2019 Keith Goodman
+Copyright (c) 2019 Bottleneck Developers
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/LICENSES/DATEUTIL_LICENSE b/LICENSES/DATEUTIL_LICENSE
index 6053d35cfc60b..1e65815cf0b31 100644
--- a/LICENSES/DATEUTIL_LICENSE
+++ b/LICENSES/DATEUTIL_LICENSE
@@ -51,4 +51,4 @@ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-The above BSD License Applies to all code, even that also covered by Apache 2.0.
+The above BSD License Applies to all code, even that also covered by Apache 2.0.
\ No newline at end of file
diff --git a/LICENSES/KLIB_LICENSE b/LICENSES/KLIB_LICENSE
index 0a996fae3360f..2de13e402643b 100644
--- a/LICENSES/KLIB_LICENSE
+++ b/LICENSES/KLIB_LICENSE
@@ -20,4 +20,4 @@ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+SOFTWARE.
\ No newline at end of file
diff --git a/LICENSES/MUSL_LICENSE b/LICENSES/MUSL_LICENSE
index a8833d4bc4744..5ff71b5f5a7fe 100644
--- a/LICENSES/MUSL_LICENSE
+++ b/LICENSES/MUSL_LICENSE
@@ -1,7 +1,7 @@
musl as a whole is licensed under the following standard MIT license:
----------------------------------------------------------------------
-Copyright © 2005-2014 Rich Felker, et al.
+Copyright © 2005-2020 Rich Felker, et al.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
@@ -25,37 +25,88 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
Authors/contributors include:
+A. Wilcox
+Ada Worcester
+Alex Dowad
+Alex Suykov
+Alexander Monakov
+Andre McCurdy
+Andrew Kelley
Anthony G. Basile
+Aric Belsito
Arvid Picciani
+Bartosz Brachaczek
+Benjamin Peterson
Bobby Bingham
Boris Brezillon
Brent Cook
Chris Spiegel
Clément Vasseur
+Daniel Micay
+Daniel Sabogal
+Daurnimator
+David Carlier
+David Edelsohn
+Denys Vlasenko
+Dmitry Ivanov
+Dmitry V. Levin
+Drew DeVault
Emil Renner Berthing
+Fangrui Song
+Felix Fietkau
+Felix Janda
+Gianluca Anzolin
+Hauke Mehrtens
+He X
Hiltjo Posthuma
Isaac Dunham
+Jaydeep Patil
Jens Gustedt
Jeremy Huntwork
+Jo-Philipp Wich
+Joakim Sindholt
John Spencer
+Julien Ramseier
Justin Cormack
+Kaarle Ritvanen
+Khem Raj
+Kylie McClain
+Leah Neukirchen
Luca Barbato
Luka Perkov
M Farkas-Dyck (Strake)
+Mahesh Bodapati
+Markus Wichmann
+Masanori Ogino
+Michael Clark
Michael Forney
+Mikhail Kremnyov
+Natanael Copa
Nicholas J. Kain
orc
Pascal Cuoq
+Patrick Oppenlander
+Petr Hosek
+Petr Skocik
Pierre Carrier
+Reini Urban
Rich Felker
Richard Pennington
+Ryan Fairfax
+Samuel Holland
+Segev Finer
+Shiz
sin
Solar Designer
Stefan Kristiansson
+Stefan O'Rear
Szabolcs Nagy
Timo Teräs
+Trutz Behn
Valentin Ochs
+Will Dietz
William Haddon
+William Pitcock
Portions of this software are derived from third-party works licensed
under terms compatible with the above MIT license:
@@ -71,18 +122,22 @@ Copyright © 1993,2004 Sun Microsystems or
Copyright © 2003-2011 David Schultz or
Copyright © 2003-2009 Steven G. Kargl or
Copyright © 2003-2009 Bruce D. Evans or
-Copyright © 2008 Stephen L. Moshier
+Copyright © 2008 Stephen L. Moshier or
+Copyright © 2017-2018 Arm Limited
and labelled as such in comments in the individual source files. All
have been licensed under extremely permissive terms.
-The ARM memcpy code (src/string/armel/memcpy.s) is Copyright © 2008
+The ARM memcpy code (src/string/arm/memcpy.S) is Copyright © 2008
The Android Open Source Project and is licensed under a two-clause BSD
license. It was taken from Bionic libc, used on Android.
-The implementation of DES for crypt (src/misc/crypt_des.c) is
+The AArch64 memcpy and memset code (src/string/aarch64/*) are
+Copyright © 1999-2019, Arm Limited.
+
+The implementation of DES for crypt (src/crypt/crypt_des.c) is
Copyright © 1994 David Burren. It is licensed under a BSD license.
-The implementation of blowfish crypt (src/misc/crypt_blowfish.c) was
+The implementation of blowfish crypt (src/crypt/crypt_blowfish.c) was
originally written by Solar Designer and placed into the public
domain. The code also comes with a fallback permissive license for use
in jurisdictions that may not recognize the public domain.
@@ -90,22 +145,17 @@ in jurisdictions that may not recognize the public domain.
The smoothsort implementation (src/stdlib/qsort.c) is Copyright © 2011
Valentin Ochs and is licensed under an MIT-style license.
-The BSD PRNG implementation (src/prng/random.c) and XSI search API
-(src/search/*.c) functions are Copyright © 2011 Szabolcs Nagy and
-licensed under following terms: "Permission to use, copy, modify,
-and/or distribute this code for any purpose with or without fee is
-hereby granted. There is no warranty."
-
-The x86_64 port was written by Nicholas J. Kain. Several files (crt)
-were released into the public domain; others are licensed under the
-standard MIT license terms at the top of this file. See individual
-files for their copyright status.
+The x86_64 port was written by Nicholas J. Kain and is licensed under
+the standard MIT terms.
The mips and microblaze ports were originally written by Richard
Pennington for use in the ellcc project. The original code was adapted
by Rich Felker for build system and code conventions during upstream
integration. It is licensed under the standard MIT terms.
+The mips64 port was contributed by Imagination Technologies and is
+licensed under the standard MIT terms.
+
The powerpc port was also originally written by Richard Pennington,
and later supplemented and integrated by John Spencer. It is licensed
under the standard MIT terms.
@@ -118,15 +168,26 @@ can be found in the git version control history of the project. The
omission of copyright and license comments in each file is in the
interest of source tree size.
-All public header files (include/* and arch/*/bits/*) should be
-treated as Public Domain as they intentionally contain no content
-which can be covered by copyright. Some source modules may fall in
-this category as well. If you believe that a file is so trivial that
-it should be in the Public Domain, please contact the authors and
-request an explicit statement releasing it from copyright.
+In addition, permission is hereby granted for all public header files
+(include/* and arch/*/bits/*) and crt files intended to be linked into
+applications (crt/*, ldso/dlstart.c, and arch/*/crt_arch.h) to omit
+the copyright notice and permission notice otherwise required by the
+license, and to use these files without any requirement of
+attribution. These files include substantial contributions from:
+
+Bobby Bingham
+John Spencer
+Nicholas J. Kain
+Rich Felker
+Richard Pennington
+Stefan Kristiansson
+Szabolcs Nagy
-The following files are trivial, believed not to be copyrightable in
-the first place, and hereby explicitly released to the Public Domain:
+all of whom have explicitly granted such permission.
-All public headers: include/*, arch/*/bits/*
-Startup files: crt/*
+This file previously contained text expressing a belief that most of
+the files covered by the above exception were sufficiently trivial not
+to be subject to copyright, resulting in confusion over whether it
+negated the permissions granted in the license. In the spirit of
+permissive licensing, and of not having licensing issues being an
+obstacle to adoption, that text has been removed.
\ No newline at end of file
diff --git a/LICENSES/NUMPY_LICENSE b/LICENSES/NUMPY_LICENSE
index 7e972cff80759..f2d647bf0bc48 100644
--- a/LICENSES/NUMPY_LICENSE
+++ b/LICENSES/NUMPY_LICENSE
@@ -1,4 +1,4 @@
-Copyright (c) 2005-2011, NumPy Developers.
+Copyright (c) 2005-2023, NumPy Developers.
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -27,4 +27,4 @@ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/LICENSES/OTHER b/LICENSES/OTHER
deleted file mode 100644
index e156152990cc4..0000000000000
--- a/LICENSES/OTHER
+++ /dev/null
@@ -1,57 +0,0 @@
-Bottleneck license
-------------------
-
-Copyright (c) 2010-2012 Archipel Asset Management AB.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-Pyperclip v1.3 license
-----------------------
-
-Copyright (c) 2010, Albert Sweigart
-All rights reserved.
-
-BSD-style license:
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the pyperclip nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY Albert Sweigart "AS IS" AND ANY
-EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL Albert Sweigart BE LIABLE FOR ANY
-DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/LICENSES/PACKAGING_LICENSE b/LICENSES/PACKAGING_LICENSE
index 4216ea1ce2379..2bfcd5297c470 100644
--- a/LICENSES/PACKAGING_LICENSE
+++ b/LICENSES/PACKAGING_LICENSE
@@ -199,4 +199,4 @@ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/LICENSES/PSF_LICENSE b/LICENSES/PSF_LICENSE
index 5cdb01e8d24af..f26bcf4d2de6e 100644
--- a/LICENSES/PSF_LICENSE
+++ b/LICENSES/PSF_LICENSE
@@ -2,25 +2,24 @@ A. HISTORY OF THE SOFTWARE
==========================
Python was created in the early 1990s by Guido van Rossum at Stichting
-Mathematisch Centrum (CWI, see http://www.cwi.nl) in the Netherlands
+Mathematisch Centrum (CWI, see https://www.cwi.nl) in the Netherlands
as a successor of a language called ABC. Guido remains Python's
principal author, although it includes many contributions from others.
In 1995, Guido continued his work on Python at the Corporation for
-National Research Initiatives (CNRI, see http://www.cnri.reston.va.us)
+National Research Initiatives (CNRI, see https://www.cnri.reston.va.us)
in Reston, Virginia where he released several versions of the
software.
In May 2000, Guido and the Python core development team moved to
BeOpen.com to form the BeOpen PythonLabs team. In October of the same
-year, the PythonLabs team moved to Digital Creations (now Zope
-Corporation, see http://www.zope.com). In 2001, the Python Software
-Foundation (PSF, see http://www.python.org/psf/) was formed, a
-non-profit organization created specifically to own Python-related
-Intellectual Property. Zope Corporation is a sponsoring member of
-the PSF.
-
-All Python releases are Open Source (see http://www.opensource.org for
+year, the PythonLabs team moved to Digital Creations, which became
+Zope Corporation. In 2001, the Python Software Foundation (PSF, see
+https://www.python.org/psf/) was formed, a non-profit organization
+created specifically to own Python-related Intellectual Property.
+Zope Corporation was a sponsoring member of the PSF.
+
+All Python releases are Open Source (see https://opensource.org for
the Open Source Definition). Historically, most, but not all, Python
releases have also been GPL-compatible; the table below summarizes
the various releases.
@@ -36,34 +35,9 @@ the various releases.
2.1 2.0+1.6.1 2001 PSF no
2.0.1 2.0+1.6.1 2001 PSF yes
2.1.1 2.1+2.0.1 2001 PSF yes
- 2.2 2.1.1 2001 PSF yes
2.1.2 2.1.1 2002 PSF yes
2.1.3 2.1.2 2002 PSF yes
- 2.2.1 2.2 2002 PSF yes
- 2.2.2 2.2.1 2002 PSF yes
- 2.2.3 2.2.2 2003 PSF yes
- 2.3 2.2.2 2002-2003 PSF yes
- 2.3.1 2.3 2002-2003 PSF yes
- 2.3.2 2.3.1 2002-2003 PSF yes
- 2.3.3 2.3.2 2002-2003 PSF yes
- 2.3.4 2.3.3 2004 PSF yes
- 2.3.5 2.3.4 2005 PSF yes
- 2.4 2.3 2004 PSF yes
- 2.4.1 2.4 2005 PSF yes
- 2.4.2 2.4.1 2005 PSF yes
- 2.4.3 2.4.2 2006 PSF yes
- 2.4.4 2.4.3 2006 PSF yes
- 2.5 2.4 2006 PSF yes
- 2.5.1 2.5 2007 PSF yes
- 2.5.2 2.5.1 2008 PSF yes
- 2.5.3 2.5.2 2008 PSF yes
- 2.6 2.5 2008 PSF yes
- 2.6.1 2.6 2008 PSF yes
- 2.6.2 2.6.1 2009 PSF yes
- 2.6.3 2.6.2 2009 PSF yes
- 2.6.4 2.6.3 2009 PSF yes
- 2.6.5 2.6.4 2010 PSF yes
- 2.7 2.6 2010 PSF yes
+ 2.2 and above 2.1.1 2001-now PSF yes
Footnotes:
@@ -85,6 +59,17 @@ direction to make these releases possible.
B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON
===============================================================
+Python software and documentation are licensed under the
+Python Software Foundation License Version 2.
+
+Starting with Python 3.8.6, examples, recipes, and other code in
+the documentation are dual licensed under the PSF License Version 2
+and the Zero-Clause BSD license.
+
+Some software incorporated into Python is under different licenses.
+The licenses are listed with code falling under that license.
+
+
PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
--------------------------------------------
@@ -98,9 +83,10 @@ grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce,
analyze, test, perform and/or display publicly, prepare derivative works,
distribute, and otherwise use Python alone or in any derivative version,
provided, however, that PSF's License Agreement and PSF's notice of copyright,
-i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
-Python Software Foundation; All Rights Reserved" are retained in Python alone or
-in any derivative version prepared by Licensee.
+i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
+2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023 Python Software Foundation;
+All Rights Reserved" are retained in Python alone or in any derivative version
+prepared by Licensee.
3. In the event Licensee prepares a derivative work that is based on
or incorporates Python or any part thereof, and wants to make
@@ -205,9 +191,9 @@ version prepared by Licensee. Alternately, in lieu of CNRI's License
Agreement, Licensee may substitute the following text (omitting the
quotes): "Python 1.6.1 is made available subject to the terms and
conditions in CNRI's License Agreement. This Agreement together with
-Python 1.6.1 may be located on the Internet using the following
+Python 1.6.1 may be located on the internet using the following
unique, persistent identifier (known as a handle): 1895.22/1013. This
-Agreement may also be obtained from a proxy server on the Internet
+Agreement may also be obtained from a proxy server on the internet
using the following URL: http://hdl.handle.net/1895.22/1013".
3. In the event Licensee prepares a derivative work that is based on
@@ -277,3 +263,17 @@ FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ZERO-CLAUSE BSD LICENSE FOR CODE IN THE PYTHON DOCUMENTATION
+----------------------------------------------------------------------
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
+REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
+AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
+INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
+LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THIS SOFTWARE.
diff --git a/LICENSES/PYPERCLIP_LICENSE b/LICENSES/PYPERCLIP_LICENSE
new file mode 100644
index 0000000000000..07cc746cd5ad6
--- /dev/null
+++ b/LICENSES/PYPERCLIP_LICENSE
@@ -0,0 +1,27 @@
+Copyright (c) 2014, Al Sweigart
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+* Neither the name of the {organization} nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/LICENSES/PYUPGRADE_LICENSE b/LICENSES/PYUPGRADE_LICENSE
index 522fbe20b8991..edeac73dade04 100644
--- a/LICENSES/PYUPGRADE_LICENSE
+++ b/LICENSES/PYUPGRADE_LICENSE
@@ -16,4 +16,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
+THE SOFTWARE.
\ No newline at end of file
diff --git a/LICENSES/SAS7BDAT_LICENSE b/LICENSES/SAS7BDAT_LICENSE
index 8fbf194013e93..94b7fe934e85c 100644
--- a/LICENSES/SAS7BDAT_LICENSE
+++ b/LICENSES/SAS7BDAT_LICENSE
@@ -1,4 +1,4 @@
-Copyright (c) 2015 Jared Hobbs
+Copyright (c) 2015-2019 Jared Hobbs
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
@@ -16,4 +16,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+SOFTWARE.
\ No newline at end of file
diff --git a/LICENSES/SCIPY_LICENSE b/LICENSES/SCIPY_LICENSE
deleted file mode 100644
index d887ce5f9890f..0000000000000
--- a/LICENSES/SCIPY_LICENSE
+++ /dev/null
@@ -1,31 +0,0 @@
-Copyright (c) 2001, 2002 Enthought, Inc.
-All rights reserved.
-
-Copyright (c) 2003-2012 SciPy Developers.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- a. Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- b. Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- c. Neither the name of Enthought nor the names of the SciPy Developers
- may be used to endorse or promote products derived from this software
- without specific prior written permission.
-
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
-OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
-DAMAGE.
-
diff --git a/LICENSES/ULTRAJSON_LICENSE b/LICENSES/ULTRAJSON_LICENSE
index a905fb017d813..58fc9dfc0a35b 100644
--- a/LICENSES/ULTRAJSON_LICENSE
+++ b/LICENSES/ULTRAJSON_LICENSE
@@ -1,21 +1,22 @@
-Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom
+Developed by ESN, an Electronic Arts Inc. studio.
+Copyright (c) 2014, Electronic Arts Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the ESN Social Software AB nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of ESN, Electronic Arts Inc. nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE
+DISCLAIMED. IN NO EVENT SHALL ELECTRONIC ARTS INC. BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
@@ -23,12 +24,91 @@ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+----
Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
https://github.com/client9/stringencoders
-Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved.
-Numeric decoder derived from TCL library
-http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
+ Copyright 2005, 2006, 2007
+ Nick Galbreath -- nickg [at] modp [dot] com
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+
+ Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ Neither the name of the modp.com nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ This is the standard "new" BSD license:
+ http://www.opensource.org/licenses/bsd-license.php
+
+https://github.com/client9/stringencoders/blob/cfd5c1507325ae497ea9bacdacba12c0ffd79d30/COPYING
+
+----
+
+Numeric decoder derived from from TCL library
+https://opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
* Copyright (c) 1988-1993 The Regents of the University of California.
* Copyright (c) 1994 Sun Microsystems, Inc.
+
+ This software is copyrighted by the Regents of the University of
+ California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
+ Corporation and other parties. The following terms apply to all files
+ associated with the software unless explicitly disclaimed in
+ individual files.
+
+ The authors hereby grant permission to use, copy, modify, distribute,
+ and license this software and its documentation for any purpose, provided
+ that existing copyright notices are retained in all copies and that this
+ notice is included verbatim in any distributions. No written agreement,
+ license, or royalty fee is required for any of the authorized uses.
+ Modifications to this software may be copyrighted by their authors
+ and need not follow the licensing terms described here, provided that
+ the new terms are clearly indicated on the first page of each file where
+ they apply.
+
+ IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
+ FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
+ ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
+ DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+
+ THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
+ INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE
+ IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
+ NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
+ MODIFICATIONS.
+
+ GOVERNMENT USE: If you are acquiring this software on behalf of the
+ U.S. government, the Government shall have only "Restricted Rights"
+ in the software and related documentation as defined in the Federal
+ Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you
+ are acquiring the software on behalf of the Department of Defense, the
+ software shall be classified as "Commercial Computer Software" and the
+ Government shall have only "Restricted Rights" as defined in Clause
+ 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the
+ authors grant the U.S. Government and others acting in its behalf
+ permission to use and distribute the software in accordance with the
+ terms specified in this license.
\ No newline at end of file
diff --git a/LICENSES/XARRAY_LICENSE b/LICENSES/XARRAY_LICENSE
index 6bafeb9d3d80e..8405e89a0b120 100644
--- a/LICENSES/XARRAY_LICENSE
+++ b/LICENSES/XARRAY_LICENSE
@@ -1,7 +1,3 @@
-Copyright 2014-2019, xarray Developers
-
---------------------------------------------------------------------------------
-
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
@@ -192,4 +188,4 @@ third-party archives.
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
- limitations under the License.
+ limitations under the License.
\ No newline at end of file
diff --git a/README.md b/README.md
index 8ea473beb107e..6fa20d237babe 100644
--- a/README.md
+++ b/README.md
@@ -130,23 +130,17 @@ In the `pandas` directory (same one where you found this file after
cloning the git repo), execute:
```sh
-python setup.py install
+pip install .
```
or for installing in [development mode](https://pip.pypa.io/en/latest/cli/pip_install/#install-editable):
```sh
-python -m pip install -e . --no-build-isolation --no-use-pep517
+python -m pip install -ve . --no-build-isolation --config-settings=editable-verbose=true
```
-or alternatively
-
-```sh
-python setup.py develop
-```
-
-See the full instructions for [installing from source](https://pandas.pydata.org/pandas-docs/stable/getting_started/install.html#installing-from-source).
+See the full instructions for [installing from source](https://pandas.pydata.org/docs/dev/development/contributing_environment.html).
## License
[BSD 3](LICENSE)
diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json
index 810764754b7e1..9b0dc14fe6747 100644
--- a/asv_bench/asv.conf.json
+++ b/asv_bench/asv.conf.json
@@ -41,7 +41,7 @@
// pip (with all the conda available packages installed first,
// followed by the pip installed packages).
"matrix": {
- "Cython": ["0.29.33"],
+ "Cython": ["3.0.5"],
"matplotlib": [],
"sqlalchemy": [],
"scipy": [],
diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
index 2584e1f13853a..933e8fbc175d8 100644
--- a/asv_bench/benchmarks/algorithms.py
+++ b/asv_bench/benchmarks/algorithms.py
@@ -4,8 +4,6 @@
import pandas as pd
-from .pandas_vb_common import tm
-
for imp in ["pandas.util", "pandas.tools.hashing"]:
try:
hashing = import_module(imp)
@@ -19,9 +17,9 @@ class Factorize:
[True, False],
[True, False],
[
- "int",
- "uint",
- "float",
+ "int64",
+ "uint64",
+ "float64",
"object",
"object_str",
"datetime64[ns]",
@@ -35,28 +33,27 @@ class Factorize:
def setup(self, unique, sort, dtype):
N = 10**5
- string_index = tm.makeStringIndex(N)
- string_arrow = None
- if dtype == "string[pyarrow]":
- try:
- string_arrow = pd.array(string_index, dtype="string[pyarrow]")
- except ImportError:
- raise NotImplementedError
-
- data = {
- "int": pd.Index(np.arange(N), dtype="int64"),
- "uint": pd.Index(np.arange(N), dtype="uint64"),
- "float": pd.Index(np.random.randn(N), dtype="float64"),
- "object_str": string_index,
- "object": pd.Index(np.arange(N), dtype="object"),
- "datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N),
- "datetime64[ns, tz]": pd.date_range(
- "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
- ),
- "Int64": pd.array(np.arange(N), dtype="Int64"),
- "boolean": pd.array(np.random.randint(0, 2, N), dtype="boolean"),
- "string[pyarrow]": string_arrow,
- }[dtype]
+
+ if dtype in ["int64", "uint64", "Int64", "object"]:
+ data = pd.Index(np.arange(N), dtype=dtype)
+ elif dtype == "float64":
+ data = pd.Index(np.random.randn(N), dtype=dtype)
+ elif dtype == "boolean":
+ data = pd.array(np.random.randint(0, 2, N), dtype=dtype)
+ elif dtype == "datetime64[ns]":
+ data = pd.date_range("2011-01-01", freq="h", periods=N)
+ elif dtype == "datetime64[ns, tz]":
+ data = pd.date_range("2011-01-01", freq="h", periods=N, tz="Asia/Tokyo")
+ elif dtype == "object_str":
+ data = pd.Index([f"i-{i}" for i in range(N)], dtype=object)
+ elif dtype == "string[pyarrow]":
+ data = pd.array(
+ pd.Index([f"i-{i}" for i in range(N)], dtype=object),
+ dtype="string[pyarrow]",
+ )
+ else:
+ raise NotImplementedError
+
if not unique:
data = data.repeat(5)
self.data = data
@@ -72,22 +69,35 @@ class Duplicated:
params = [
[True, False],
["first", "last", False],
- ["int", "uint", "float", "string", "datetime64[ns]", "datetime64[ns, tz]"],
+ [
+ "int64",
+ "uint64",
+ "float64",
+ "string",
+ "datetime64[ns]",
+ "datetime64[ns, tz]",
+ "timestamp[ms][pyarrow]",
+ "duration[s][pyarrow]",
+ ],
]
param_names = ["unique", "keep", "dtype"]
def setup(self, unique, keep, dtype):
N = 10**5
- data = {
- "int": pd.Index(np.arange(N), dtype="int64"),
- "uint": pd.Index(np.arange(N), dtype="uint64"),
- "float": pd.Index(np.random.randn(N), dtype="float64"),
- "string": tm.makeStringIndex(N),
- "datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N),
- "datetime64[ns, tz]": pd.date_range(
- "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
- ),
- }[dtype]
+ if dtype in ["int64", "uint64"]:
+ data = pd.Index(np.arange(N), dtype=dtype)
+ elif dtype == "float64":
+ data = pd.Index(np.random.randn(N), dtype="float64")
+ elif dtype == "string":
+ data = pd.Index([f"i-{i}" for i in range(N)], dtype=object)
+ elif dtype == "datetime64[ns]":
+ data = pd.date_range("2011-01-01", freq="h", periods=N)
+ elif dtype == "datetime64[ns, tz]":
+ data = pd.date_range("2011-01-01", freq="h", periods=N, tz="Asia/Tokyo")
+ elif dtype in ["timestamp[ms][pyarrow]", "duration[s][pyarrow]"]:
+ data = pd.Index(np.arange(N), dtype=dtype)
+ else:
+ raise NotImplementedError
if not unique:
data = data.repeat(5)
self.idx = data
@@ -127,7 +137,9 @@ def setup_cache(self):
df = pd.DataFrame(
{
"strings": pd.Series(
- tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=N))
+ pd.Index([f"i-{i}" for i in range(10000)], dtype=object).take(
+ np.random.randint(0, 10000, size=N)
+ )
),
"floats": np.random.randn(N),
"ints": np.arange(N),
@@ -165,21 +177,22 @@ class Quantile:
params = [
[0, 0.5, 1],
["linear", "nearest", "lower", "higher", "midpoint"],
- ["float", "int", "uint"],
+ ["float64", "int64", "uint64"],
]
param_names = ["quantile", "interpolation", "dtype"]
def setup(self, quantile, interpolation, dtype):
N = 10**5
- data = {
- "int": np.arange(N),
- "uint": np.arange(N).astype(np.uint64),
- "float": np.random.randn(N),
- }
- self.idx = pd.Series(data[dtype].repeat(5))
+ if dtype in ["int64", "uint64"]:
+ data = np.arange(N, dtype=dtype)
+ elif dtype == "float64":
+ data = np.random.randn(N)
+ else:
+ raise NotImplementedError
+ self.ser = pd.Series(data.repeat(5))
def time_quantile(self, quantile, interpolation, dtype):
- self.idx.quantile(quantile, interpolation=interpolation)
+ self.ser.quantile(quantile, interpolation=interpolation)
class SortIntegerArray:
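
A pattern that recurs across the benchmark hunks in this diff is dropping the private `pandas._testing` helper `tm.makeStringIndex(N)` in favour of an explicit object-dtype `Index` built from f-strings. A minimal sketch of the equivalent construction, using only public pandas API (names here are illustrative, not from the diff):

```python
import numpy as np
import pandas as pd

N = 1000

# Object-dtype string index built without pandas._testing, matching the
# replacement used throughout the updated benchmarks.
string_index = pd.Index([f"i-{i}" for i in range(N)], dtype=object)

# Typical benchmark use: sample with repeats to build a string column.
strings = pd.Series(string_index.take(np.random.randint(0, N, size=10 * N)))
```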
diff --git a/asv_bench/benchmarks/algos/isin.py b/asv_bench/benchmarks/algos/isin.py
index ac79ab65cea81..2b3d32fb579dc 100644
--- a/asv_bench/benchmarks/algos/isin.py
+++ b/asv_bench/benchmarks/algos/isin.py
@@ -8,8 +8,6 @@
date_range,
)
-from ..pandas_vb_common import tm
-
class IsIn:
params = [
@@ -60,7 +58,9 @@ def setup(self, dtype):
elif dtype in ["str", "string[python]", "string[pyarrow]"]:
try:
- self.series = Series(tm.makeStringIndex(N), dtype=dtype)
+ self.series = Series(
+ Index([f"i-{i}" for i in range(N)], dtype=object), dtype=dtype
+ )
except ImportError:
raise NotImplementedError
self.values = list(self.series[:2])
@@ -247,7 +247,7 @@ def setup(self, series_type, vals_type):
elif series_type == "long":
ser_vals = np.arange(N_many)
elif series_type == "long_floats":
- ser_vals = np.arange(N_many, dtype=np.float_)
+ ser_vals = np.arange(N_many, dtype=np.float64)
self.series = Series(ser_vals).astype(object)
@@ -258,7 +258,7 @@ def setup(self, series_type, vals_type):
elif vals_type == "long":
values = np.arange(N_many)
elif vals_type == "long_floats":
- values = np.arange(N_many, dtype=np.float_)
+ values = np.arange(N_many, dtype=np.float64)
self.values = values.astype(object)
diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py
index 4fd9740f184c8..6b1f75187f887 100644
--- a/asv_bench/benchmarks/arithmetic.py
+++ b/asv_bench/benchmarks/arithmetic.py
@@ -6,13 +6,12 @@
import pandas as pd
from pandas import (
DataFrame,
+ Index,
Series,
Timestamp,
date_range,
to_timedelta,
)
-import pandas._testing as tm
-from pandas.core.algorithms import checked_add_with_arr
from .pandas_vb_common import numeric_dtypes
@@ -262,7 +261,7 @@ class Timeseries:
def setup(self, tz):
N = 10**6
halfway = (N // 2) - 1
- self.s = Series(date_range("20010101", periods=N, freq="T", tz=tz))
+ self.s = Series(date_range("20010101", periods=N, freq="min", tz=tz))
self.ts = self.s[halfway]
self.s2 = Series(date_range("20010101", periods=N, freq="s", tz=tz))
@@ -323,8 +322,10 @@ class IndexArithmetic:
def setup(self, dtype):
N = 10**6
- indexes = {"int": "makeIntIndex", "float": "makeFloatIndex"}
- self.index = getattr(tm, indexes[dtype])(N)
+ if dtype == "float":
+ self.index = Index(np.arange(N), dtype=np.float64)
+ elif dtype == "int":
+ self.index = Index(np.arange(N), dtype=np.int64)
def time_add(self, dtype):
self.index + 2
@@ -387,42 +388,6 @@ def time_add_timedeltas(self, df):
df["timedelta"] + df["timedelta"]
-class AddOverflowScalar:
- params = [1, -1, 0]
- param_names = ["scalar"]
-
- def setup(self, scalar):
- N = 10**6
- self.arr = np.arange(N)
-
- def time_add_overflow_scalar(self, scalar):
- checked_add_with_arr(self.arr, scalar)
-
-
-class AddOverflowArray:
- def setup(self):
- N = 10**6
- self.arr = np.arange(N)
- self.arr_rev = np.arange(-N, 0)
- self.arr_mixed = np.array([1, -1]).repeat(N / 2)
- self.arr_nan_1 = np.random.choice([True, False], size=N)
- self.arr_nan_2 = np.random.choice([True, False], size=N)
-
- def time_add_overflow_arr_rev(self):
- checked_add_with_arr(self.arr, self.arr_rev)
-
- def time_add_overflow_arr_mask_nan(self):
- checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1)
-
- def time_add_overflow_b_mask_nan(self):
- checked_add_with_arr(self.arr, self.arr_mixed, b_mask=self.arr_nan_1)
-
- def time_add_overflow_both_arg_nan(self):
- checked_add_with_arr(
- self.arr, self.arr_mixed, arr_mask=self.arr_nan_1, b_mask=self.arr_nan_2
- )
-
-
hcal = pd.tseries.holiday.USFederalHolidayCalendar()
# These offsets currently raise a NotImplementedError with .apply_index()
non_apply = [
@@ -460,7 +425,7 @@ class OffsetArrayArithmetic:
def setup(self, offset):
N = 10000
- rng = date_range(start="1/1/2000", periods=N, freq="T")
+ rng = date_range(start="1/1/2000", periods=N, freq="min")
self.rng = rng
self.ser = Series(rng)
@@ -479,7 +444,7 @@ class ApplyIndex:
def setup(self, offset):
N = 10000
- rng = date_range(start="1/1/2000", periods=N, freq="T")
+ rng = date_range(start="1/1/2000", periods=N, freq="min")
self.rng = rng
def time_apply_index(self, offset):
@@ -491,7 +456,7 @@ class BinaryOpsMultiIndex:
param_names = ["func"]
def setup(self, func):
- array = date_range("20200101 00:00", "20200102 0:00", freq="S")
+ array = date_range("20200101 00:00", "20200102 0:00", freq="s")
level_0_names = [str(i) for i in range(30)]
index = pd.MultiIndex.from_product([level_0_names, array])
diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py
index 09c4acc0ab309..aefc6e6d3c307 100644
--- a/asv_bench/benchmarks/array.py
+++ b/asv_bench/benchmarks/array.py
@@ -31,9 +31,9 @@ def time_from_float_array(self):
class IntegerArray:
def setup(self):
N = 250_000
- self.values_integer = np.array([1, 0, 1, 0] * N)
- self.data = np.array([1, 2, 3, 4] * N, dtype="int64")
- self.mask = np.array([False, False, True, False] * N)
+ self.values_integer = np.tile(np.array([1, 0, 1, 0]), N)
+ self.data = np.tile(np.array([1, 2, 3, 4], dtype="int64"), N)
+ self.mask = np.tile(np.array([False, False, True, False]), N)
def time_constructor(self):
pd.arrays.IntegerArray(self.data, self.mask)
@@ -90,7 +90,7 @@ def time_setitem(self, multiple_chunks):
self.array[i] = "foo"
def time_setitem_list(self, multiple_chunks):
- indexer = list(range(0, 50)) + list(range(-1000, 0, 50))
+ indexer = list(range(50)) + list(range(-1000, 0, 50))
self.array[indexer] = ["foo"] * len(indexer)
def time_setitem_slice(self, multiple_chunks):
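
Several hunks (IntegerArray above, and the CategoricalSlicing and indexing-engine setups below) replace Python-level list repetition with `np.tile` / `np.repeat`, which build the array directly instead of materializing a large intermediate list. A small sketch of the equivalence, under the assumption that only the construction pattern matters here:

```python
import numpy as np

N = 4

# List repetition builds a Python list of len 4 * N first...
list_based = np.array([1, 2, 3, 4] * N, dtype="int64")

# ...whereas np.tile repeats the whole sequence and np.repeat repeats
# each element, as used in the updated setups.
tiled = np.tile(np.array([1, 2, 3, 4], dtype="int64"), N)  # 1,2,3,4,1,2,3,4,...
repeated = np.repeat(np.array([0, 1, 2]), N)               # 0,0,0,0,1,1,1,1,2,2,2,2

assert (list_based == tiled).all()
```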
diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
index 84d4a28d675d5..1716110b619d6 100644
--- a/asv_bench/benchmarks/categoricals.py
+++ b/asv_bench/benchmarks/categoricals.py
@@ -6,8 +6,6 @@
import pandas as pd
-from .pandas_vb_common import tm
-
try:
from pandas.api.types import union_categoricals
except ImportError:
@@ -189,7 +187,7 @@ def setup(self):
N = 10**5
ncats = 15
- self.s_str = pd.Series(tm.makeCategoricalIndex(N, ncats)).astype(str)
+ self.s_str = pd.Series(np.random.randint(0, ncats, size=N).astype(str))
self.s_str_cat = pd.Series(self.s_str, dtype="category")
with warnings.catch_warnings(record=True):
str_cat_type = pd.CategoricalDtype(set(self.s_str), ordered=True)
@@ -242,7 +240,7 @@ def time_categorical_series_is_monotonic_decreasing(self):
class Contains:
def setup(self):
N = 10**5
- self.ci = tm.makeCategoricalIndex(N)
+ self.ci = pd.CategoricalIndex(np.arange(N))
self.c = self.ci.values
self.key = self.ci.categories[0]
@@ -260,18 +258,16 @@ class CategoricalSlicing:
def setup(self, index):
N = 10**6
categories = ["a", "b", "c"]
- values = [0] * N + [1] * N + [2] * N
if index == "monotonic_incr":
- self.data = pd.Categorical.from_codes(values, categories=categories)
+ codes = np.repeat([0, 1, 2], N)
elif index == "monotonic_decr":
- self.data = pd.Categorical.from_codes(
- list(reversed(values)), categories=categories
- )
+ codes = np.repeat([2, 1, 0], N)
elif index == "non_monotonic":
- self.data = pd.Categorical.from_codes([0, 1, 2] * N, categories=categories)
+ codes = np.tile([0, 1, 2], N)
else:
raise ValueError(f"Invalid index param: {index}")
+ self.data = pd.Categorical.from_codes(codes, categories=categories)
self.scalar = 10000
self.list = list(range(10000))
self.cat_scalar = "b"
@@ -327,7 +323,7 @@ def time_sort_values(self):
class SearchSorted:
def setup(self):
N = 10**5
- self.ci = tm.makeCategoricalIndex(N).sort_values()
+ self.ci = pd.CategoricalIndex(np.arange(N)).sort_values()
self.c = self.ci.values
self.key = self.ci.categories[1]
diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py
index 2db00cc7f2ad9..77c9faf3d3a87 100644
--- a/asv_bench/benchmarks/ctors.py
+++ b/asv_bench/benchmarks/ctors.py
@@ -9,8 +9,6 @@
date_range,
)
-from .pandas_vb_common import tm
-
def no_change(arr):
return arr
@@ -115,7 +113,7 @@ def time_dtindex_from_index_with_series(self):
class MultiIndexConstructor:
def setup(self):
N = 10**4
- self.iterables = [tm.makeStringIndex(N), range(20)]
+ self.iterables = [Index([f"i-{i}" for i in range(N)], dtype=object), range(20)]
def time_multiindex_from_iterables(self):
MultiIndex.from_product(self.iterables)
diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py
index c33043c0eddc1..7f3429b5e3882 100644
--- a/asv_bench/benchmarks/dtypes.py
+++ b/asv_bench/benchmarks/dtypes.py
@@ -3,7 +3,10 @@
import numpy as np
import pandas as pd
-from pandas import DataFrame
+from pandas import (
+ DataFrame,
+ Index,
+)
import pandas._testing as tm
from pandas.api.types import (
is_extension_array_dtype,
@@ -73,8 +76,8 @@ class SelectDtypes:
def setup(self, dtype):
N, K = 5000, 50
- self.index = tm.makeStringIndex(N)
- self.columns = tm.makeStringIndex(K)
+ self.index = Index([f"i-{i}" for i in range(N)], dtype=object)
+ self.columns = Index([f"i-{i}" for i in range(K)], dtype=object)
def create_df(data):
return DataFrame(data, index=self.index, columns=self.columns)
diff --git a/asv_bench/benchmarks/eval.py b/asv_bench/benchmarks/eval.py
index 8a3d224c59a09..656d16a910a9f 100644
--- a/asv_bench/benchmarks/eval.py
+++ b/asv_bench/benchmarks/eval.py
@@ -44,7 +44,7 @@ class Query:
def setup(self):
N = 10**6
halfway = (N // 2) - 1
- index = pd.date_range("20010101", periods=N, freq="T")
+ index = pd.date_range("20010101", periods=N, freq="min")
s = pd.Series(index)
self.ts = s.iloc[halfway]
self.df = pd.DataFrame({"a": np.random.randn(N), "dates": index}, index=index)
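
Another change repeated throughout these benchmark files is switching frequency strings to the lowercase aliases (`"T"` → `"min"`, `"H"` → `"h"`, `"S"` → `"s"`). A short sketch showing that the lowercase spellings produce the same ranges (illustrative dates, not from the diff):

```python
import pandas as pd

# Lowercase frequency aliases used in the updated benchmarks.
per_minute = pd.date_range("2001-01-01", periods=3, freq="min")  # was freq="T"
hourly = pd.date_range("2000-01-01", periods=3, freq="h")        # was freq="H"
per_second = pd.date_range("2000-01-01", periods=3, freq="s")    # was freq="S"
```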
diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py
index 7092a679b8cf0..f938f7eb0d951 100644
--- a/asv_bench/benchmarks/frame_ctor.py
+++ b/asv_bench/benchmarks/frame_ctor.py
@@ -12,8 +12,6 @@
date_range,
)
-from .pandas_vb_common import tm
-
try:
from pandas.tseries.offsets import (
Hour,
@@ -30,8 +28,8 @@
class FromDicts:
def setup(self):
N, K = 5000, 50
- self.index = tm.makeStringIndex(N)
- self.columns = tm.makeStringIndex(K)
+ self.index = pd.Index([f"i-{i}" for i in range(N)], dtype=object)
+ self.columns = pd.Index([f"i-{i}" for i in range(K)], dtype=object)
frame = DataFrame(np.random.randn(N, K), index=self.index, columns=self.columns)
self.data = frame.to_dict()
self.dict_list = frame.to_dict(orient="records")
diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
index 96fe90ce08c5f..a283afd1f0f1e 100644
--- a/asv_bench/benchmarks/frame_methods.py
+++ b/asv_bench/benchmarks/frame_methods.py
@@ -5,6 +5,7 @@
from pandas import (
DataFrame,
+ Index,
MultiIndex,
NaT,
Series,
@@ -14,8 +15,6 @@
timedelta_range,
)
-from .pandas_vb_common import tm
-
class AsType:
params = [
@@ -439,9 +438,9 @@ def setup(self, inplace, dtype):
N, M = 10000, 100
if dtype in ("datetime64[ns]", "datetime64[ns, tz]", "timedelta64[ns]"):
data = {
- "datetime64[ns]": date_range("2011-01-01", freq="H", periods=N),
+ "datetime64[ns]": date_range("2011-01-01", freq="h", periods=N),
"datetime64[ns, tz]": date_range(
- "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
+ "2011-01-01", freq="h", periods=N, tz="Asia/Tokyo"
),
"timedelta64[ns]": timedelta_range(start="1 day", periods=N, freq="1D"),
}
@@ -640,7 +639,8 @@ def time_frame_nunique(self):
class SeriesNuniqueWithNan:
def setup(self):
- self.ser = Series(100000 * (100 * [np.nan] + list(range(100)))).astype(float)
+ values = 100 * [np.nan] + list(range(100))
+ self.ser = Series(np.tile(values, 10000), dtype=float)
def time_series_nunique_nan(self):
self.ser.nunique()
@@ -649,7 +649,7 @@ def time_series_nunique_nan(self):
class Duplicated:
def setup(self):
n = 1 << 20
- t = date_range("2015-01-01", freq="S", periods=(n // 64))
+ t = date_range("2015-01-01", freq="s", periods=(n // 64))
xs = np.random.randn(n // 64).round(2)
self.df = DataFrame(
{
@@ -693,20 +693,34 @@ def time_frame_sort_values(self, ascending):
self.df.sort_values(by="A", ascending=ascending)
-class SortIndexByColumns:
- def setup(self):
+class SortMultiKey:
+ params = [True, False]
+ param_names = ["monotonic"]
+
+ def setup(self, monotonic):
N = 10000
K = 10
- self.df = DataFrame(
+ df = DataFrame(
{
- "key1": tm.makeStringIndex(N).values.repeat(K),
- "key2": tm.makeStringIndex(N).values.repeat(K),
+ "key1": Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat(
+ K
+ ),
+ "key2": Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat(
+ K
+ ),
"value": np.random.randn(N * K),
}
)
+ if monotonic:
+ df = df.sort_values(["key1", "key2"])
+ self.df_by_columns = df
+ self.df_by_index = df.set_index(["key1", "key2"])
+
+ def time_sort_values(self, monotonic):
+ self.df_by_columns.sort_values(by=["key1", "key2"])
- def time_frame_sort_values_by_columns(self):
- self.df.sort_values(by=["key1", "key2"])
+ def time_sort_index(self, monotonic):
+ self.df_by_index.sort_index()
class Quantile:
diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py
index 4d5c31d2dddf8..a0c4189c72d0e 100644
--- a/asv_bench/benchmarks/gil.py
+++ b/asv_bench/benchmarks/gil.py
@@ -5,6 +5,7 @@
from pandas import (
DataFrame,
+ Index,
Series,
date_range,
factorize,
@@ -12,8 +13,6 @@
)
from pandas.core.algorithms import take_nd
-from .pandas_vb_common import tm
-
try:
from pandas import (
rolling_kurt,
@@ -34,7 +33,6 @@
except ImportError:
from pandas import algos
-
from .pandas_vb_common import BaseIO # isort:skip
@@ -178,7 +176,7 @@ def time_kth_smallest(self):
class ParallelDatetimeFields:
def setup(self):
N = 10**6
- self.dti = date_range("1900-01-01", periods=N, freq="T")
+ self.dti = date_range("1900-01-01", periods=N, freq="min")
self.period = self.dti.to_period("D")
def time_datetime_field_year(self):
@@ -212,7 +210,7 @@ def run(dti):
def time_datetime_to_period(self):
@test_parallel(num_threads=2)
def run(dti):
- dti.to_period("S")
+ dti.to_period("s")
run(self.dti)
@@ -272,18 +270,20 @@ class ParallelReadCSV(BaseIO):
def setup(self, dtype):
rows = 10000
cols = 50
- data = {
- "float": DataFrame(np.random.randn(rows, cols)),
- "datetime": DataFrame(
+ if dtype == "float":
+ df = DataFrame(np.random.randn(rows, cols))
+ elif dtype == "datetime":
+ df = DataFrame(
np.random.randn(rows, cols), index=date_range("1/1/2000", periods=rows)
- ),
- "object": DataFrame(
+ )
+ elif dtype == "object":
+ df = DataFrame(
"foo", index=range(rows), columns=["object%03d" for _ in range(5)]
- ),
- }
+ )
+ else:
+ raise NotImplementedError
self.fname = f"__test_{dtype}__.csv"
- df = data[dtype]
df.to_csv(self.fname)
@test_parallel(num_threads=2)
@@ -303,7 +303,7 @@ class ParallelFactorize:
param_names = ["threads"]
def setup(self, threads):
- strings = tm.makeStringIndex(100000)
+ strings = Index([f"i-{i}" for i in range(100000)], dtype=object)
@test_parallel(num_threads=threads)
def parallel():
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index b206523dfe851..abffa1f702b9c 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -17,8 +17,6 @@
to_timedelta,
)
-from .pandas_vb_common import tm
-
method_blocklist = {
"object": {
"diff",
@@ -73,6 +71,8 @@
"ffill",
"first",
"head",
+ "idxmax",
+ "idxmin",
"last",
"median",
"nunique",
@@ -165,10 +165,14 @@ def setup_cache(self):
"int64_small": Series(np.random.randint(0, 100, size=size)),
"int64_large": Series(np.random.randint(0, 10000, size=size)),
"object_small": Series(
- tm.makeStringIndex(100).take(np.random.randint(0, 100, size=size))
+ Index([f"i-{i}" for i in range(100)], dtype=object).take(
+ np.random.randint(0, 100, size=size)
+ )
),
"object_large": Series(
- tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=size))
+ Index([f"i-{i}" for i in range(10000)], dtype=object).take(
+ np.random.randint(0, 10000, size=size)
+ )
),
}
return data
@@ -236,7 +240,7 @@ def time_series_nth(self, dtype):
class DateAttributes:
def setup(self):
- rng = date_range("1/1/2000", "12/31/2005", freq="H")
+ rng = date_range("1/1/2000", "12/31/2005", freq="h")
self.year, self.month, self.day = rng.year, rng.month, rng.day
self.ts = Series(np.random.randn(len(rng)), index=rng)
@@ -588,6 +592,8 @@ class GroupByCythonAgg:
"prod",
"min",
"max",
+ "idxmin",
+ "idxmax",
"mean",
"median",
"var",
@@ -709,7 +715,7 @@ def setup(self, dtype, tie_method):
if dtype == "datetime64":
data = np.array([Timestamp("2011/01/01")] * N, dtype=dtype)
else:
- data = np.array([1] * N, dtype=dtype)
+ data = np.ones(N, dtype=dtype)
self.df = DataFrame({"values": data, "key": ["foo"] * N})
def time_rank_ties(self, dtype, tie_method):
@@ -798,6 +804,51 @@ def time_groupby_extra_cat_nosort(self, observed):
self.df_extra_cat.groupby("a", observed=observed, sort=False)["b"].count()
+class MultipleCategories:
+ def setup(self):
+ N = 10**3
+ arr = np.random.random(N)
+ data = {
+ "a1": Categorical(np.random.randint(10000, size=N)),
+ "a2": Categorical(np.random.randint(10000, size=N)),
+ "b": arr,
+ }
+ self.df = DataFrame(data)
+ data = {
+ "a1": Categorical(np.random.randint(10000, size=N), ordered=True),
+ "a2": Categorical(np.random.randint(10000, size=N), ordered=True),
+ "b": arr,
+ }
+ self.df_ordered = DataFrame(data)
+ data = {
+ "a1": Categorical(np.random.randint(100, size=N), categories=np.arange(N)),
+ "a2": Categorical(np.random.randint(100, size=N), categories=np.arange(N)),
+ "b": arr,
+ }
+ self.df_extra_cat = DataFrame(data)
+
+ def time_groupby_sort(self):
+ self.df.groupby(["a1", "a2"], observed=False)["b"].count()
+
+ def time_groupby_nosort(self):
+ self.df.groupby(["a1", "a2"], observed=False, sort=False)["b"].count()
+
+ def time_groupby_ordered_sort(self):
+ self.df_ordered.groupby(["a1", "a2"], observed=False)["b"].count()
+
+ def time_groupby_ordered_nosort(self):
+ self.df_ordered.groupby(["a1", "a2"], observed=False, sort=False)["b"].count()
+
+ def time_groupby_extra_cat_sort(self):
+ self.df_extra_cat.groupby(["a1", "a2"], observed=False)["b"].count()
+
+ def time_groupby_extra_cat_nosort(self):
+ self.df_extra_cat.groupby(["a1", "a2"], observed=False, sort=False)["b"].count()
+
+ def time_groupby_transform(self):
+ self.df_extra_cat.groupby(["a1", "a2"], observed=False)["b"].cumsum()
+
+
class Datelike:
# GH 14338
params = ["period_range", "date_range", "date_range_tz"]
@@ -841,12 +892,29 @@ def time_groupby_sum_multiindex(self):
self.df.groupby(level=[0, 1]).sum()
+class SumTimeDelta:
+ # GH 20660
+ def setup(self):
+ N = 10**4
+ self.df = DataFrame(
+ np.random.randint(1000, 100000, (N, 100)),
+ index=np.random.randint(200, size=(N,)),
+ ).astype("timedelta64[ns]")
+ self.df_int = self.df.copy().astype("int64")
+
+ def time_groupby_sum_timedelta(self):
+ self.df.groupby(lambda x: x).sum()
+
+ def time_groupby_sum_int(self):
+ self.df_int.groupby(lambda x: x).sum()
+
+
class Transform:
def setup(self):
n1 = 400
n2 = 250
index = MultiIndex(
- levels=[np.arange(n1), tm.makeStringIndex(n2)],
+ levels=[np.arange(n1), Index([f"i-{i}" for i in range(n2)], dtype=object)],
codes=[np.repeat(range(n1), n2).tolist(), list(range(n2)) * n1],
names=["lev1", "lev2"],
)
diff --git a/asv_bench/benchmarks/index_cached_properties.py b/asv_bench/benchmarks/index_cached_properties.py
index b3d8de39a858a..d21bbe15c4cc8 100644
--- a/asv_bench/benchmarks/index_cached_properties.py
+++ b/asv_bench/benchmarks/index_cached_properties.py
@@ -25,14 +25,14 @@ def setup(self, index_type):
N = 10**5
if index_type == "MultiIndex":
self.idx = pd.MultiIndex.from_product(
- [pd.date_range("1/1/2000", freq="T", periods=N // 2), ["a", "b"]]
+ [pd.date_range("1/1/2000", freq="min", periods=N // 2), ["a", "b"]]
)
elif index_type == "DatetimeIndex":
- self.idx = pd.date_range("1/1/2000", freq="T", periods=N)
+ self.idx = pd.date_range("1/1/2000", freq="min", periods=N)
elif index_type == "Int64Index":
self.idx = pd.Index(range(N), dtype="int64")
elif index_type == "PeriodIndex":
- self.idx = pd.period_range("1/1/2000", freq="T", periods=N)
+ self.idx = pd.period_range("1/1/2000", freq="min", periods=N)
elif index_type == "RangeIndex":
self.idx = pd.RangeIndex(start=0, stop=N)
elif index_type == "IntervalIndex":
diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py
index bdc8a6a7aa1df..637b1b40f59a3 100644
--- a/asv_bench/benchmarks/index_object.py
+++ b/asv_bench/benchmarks/index_object.py
@@ -12,8 +12,6 @@
date_range,
)
-from .pandas_vb_common import tm
-
class SetOperations:
params = (
@@ -25,12 +23,12 @@ class SetOperations:
def setup(self, index_structure, dtype, method):
N = 10**5
- dates_left = date_range("1/1/2000", periods=N, freq="T")
+ dates_left = date_range("1/1/2000", periods=N, freq="min")
fmt = "%Y-%m-%d %H:%M:%S"
date_str_left = Index(dates_left.strftime(fmt))
int_left = Index(np.arange(N))
ea_int_left = Index(np.arange(N), dtype="Int64")
- str_left = tm.makeStringIndex(N)
+ str_left = Index([f"i-{i}" for i in range(N)], dtype=object)
data = {
"datetime": dates_left,
@@ -155,15 +153,18 @@ class Indexing:
def setup(self, dtype):
N = 10**6
- self.idx = getattr(tm, f"make{dtype}Index")(N)
+ if dtype == "String":
+ self.idx = Index([f"i-{i}" for i in range(N)], dtype=object)
+ elif dtype == "Float":
+ self.idx = Index(np.arange(N), dtype=np.float64)
+ elif dtype == "Int":
+ self.idx = Index(np.arange(N), dtype=np.int64)
self.array_mask = (np.arange(N) % 3) == 0
self.series_mask = Series(self.array_mask)
self.sorted = self.idx.sort_values()
half = N // 2
self.non_unique = self.idx[:half].append(self.idx[:half])
- self.non_unique_sorted = (
- self.sorted[:half].append(self.sorted[:half]).sort_values()
- )
+ self.non_unique_sorted = self.sorted[:half].repeat(2)
self.key = self.sorted[N // 4]
def time_boolean_array(self, dtype):
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
index 84d95a23bd446..9ad1f5b31016d 100644
--- a/asv_bench/benchmarks/indexing.py
+++ b/asv_bench/benchmarks/indexing.py
@@ -22,8 +22,6 @@
period_range,
)
-from .pandas_vb_common import tm
-
class NumericSeriesIndexing:
params = [
@@ -124,7 +122,7 @@ class NonNumericSeriesIndexing:
def setup(self, index, index_structure):
N = 10**6
if index == "string":
- index = tm.makeStringIndex(N)
+ index = Index([f"i-{i}" for i in range(N)], dtype=object)
elif index == "datetime":
index = date_range("1900", periods=N, freq="s")
elif index == "period":
@@ -156,8 +154,8 @@ def time_getitem_list_like(self, index, index_structure):
class DataFrameStringIndexing:
def setup(self):
- index = tm.makeStringIndex(1000)
- columns = tm.makeStringIndex(30)
+ index = Index([f"i-{i}" for i in range(1000)], dtype=object)
+ columns = Index([f"i-{i}" for i in range(30)], dtype=object)
with warnings.catch_warnings(record=True):
self.df = DataFrame(np.random.randn(1000, 30), index=index, columns=columns)
self.idx_scalar = index[100]
@@ -232,7 +230,7 @@ def setup(self, index):
N = 100000
indexes = {
"int": Index(np.arange(N), dtype=np.int64),
- "datetime": date_range("2011-01-01", freq="S", periods=N),
+ "datetime": date_range("2011-01-01", freq="s", periods=N),
}
index = indexes[index]
self.s = Series(np.random.rand(N), index=index)
@@ -306,6 +304,10 @@ def time_loc_null_slice_plus_slice(self, unique_levels):
target = (self.tgt_null_slice, self.tgt_slice)
self.df.loc[target, :]
+ def time_loc_multiindex(self, unique_levels):
+ target = self.df.index[::10]
+ self.df.loc[target]
+
def time_xs_level_0(self, unique_levels):
target = self.tgt_scalar
self.df.xs(target, level=0)
@@ -465,7 +467,7 @@ def time_loc_row(self, unique_cols):
class AssignTimeseriesIndex:
def setup(self):
N = 100000
- idx = date_range("1/1/2000", periods=N, freq="H")
+ idx = date_range("1/1/2000", periods=N, freq="h")
self.df = DataFrame(np.random.randn(N, 1), columns=["A"], index=idx)
def time_frame_assign_timeseries_index(self):
@@ -515,6 +517,18 @@ def time_setitem_list(self):
self.df[[100, 200, 300]] = 100
+class SetitemObjectDtype:
+ # GH#19299
+
+ def setup(self):
+ N = 1000
+ cols = 500
+ self.df = DataFrame(index=range(N), columns=range(cols), dtype=object)
+
+ def time_setitem_object_dtype(self):
+ self.df.loc[0, 1] = 1.0
+
+
class ChainIndexing:
params = [None, "warn"]
param_names = ["mode"]
diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py
index 6585a4be78dc6..fd3d0f0b9cf2e 100644
--- a/asv_bench/benchmarks/indexing_engines.py
+++ b/asv_bench/benchmarks/indexing_engines.py
@@ -71,14 +71,12 @@ def setup(self, engine_and_dtype, index_type, unique, N):
if unique:
arr = np.arange(N * 3, dtype=dtype)
else:
- values = list([1] * N + [2] * N + [3] * N)
- arr = np.array(values, dtype=dtype)
+ arr = np.array([1, 2, 3], dtype=dtype).repeat(N)
elif index_type == "monotonic_decr":
if unique:
arr = np.arange(N * 3, dtype=dtype)[::-1]
else:
- values = list([1] * N + [2] * N + [3] * N)
- arr = np.array(values, dtype=dtype)[::-1]
+ arr = np.array([3, 2, 1], dtype=dtype).repeat(N)
else:
assert index_type == "non_monotonic"
if unique:
@@ -86,7 +84,7 @@ def setup(self, engine_and_dtype, index_type, unique, N):
arr[:N] = np.arange(N * 2, N * 3, dtype=dtype)
arr[N:] = np.arange(N * 2, dtype=dtype)
else:
- arr = np.array([1, 2, 3] * N, dtype=dtype)
+ arr = np.array([1, 2, 3], dtype=dtype).repeat(N)
self.data = engine(arr)
        # code below avoids populating the mapping etc. while timing.
@@ -115,30 +113,29 @@ class MaskedNumericEngineIndexing:
def setup(self, engine_and_dtype, index_type, unique, N):
engine, dtype = engine_and_dtype
+ dtype = dtype.lower()
if index_type == "monotonic_incr":
if unique:
- arr = np.arange(N * 3, dtype=dtype.lower())
+ arr = np.arange(N * 3, dtype=dtype)
else:
- values = list([1] * N + [2] * N + [3] * N)
- arr = np.array(values, dtype=dtype.lower())
+ arr = np.array([1, 2, 3], dtype=dtype).repeat(N)
mask = np.zeros(N * 3, dtype=np.bool_)
elif index_type == "monotonic_decr":
if unique:
- arr = np.arange(N * 3, dtype=dtype.lower())[::-1]
+ arr = np.arange(N * 3, dtype=dtype)[::-1]
else:
- values = list([1] * N + [2] * N + [3] * N)
- arr = np.array(values, dtype=dtype.lower())[::-1]
+ arr = np.array([3, 2, 1], dtype=dtype).repeat(N)
mask = np.zeros(N * 3, dtype=np.bool_)
else:
assert index_type == "non_monotonic"
if unique:
- arr = np.zeros(N * 3, dtype=dtype.lower())
- arr[:N] = np.arange(N * 2, N * 3, dtype=dtype.lower())
- arr[N:] = np.arange(N * 2, dtype=dtype.lower())
+ arr = np.zeros(N * 3, dtype=dtype)
+ arr[:N] = np.arange(N * 2, N * 3, dtype=dtype)
+ arr[N:] = np.arange(N * 2, dtype=dtype)
else:
- arr = np.array([1, 2, 3] * N, dtype=dtype.lower())
+ arr = np.array([1, 2, 3], dtype=dtype).repeat(N)
mask = np.zeros(N * 3, dtype=np.bool_)
mask[-1] = True
diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py
index 476ff14dcc92a..d5c58033c1157 100644
--- a/asv_bench/benchmarks/inference.py
+++ b/asv_bench/benchmarks/inference.py
@@ -9,6 +9,7 @@
import numpy as np
from pandas import (
+ Index,
NaT,
Series,
date_range,
@@ -17,10 +18,7 @@
to_timedelta,
)
-from .pandas_vb_common import (
- lib,
- tm,
-)
+from .pandas_vb_common import lib
class ToNumeric:
@@ -31,7 +29,7 @@ def setup(self, errors):
N = 10000
self.float = Series(np.random.randn(N))
self.numstr = self.float.astype("str")
- self.str = Series(tm.makeStringIndex(N))
+ self.str = Series(Index([f"i-{i}" for i in range(N)], dtype=object))
def time_from_float(self, errors):
to_numeric(self.float, errors=errors)
@@ -164,7 +162,7 @@ def time_unique_date_strings(self, cache, count):
class ToDatetimeISO8601:
def setup(self):
- rng = date_range(start="1/1/2000", periods=20000, freq="H")
+ rng = date_range(start="1/1/2000", periods=20000, freq="h")
self.strings = rng.strftime("%Y-%m-%d %H:%M:%S").tolist()
self.strings_nosep = rng.strftime("%Y%m%d %H:%M:%S").tolist()
self.strings_tz_space = [
@@ -276,7 +274,7 @@ def time_dup_string_tzoffset_dates(self, cache):
# GH 43901
class ToDatetimeInferDatetimeFormat:
def setup(self):
- rng = date_range(start="1/1/2000", periods=100000, freq="H")
+ rng = date_range(start="1/1/2000", periods=100000, freq="h")
self.strings = rng.strftime("%Y-%m-%d %H:%M:%S").tolist()
def time_infer_datetime_format(self):
diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
index c5e3e80571e30..9ac83db4f85b9 100644
--- a/asv_bench/benchmarks/io/csv.py
+++ b/asv_bench/benchmarks/io/csv.py
@@ -10,6 +10,7 @@
from pandas import (
Categorical,
DataFrame,
+ Index,
concat,
date_range,
period_range,
@@ -17,10 +18,7 @@
to_datetime,
)
-from ..pandas_vb_common import (
- BaseIO,
- tm,
-)
+from ..pandas_vb_common import BaseIO
class ToCSV(BaseIO):
@@ -89,7 +87,7 @@ class ToCSVDatetimeIndex(BaseIO):
fname = "__test__.csv"
def setup(self):
- rng = date_range("2000", periods=100_000, freq="S")
+ rng = date_range("2000", periods=100_000, freq="s")
self.data = DataFrame({"a": 1}, index=rng)
def time_frame_date_formatting_index(self):
@@ -102,7 +100,7 @@ def time_frame_date_no_format_index(self):
class ToCSVPeriod(BaseIO):
fname = "__test__.csv"
- params = ([1000, 10000], ["D", "H"])
+ params = ([1000, 10000], ["D", "h"])
param_names = ["nobs", "freq"]
def setup(self, nobs, freq):
@@ -110,7 +108,7 @@ def setup(self, nobs, freq):
self.data = DataFrame(rng)
if freq == "D":
self.default_fmt = "%Y-%m-%d"
- elif freq == "H":
+ elif freq == "h":
self.default_fmt = "%Y-%m-%d %H:00"
def time_frame_period_formatting_default(self, nobs, freq):
@@ -130,7 +128,7 @@ def time_frame_period_formatting(self, nobs, freq):
class ToCSVPeriodIndex(BaseIO):
fname = "__test__.csv"
- params = ([1000, 10000], ["D", "H"])
+ params = ([1000, 10000], ["D", "h"])
param_names = ["nobs", "freq"]
def setup(self, nobs, freq):
@@ -138,7 +136,7 @@ def setup(self, nobs, freq):
self.data = DataFrame({"a": 1}, index=rng)
if freq == "D":
self.default_fmt = "%Y-%m-%d"
- elif freq == "H":
+ elif freq == "h":
self.default_fmt = "%Y-%m-%d %H:00"
def time_frame_period_formatting_index(self, nobs, freq):
@@ -253,7 +251,7 @@ class ReadCSVConcatDatetime(StringIORewind):
iso8601 = "%Y-%m-%d %H:%M:%S"
def setup(self):
- rng = date_range("1/1/2000", periods=50000, freq="S")
+ rng = date_range("1/1/2000", periods=50000, freq="s")
self.StringIO_input = StringIO("\n".join(rng.strftime(self.iso8601).tolist()))
def time_read_csv(self):
@@ -288,7 +286,7 @@ class ReadCSVSkipRows(BaseIO):
def setup(self, skiprows, engine):
N = 20000
- index = tm.makeStringIndex(N)
+ index = Index([f"i-{i}" for i in range(N)], dtype=object)
df = DataFrame(
{
"float1": np.random.randn(N),
@@ -621,4 +619,15 @@ def time_read_csv_index_col(self):
)
+class ReadCSVCParserLowMemory:
+ # GH 16798
+ def setup(self):
+ self.csv = StringIO(
+ "strings\n" + "\n".join(["x" * (1 << 20) for _ in range(2100)])
+ )
+
+ def peakmem_over_2gb_input(self):
+ read_csv(self.csv, engine="c", low_memory=False)
+
+
from ..pandas_vb_common import setup # noqa: F401 isort:skip
diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py
index c77c6b6f5727c..902a61be901bd 100644
--- a/asv_bench/benchmarks/io/excel.py
+++ b/asv_bench/benchmarks/io/excel.py
@@ -12,12 +12,11 @@
from pandas import (
DataFrame,
ExcelWriter,
+ Index,
date_range,
read_excel,
)
-from ..pandas_vb_common import tm
-
def _generate_dataframe():
N = 2000
@@ -25,9 +24,9 @@ def _generate_dataframe():
df = DataFrame(
np.random.randn(N, C),
columns=[f"float{i}" for i in range(C)],
- index=date_range("20000101", periods=N, freq="H"),
+ index=date_range("20000101", periods=N, freq="h"),
)
- df["object"] = tm.makeStringIndex(N)
+ df["object"] = Index([f"i-{i}" for i in range(N)], dtype=object)
return df
diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py
index f3e417e717609..acf0ec4b9d359 100644
--- a/asv_bench/benchmarks/io/hdf.py
+++ b/asv_bench/benchmarks/io/hdf.py
@@ -3,20 +3,18 @@
from pandas import (
DataFrame,
HDFStore,
+ Index,
date_range,
read_hdf,
)
-from ..pandas_vb_common import (
- BaseIO,
- tm,
-)
+from ..pandas_vb_common import BaseIO
class HDFStoreDataFrame(BaseIO):
def setup(self):
N = 25000
- index = tm.makeStringIndex(N)
+ index = Index([f"i-{i}" for i in range(N)], dtype=object)
self.df = DataFrame(
{"float1": np.random.randn(N), "float2": np.random.randn(N)}, index=index
)
@@ -122,9 +120,9 @@ def setup(self, format):
self.df = DataFrame(
np.random.randn(N, C),
columns=[f"float{i}" for i in range(C)],
- index=date_range("20000101", periods=N, freq="H"),
+ index=date_range("20000101", periods=N, freq="h"),
)
- self.df["object"] = tm.makeStringIndex(N)
+ self.df["object"] = Index([f"i-{i}" for i in range(N)], dtype=object)
self.df.to_hdf(self.fname, "df", format=format)
# Numeric df
diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py
index 9eaffddd8b87f..bcbfcdea42dd9 100644
--- a/asv_bench/benchmarks/io/json.py
+++ b/asv_bench/benchmarks/io/json.py
@@ -4,6 +4,7 @@
from pandas import (
DataFrame,
+ Index,
concat,
date_range,
json_normalize,
@@ -11,10 +12,7 @@
timedelta_range,
)
-from ..pandas_vb_common import (
- BaseIO,
- tm,
-)
+from ..pandas_vb_common import BaseIO
class ReadJSON(BaseIO):
@@ -26,7 +24,7 @@ def setup(self, orient, index):
N = 100000
indexes = {
"int": np.arange(N),
- "datetime": date_range("20000101", periods=N, freq="H"),
+ "datetime": date_range("20000101", periods=N, freq="h"),
}
df = DataFrame(
np.random.randn(N, 5),
@@ -48,7 +46,7 @@ def setup(self, index):
N = 100000
indexes = {
"int": np.arange(N),
- "datetime": date_range("20000101", periods=N, freq="H"),
+ "datetime": date_range("20000101", periods=N, freq="h"),
}
df = DataFrame(
np.random.randn(N, 5),
@@ -108,13 +106,13 @@ class ToJSON(BaseIO):
def setup(self, orient, frame):
N = 10**5
ncols = 5
- index = date_range("20000101", periods=N, freq="H")
+ index = date_range("20000101", periods=N, freq="h")
timedeltas = timedelta_range(start=1, periods=N, freq="s")
datetimes = date_range(start=1, periods=N, freq="s")
ints = np.random.randint(100000000, size=N)
longints = sys.maxsize * np.random.randint(100000000, size=N)
floats = np.random.randn(N)
- strings = tm.makeStringIndex(N)
+ strings = Index([f"i-{i}" for i in range(N)], dtype=object)
self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N))
self.df_date_idx = DataFrame(np.random.randn(N, ncols), index=index)
self.df_td_int_ts = DataFrame(
@@ -191,7 +189,7 @@ class ToJSONISO(BaseIO):
def setup(self, orient):
N = 10**5
- index = date_range("20000101", periods=N, freq="H")
+ index = date_range("20000101", periods=N, freq="h")
timedeltas = timedelta_range(start=1, periods=N, freq="s")
datetimes = date_range(start=1, periods=N, freq="s")
self.df = DataFrame(
@@ -214,13 +212,13 @@ class ToJSONLines(BaseIO):
def setup(self):
N = 10**5
ncols = 5
- index = date_range("20000101", periods=N, freq="H")
+ index = date_range("20000101", periods=N, freq="h")
timedeltas = timedelta_range(start=1, periods=N, freq="s")
datetimes = date_range(start=1, periods=N, freq="s")
ints = np.random.randint(100000000, size=N)
longints = sys.maxsize * np.random.randint(100000000, size=N)
floats = np.random.randn(N)
- strings = tm.makeStringIndex(N)
+ strings = Index([f"i-{i}" for i in range(N)], dtype=object)
self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N))
self.df_date_idx = DataFrame(np.random.randn(N, ncols), index=index)
self.df_td_int_ts = DataFrame(
@@ -290,7 +288,7 @@ def time_float_longint_str_lines(self):
class ToJSONMem:
def setup_cache(self):
df = DataFrame([[1]])
- df2 = DataFrame(range(8), date_range("1/1/2000", periods=8, freq="T"))
+ df2 = DataFrame(range(8), date_range("1/1/2000", periods=8, freq="min"))
frames = {"int": df, "float": df.astype(float), "datetime": df2}
return frames
diff --git a/asv_bench/benchmarks/io/pickle.py b/asv_bench/benchmarks/io/pickle.py
index c71cdcdcc5c59..4787b57b54756 100644
--- a/asv_bench/benchmarks/io/pickle.py
+++ b/asv_bench/benchmarks/io/pickle.py
@@ -2,14 +2,12 @@
from pandas import (
DataFrame,
+ Index,
date_range,
read_pickle,
)
-from ..pandas_vb_common import (
- BaseIO,
- tm,
-)
+from ..pandas_vb_common import BaseIO
class Pickle(BaseIO):
@@ -20,9 +18,9 @@ def setup(self):
self.df = DataFrame(
np.random.randn(N, C),
columns=[f"float{i}" for i in range(C)],
- index=date_range("20000101", periods=N, freq="H"),
+ index=date_range("20000101", periods=N, freq="h"),
)
- self.df["object"] = tm.makeStringIndex(N)
+ self.df["object"] = Index([f"i-{i}" for i in range(N)], dtype=object)
self.df.to_pickle(self.fname)
def time_read_pickle(self):
diff --git a/asv_bench/benchmarks/io/sql.py b/asv_bench/benchmarks/io/sql.py
index 6f893ee72d918..e87cc4aaa80c7 100644
--- a/asv_bench/benchmarks/io/sql.py
+++ b/asv_bench/benchmarks/io/sql.py
@@ -5,13 +5,12 @@
from pandas import (
DataFrame,
+ Index,
date_range,
read_sql_query,
read_sql_table,
)
-from ..pandas_vb_common import tm
-
class SQL:
params = ["sqlalchemy", "sqlite"]
@@ -35,7 +34,7 @@ def setup(self, connection):
"int": np.random.randint(0, N, size=N),
"datetime": date_range("2000-01-01", periods=N, freq="s"),
},
- index=tm.makeStringIndex(N),
+ index=Index([f"i-{i}" for i in range(N)], dtype=object),
)
self.df.iloc[1000:3000, 1] = np.nan
self.df["date"] = self.df["datetime"].dt.date
@@ -84,7 +83,7 @@ def setup(self, connection, dtype):
"int": np.random.randint(0, N, size=N),
"datetime": date_range("2000-01-01", periods=N, freq="s"),
},
- index=tm.makeStringIndex(N),
+ index=Index([f"i-{i}" for i in range(N)], dtype=object),
)
self.df.iloc[1000:3000, 1] = np.nan
self.df["date"] = self.df["datetime"].dt.date
@@ -113,7 +112,7 @@ def setup(self):
"int": np.random.randint(0, N, size=N),
"datetime": date_range("2000-01-01", periods=N, freq="s"),
},
- index=tm.makeStringIndex(N),
+ index=Index([f"i-{i}" for i in range(N)], dtype=object),
)
self.df.iloc[1000:3000, 1] = np.nan
self.df["date"] = self.df["datetime"].dt.date
@@ -159,7 +158,7 @@ def setup(self, dtype):
"int": np.random.randint(0, N, size=N),
"datetime": date_range("2000-01-01", periods=N, freq="s"),
},
- index=tm.makeStringIndex(N),
+ index=Index([f"i-{i}" for i in range(N)], dtype=object),
)
self.df.iloc[1000:3000, 1] = np.nan
self.df["date"] = self.df["datetime"].dt.date
diff --git a/asv_bench/benchmarks/io/stata.py b/asv_bench/benchmarks/io/stata.py
index 300b9c778f1f8..ff33ededdfed9 100644
--- a/asv_bench/benchmarks/io/stata.py
+++ b/asv_bench/benchmarks/io/stata.py
@@ -2,14 +2,12 @@
from pandas import (
DataFrame,
+ Index,
date_range,
read_stata,
)
-from ..pandas_vb_common import (
- BaseIO,
- tm,
-)
+from ..pandas_vb_common import BaseIO
class Stata(BaseIO):
@@ -23,9 +21,9 @@ def setup(self, convert_dates):
self.df = DataFrame(
np.random.randn(N, C),
columns=[f"float{i}" for i in range(C)],
- index=date_range("20000101", periods=N, freq="H"),
+ index=date_range("20000101", periods=N, freq="h"),
)
- self.df["object"] = tm.makeStringIndex(self.N)
+ self.df["object"] = Index([f"i-{i}" for i in range(self.N)], dtype=object)
self.df["int8_"] = np.random.randint(
np.iinfo(np.int8).min, np.iinfo(np.int8).max - 27, N
)
diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py
index 4f325335829af..ce64304731116 100644
--- a/asv_bench/benchmarks/join_merge.py
+++ b/asv_bench/benchmarks/join_merge.py
@@ -14,8 +14,6 @@
merge_asof,
)
-from .pandas_vb_common import tm
-
try:
from pandas import merge_ordered
except ImportError:
@@ -28,7 +26,7 @@ class Concat:
def setup(self, axis):
N = 1000
- s = Series(N, index=tm.makeStringIndex(N))
+ s = Series(N, index=Index([f"i-{i}" for i in range(N)], dtype=object))
self.series = [s[i:-i] for i in range(1, 10)] * 50
self.small_frames = [DataFrame(np.random.randn(5, 4))] * 1000
df = DataFrame(
@@ -94,7 +92,7 @@ def setup(self, dtype, structure, axis, sort):
elif dtype in ("int64", "Int64", "int64[pyarrow]"):
vals = np.arange(N, dtype=np.int64)
elif dtype in ("string[python]", "string[pyarrow]"):
- vals = tm.makeStringIndex(N)
+ vals = Index([f"i-{i}" for i in range(N)], dtype=object)
else:
raise NotImplementedError
@@ -122,8 +120,8 @@ class Join:
param_names = ["sort"]
def setup(self, sort):
- level1 = tm.makeStringIndex(10).values
- level2 = tm.makeStringIndex(1000).values
+ level1 = Index([f"i-{i}" for i in range(10)], dtype=object).values
+ level2 = Index([f"i-{i}" for i in range(1000)], dtype=object).values
codes1 = np.arange(10).repeat(1000)
codes2 = np.tile(np.arange(1000), 10)
index2 = MultiIndex(levels=[level1, level2], codes=[codes1, codes2])
@@ -212,8 +210,8 @@ class JoinNonUnique:
# outer join of non-unique
# GH 6329
def setup(self):
- date_index = date_range("01-Jan-2013", "23-Jan-2013", freq="T")
- daily_dates = date_index.to_period("D").to_timestamp("S", "S")
+ date_index = date_range("01-Jan-2013", "23-Jan-2013", freq="min")
+ daily_dates = date_index.to_period("D").to_timestamp("s", "s")
self.fracofday = date_index.values - daily_dates.values
self.fracofday = self.fracofday.astype("timedelta64[ns]")
self.fracofday = self.fracofday.astype(np.float64) / 86_400_000_000_000
@@ -231,8 +229,8 @@ class Merge:
def setup(self, sort):
N = 10000
- indices = tm.makeStringIndex(N).values
- indices2 = tm.makeStringIndex(N).values
+ indices = Index([f"i-{i}" for i in range(N)], dtype=object).values
+ indices2 = Index([f"i-{i}" for i in range(N)], dtype=object).values
key = np.tile(indices[:8000], 10)
key2 = np.tile(indices2[:8000], 10)
self.left = DataFrame(
@@ -277,18 +275,21 @@ def time_merge_dataframes_cross(self, sort):
class MergeEA:
params = [
- "Int64",
- "Int32",
- "Int16",
- "UInt64",
- "UInt32",
- "UInt16",
- "Float64",
- "Float32",
+ [
+ "Int64",
+ "Int32",
+ "Int16",
+ "UInt64",
+ "UInt32",
+ "UInt16",
+ "Float64",
+ "Float32",
+ ],
+ [True, False],
]
- param_names = ["dtype"]
+ param_names = ["dtype", "monotonic"]
- def setup(self, dtype):
+ def setup(self, dtype, monotonic):
N = 10_000
indices = np.arange(1, N)
key = np.tile(indices[:8000], 10)
@@ -301,8 +302,11 @@ def setup(self, dtype):
"value2": np.random.randn(7999),
}
)
+ if monotonic:
+ self.left = self.left.sort_values("key")
+ self.right = self.right.sort_values("key")
- def time_merge(self, dtype):
+ def time_merge(self, dtype, monotonic):
merge(self.left, self.right)
@@ -332,13 +336,14 @@ class MergeDatetime:
("ns", "ms"),
],
[None, "Europe/Brussels"],
+ [True, False],
]
- param_names = ["units", "tz"]
+ param_names = ["units", "tz", "monotonic"]
- def setup(self, units, tz):
+ def setup(self, units, tz, monotonic):
unit_left, unit_right = units
N = 10_000
- keys = Series(date_range("2012-01-01", freq="T", periods=N, tz=tz))
+ keys = Series(date_range("2012-01-01", freq="min", periods=N, tz=tz))
self.left = DataFrame(
{
"key": keys.sample(N * 10, replace=True).dt.as_unit(unit_left),
@@ -351,8 +356,11 @@ def setup(self, units, tz):
"value2": np.random.randn(8000),
}
)
+ if monotonic:
+ self.left = self.left.sort_values("key")
+ self.right = self.right.sort_values("key")
- def time_merge(self, units, tz):
+ def time_merge(self, units, tz, monotonic):
merge(self.left, self.right)
@@ -360,14 +368,14 @@ class MergeCategoricals:
def setup(self):
self.left_object = DataFrame(
{
- "X": np.random.choice(range(0, 10), size=(10000,)),
+ "X": np.random.choice(range(10), size=(10000,)),
"Y": np.random.choice(["one", "two", "three"], size=(10000,)),
}
)
self.right_object = DataFrame(
{
- "X": np.random.choice(range(0, 10), size=(10000,)),
+ "X": np.random.choice(range(10), size=(10000,)),
"Z": np.random.choice(["jjj", "kkk", "sss"], size=(10000,)),
}
)
@@ -400,7 +408,7 @@ def time_merge_on_cat_idx(self):
class MergeOrdered:
def setup(self):
- groups = tm.makeStringIndex(10).values
+ groups = Index([f"i-{i}" for i in range(10)], dtype=object).values
self.left = DataFrame(
{
"group": groups.repeat(5000),
diff --git a/asv_bench/benchmarks/libs.py b/asv_bench/benchmarks/libs.py
index f041499c9c622..3419163bcfe09 100644
--- a/asv_bench/benchmarks/libs.py
+++ b/asv_bench/benchmarks/libs.py
@@ -15,13 +15,11 @@
from pandas import (
NA,
+ Index,
NaT,
)
-from .pandas_vb_common import (
- lib,
- tm,
-)
+from .pandas_vb_common import lib
try:
from pandas.util import cache_readonly
@@ -61,8 +59,8 @@ class FastZip:
def setup(self):
N = 10000
K = 10
- key1 = tm.makeStringIndex(N).values.repeat(K)
- key2 = tm.makeStringIndex(N).values.repeat(K)
+ key1 = Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat(K)
+ key2 = Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat(K)
col_array = np.vstack([key1, key2, np.random.randn(N * K)])
col_array2 = col_array.copy()
col_array2[:, :10000] = np.nan
diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py
index 87dcdb16fa647..54788d41d83fe 100644
--- a/asv_bench/benchmarks/multiindex_object.py
+++ b/asv_bench/benchmarks/multiindex_object.py
@@ -5,6 +5,7 @@
from pandas import (
NA,
DataFrame,
+ Index,
MultiIndex,
RangeIndex,
Series,
@@ -12,8 +13,6 @@
date_range,
)
-from .pandas_vb_common import tm
-
class GetLoc:
def setup(self):
@@ -144,7 +143,11 @@ def time_is_monotonic(self):
class Duplicated:
def setup(self):
n, k = 200, 5000
- levels = [np.arange(n), tm.makeStringIndex(n).values, 1000 + np.arange(n)]
+ levels = [
+ np.arange(n),
+ Index([f"i-{i}" for i in range(n)], dtype=object).values,
+ 1000 + np.arange(n),
+ ]
codes = [np.random.choice(n, (k * n)) for lev in levels]
self.mi = MultiIndex(levels=levels, codes=codes)
@@ -249,7 +252,7 @@ def setup(self, index_structure, dtype, method, sort):
level2 = range(N // 1000)
int_left = MultiIndex.from_product([level1, level2])
- level2 = tm.makeStringIndex(N // 1000).values
+ level2 = Index([f"i-{i}" for i in range(N // 1000)], dtype=object).values
str_left = MultiIndex.from_product([level1, level2])
level2 = range(N // 1000)
@@ -293,7 +296,7 @@ def setup(self, dtype):
level2[0] = NA
ea_int_left = MultiIndex.from_product([level1, level2])
- level2 = tm.makeStringIndex(N // 1000).values
+ level2 = Index([f"i-{i}" for i in range(N // 1000)], dtype=object).values
str_left = MultiIndex.from_product([level1, level2])
data = {
@@ -354,7 +357,7 @@ def setup(self, dtype):
level2 = range(N // 1000)
int_midx = MultiIndex.from_product([level1, level2])
- level2 = tm.makeStringIndex(N // 1000).values
+ level2 = Index([f"i-{i}" for i in range(N // 1000)], dtype=object).values
str_midx = MultiIndex.from_product([level1, level2])
data = {
@@ -411,7 +414,7 @@ def setup(self, dtype):
elif dtype == "int64":
level2 = range(N2)
elif dtype == "string":
- level2 = tm.makeStringIndex(N2)
+ level2 = Index([f"i-{i}" for i in range(N2)], dtype=object)
else:
raise NotImplementedError
diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py
index 501fe198d41d8..ccd86cae06d58 100644
--- a/asv_bench/benchmarks/period.py
+++ b/asv_bench/benchmarks/period.py
@@ -45,7 +45,7 @@ def time_from_ints_daily(self, freq, is_offset):
class DataFramePeriodColumn:
def setup(self):
- self.rng = period_range(start="1/1/1990", freq="S", periods=20000)
+ self.rng = period_range(start="1/1/1990", freq="s", periods=20000)
self.df = DataFrame(index=range(len(self.rng)))
def time_setitem_period_column(self):
diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py
index 1c5e6050db275..3d22bfce7e2b2 100644
--- a/asv_bench/benchmarks/reindex.py
+++ b/asv_bench/benchmarks/reindex.py
@@ -9,8 +9,6 @@
period_range,
)
-from .pandas_vb_common import tm
-
class Reindex:
def setup(self):
@@ -23,8 +21,8 @@ def setup(self):
)
N = 5000
K = 200
- level1 = tm.makeStringIndex(N).values.repeat(K)
- level2 = np.tile(tm.makeStringIndex(K).values, N)
+ level1 = Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat(K)
+ level2 = np.tile(Index([f"i-{i}" for i in range(K)], dtype=object).values, N)
index = MultiIndex.from_arrays([level1, level2])
self.s = Series(np.random.randn(N * K), index=index)
self.s_subset = self.s[::2]
@@ -93,8 +91,8 @@ class DropDuplicates:
def setup(self, inplace):
N = 10000
K = 10
- key1 = tm.makeStringIndex(N).values.repeat(K)
- key2 = tm.makeStringIndex(N).values.repeat(K)
+ key1 = Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat(K)
+ key2 = Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat(K)
self.df = DataFrame(
{"key1": key1, "key2": key2, "value": np.random.randn(N * K)}
)
@@ -102,7 +100,9 @@ def setup(self, inplace):
self.df_nan.iloc[:10000, :] = np.nan
self.s = Series(np.random.randint(0, 1000, size=10000))
- self.s_str = Series(np.tile(tm.makeStringIndex(1000).values, 10))
+ self.s_str = Series(
+ np.tile(Index([f"i-{i}" for i in range(1000)], dtype=object).values, 10)
+ )
N = 1000000
K = 10000
@@ -133,7 +133,7 @@ class Align:
# blog "pandas escaped the zoo"
def setup(self):
n = 50000
- indices = tm.makeStringIndex(n)
+ indices = Index([f"i-{i}" for i in range(n)], dtype=object)
subsample_size = 40000
self.x = Series(np.random.randn(n), indices)
self.y = Series(
diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
index 288369145576e..b021af4694d7d 100644
--- a/asv_bench/benchmarks/series_methods.py
+++ b/asv_bench/benchmarks/series_methods.py
@@ -10,8 +10,6 @@
date_range,
)
-from .pandas_vb_common import tm
-
class SeriesConstructor:
def setup(self):
@@ -28,9 +26,6 @@ def time_constructor_dict(self):
def time_constructor_no_data(self):
Series(data=None, index=self.idx)
- def time_constructor_fastpath(self):
- Series(self.array, index=self.idx2, name="name", fastpath=True)
-
class ToFrame:
params = [["int64", "datetime64[ns]", "category", "Int64"], [None, "foo"]]
@@ -67,7 +62,7 @@ def setup(self, dtype):
N = 10**6
data = {
"int": np.random.randint(1, 10, N),
- "datetime": date_range("2000-01-01", freq="S", periods=N),
+ "datetime": date_range("2000-01-01", freq="s", periods=N),
}
self.s = Series(data[dtype])
if dtype == "datetime":
@@ -95,7 +90,7 @@ class Fillna:
def setup(self, dtype):
N = 10**6
if dtype == "datetime64[ns]":
- data = date_range("2000-01-01", freq="S", periods=N)
+ data = date_range("2000-01-01", freq="s", periods=N)
na_value = NaT
elif dtype in ("float64", "Float64"):
data = np.random.randn(N)
@@ -256,7 +251,7 @@ def time_mode(self, N):
class Dir:
def setup(self):
- self.s = Series(index=tm.makeStringIndex(10000))
+ self.s = Series(index=Index([f"i-{i}" for i in range(10000)], dtype=object))
def time_dir_strings(self):
dir(self.s)
@@ -320,7 +315,7 @@ def setup(self, func, N, dtype):
if func == "argmax" and dtype in {"Int64", "boolean"}:
# Skip argmax for nullable int since this doesn't work yet (GH-24382)
raise NotImplementedError
- self.s = Series([1] * N, dtype=dtype)
+ self.s = Series(np.ones(N), dtype=dtype)
self.func = getattr(self.s, func)
def time_func(self, func, N, dtype):
diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py
index c8a9a9e6e9176..22a5511e4c678 100644
--- a/asv_bench/benchmarks/sparse.py
+++ b/asv_bench/benchmarks/sparse.py
@@ -22,7 +22,7 @@ class SparseSeriesToFrame:
def setup(self):
K = 50
N = 50001
- rng = date_range("1/1/2000", periods=N, freq="T")
+ rng = date_range("1/1/2000", periods=N, freq="min")
self.series = {}
for i in range(1, K):
data = np.random.randn(N)[:-i]
diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py
index 1652fcf8d48da..89bda81ccf08c 100644
--- a/asv_bench/benchmarks/stat_ops.py
+++ b/asv_bench/benchmarks/stat_ops.py
@@ -20,6 +20,39 @@ def time_op(self, op, dtype, axis):
self.df_func(axis=axis)
+class FrameMixedDtypesOps:
+ params = [ops, [0, 1, None]]
+ param_names = ["op", "axis"]
+
+ def setup(self, op, axis):
+ if op in ("sum", "skew", "kurt", "prod", "sem", "var") or (
+ (op, axis)
+ in (
+ ("mean", 1),
+ ("mean", None),
+ ("median", 1),
+ ("median", None),
+ ("std", 1),
+ )
+ ):
+ # Skipping cases where datetime aggregations are not implemented
+ raise NotImplementedError
+
+ N = 1_000_000
+ df = pd.DataFrame(
+ {
+ "f": np.random.normal(0.0, 1.0, N),
+ "i": np.random.randint(0, N, N),
+ "ts": pd.date_range(start="1/1/2000", periods=N, freq="h"),
+ }
+ )
+
+ self.df_func = getattr(df, op)
+
+ def time_op(self, op, axis):
+ self.df_func(axis=axis)
+
+
class FrameMultiIndexOps:
params = [ops]
param_names = ["op"]
diff --git a/asv_bench/benchmarks/strftime.py b/asv_bench/benchmarks/strftime.py
index 39cc82e1bdf79..47f25b331ab9b 100644
--- a/asv_bench/benchmarks/strftime.py
+++ b/asv_bench/benchmarks/strftime.py
@@ -53,7 +53,7 @@ def time_frame_datetime_formatting_custom(self, nobs):
class PeriodStrftime:
timeout = 1500
- params = ([1000, 10000], ["D", "H"])
+ params = ([1000, 10000], ["D", "h"])
param_names = ["nobs", "freq"]
def setup(self, nobs, freq):
@@ -67,7 +67,7 @@ def setup(self, nobs, freq):
self.data.set_index("i", inplace=True)
if freq == "D":
self.default_fmt = "%Y-%m-%d"
- elif freq == "H":
+ elif freq == "h":
self.default_fmt = "%Y-%m-%d %H:00"
def time_frame_period_to_str(self, nobs, freq):
diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
index d70d9d0aa5227..1f4a104255057 100644
--- a/asv_bench/benchmarks/strings.py
+++ b/asv_bench/benchmarks/strings.py
@@ -6,12 +6,11 @@
NA,
Categorical,
DataFrame,
+ Index,
Series,
)
from pandas.arrays import StringArray
-from .pandas_vb_common import tm
-
class Dtypes:
params = ["str", "string[python]", "string[pyarrow]"]
@@ -19,7 +18,9 @@ class Dtypes:
def setup(self, dtype):
try:
- self.s = Series(tm.makeStringIndex(10**5), dtype=dtype)
+ self.s = Series(
+ Index([f"i-{i}" for i in range(10000)], dtype=object), dtype=dtype
+ )
except ImportError:
raise NotImplementedError
@@ -172,7 +173,7 @@ class Repeat:
def setup(self, repeats):
N = 10**5
- self.s = Series(tm.makeStringIndex(N))
+ self.s = Series(Index([f"i-{i}" for i in range(N)], dtype=object))
repeat = {"int": 1, "array": np.random.randint(1, 3, N)}
self.values = repeat[repeats]
@@ -187,13 +188,20 @@ class Cat:
def setup(self, other_cols, sep, na_rep, na_frac):
N = 10**5
mask_gen = lambda: np.random.choice([True, False], N, p=[1 - na_frac, na_frac])
- self.s = Series(tm.makeStringIndex(N)).where(mask_gen())
+ self.s = Series(Index([f"i-{i}" for i in range(N)], dtype=object)).where(
+ mask_gen()
+ )
if other_cols == 0:
# str.cat self-concatenates only for others=None
self.others = None
else:
self.others = DataFrame(
- {i: tm.makeStringIndex(N).where(mask_gen()) for i in range(other_cols)}
+ {
+ i: Index([f"i-{i}" for i in range(N)], dtype=object).where(
+ mask_gen()
+ )
+ for i in range(other_cols)
+ }
)
def time_cat(self, other_cols, sep, na_rep, na_frac):
@@ -245,7 +253,8 @@ def time_extract_single_group(self, dtype, expand):
class Dummies(Dtypes):
def setup(self, dtype):
super().setup(dtype)
- self.s = self.s.str.join("|")
+ N = len(self.s) // 5
+ self.s = self.s[:N].str.join("|")
def time_get_dummies(self, dtype):
self.s.str.get_dummies("|")
@@ -253,7 +262,7 @@ def time_get_dummies(self, dtype):
class Encode:
def setup(self):
- self.ser = Series(tm.makeStringIndex())
+ self.ser = Series(Index([f"i-{i}" for i in range(10_000)], dtype=object))
def time_encode_decode(self):
self.ser.str.encode("utf-8").str.decode("utf-8")
diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py
index 1253fefde2d5f..8e1deb99a66a4 100644
--- a/asv_bench/benchmarks/timeseries.py
+++ b/asv_bench/benchmarks/timeseries.py
@@ -27,7 +27,7 @@ def setup(self, index_type):
N = 100000
dtidxes = {
"dst": date_range(
- start="10/29/2000 1:00:00", end="10/29/2000 1:59:59", freq="S"
+ start="10/29/2000 1:00:00", end="10/29/2000 1:59:59", freq="s"
),
"repeated": date_range(start="2000", periods=N / 10, freq="s").repeat(10),
"tz_aware": date_range(start="2000", periods=N, freq="s", tz="US/Eastern"),
@@ -72,13 +72,13 @@ class TzLocalize:
def setup(self, tz):
dst_rng = date_range(
- start="10/29/2000 1:00:00", end="10/29/2000 1:59:59", freq="S"
+ start="10/29/2000 1:00:00", end="10/29/2000 1:59:59", freq="s"
)
- self.index = date_range(start="10/29/2000", end="10/29/2000 00:59:59", freq="S")
+ self.index = date_range(start="10/29/2000", end="10/29/2000 00:59:59", freq="s")
self.index = self.index.append(dst_rng)
self.index = self.index.append(dst_rng)
self.index = self.index.append(
- date_range(start="10/29/2000 2:00:00", end="10/29/2000 3:00:00", freq="S")
+ date_range(start="10/29/2000 2:00:00", end="10/29/2000 3:00:00", freq="s")
)
def time_infer_dst(self, tz):
@@ -90,7 +90,7 @@ class ResetIndex:
param_names = "tz"
def setup(self, tz):
- idx = date_range(start="1/1/2000", periods=1000, freq="H", tz=tz)
+ idx = date_range(start="1/1/2000", periods=1000, freq="h", tz=tz)
self.df = DataFrame(np.random.randn(1000, 2), index=idx)
def time_reset_datetimeindex(self, tz):
@@ -116,7 +116,7 @@ def time_infer_freq(self, freq):
class TimeDatetimeConverter:
def setup(self):
N = 100000
- self.rng = date_range(start="1/1/2000", periods=N, freq="T")
+ self.rng = date_range(start="1/1/2000", periods=N, freq="min")
def time_convert(self):
DatetimeConverter.convert(self.rng, None, None)
@@ -129,9 +129,9 @@ class Iteration:
def setup(self, time_index):
N = 10**6
if time_index is timedelta_range:
- self.idx = time_index(start=0, freq="T", periods=N)
+ self.idx = time_index(start=0, freq="min", periods=N)
else:
- self.idx = time_index(start="20140101", freq="T", periods=N)
+ self.idx = time_index(start="20140101", freq="min", periods=N)
self.exit = 10000
def time_iter(self, time_index):
@@ -149,7 +149,7 @@ class ResampleDataFrame:
param_names = ["method"]
def setup(self, method):
- rng = date_range(start="20130101", periods=100000, freq="50L")
+ rng = date_range(start="20130101", periods=100000, freq="50ms")
df = DataFrame(np.random.randn(100000, 2), index=rng)
self.resample = getattr(df.resample("1s"), method)
@@ -163,8 +163,8 @@ class ResampleSeries:
def setup(self, index, freq, method):
indexes = {
- "period": period_range(start="1/1/2000", end="1/1/2001", freq="T"),
- "datetime": date_range(start="1/1/2000", end="1/1/2001", freq="T"),
+ "period": period_range(start="1/1/2000", end="1/1/2001", freq="min"),
+ "datetime": date_range(start="1/1/2000", end="1/1/2001", freq="min"),
}
idx = indexes[index]
ts = Series(np.random.randn(len(idx)), index=idx)
@@ -178,7 +178,7 @@ class ResampleDatetetime64:
# GH 7754
def setup(self):
rng3 = date_range(
- start="2000-01-01 00:00:00", end="2000-01-01 10:00:00", freq="555000U"
+ start="2000-01-01 00:00:00", end="2000-01-01 10:00:00", freq="555000us"
)
self.dt_ts = Series(5, rng3, dtype="datetime64[ns]")
@@ -255,7 +255,7 @@ def time_get_slice(self, monotonic):
class Lookup:
def setup(self):
N = 1500000
- rng = date_range(start="1/1/2000", periods=N, freq="S")
+ rng = date_range(start="1/1/2000", periods=N, freq="s")
self.ts = Series(1, index=rng)
self.lookup_val = rng[N // 2]
@@ -270,7 +270,7 @@ class DatetimeAccessor:
def setup(self, tz):
N = 100000
- self.series = Series(date_range(start="1/1/2000", periods=N, freq="T", tz=tz))
+ self.series = Series(date_range(start="1/1/2000", periods=N, freq="min", tz=tz))
def time_dt_accessor(self, tz):
self.series.dt
diff --git a/asv_bench/benchmarks/tslibs/period.py b/asv_bench/benchmarks/tslibs/period.py
index a92fbbe8d4dbe..af3bfac6d3d01 100644
--- a/asv_bench/benchmarks/tslibs/period.py
+++ b/asv_bench/benchmarks/tslibs/period.py
@@ -72,7 +72,7 @@ def time_now(self, freq):
self.per.now(freq)
def time_asfreq(self, freq):
- self.per.asfreq("A")
+ self.per.asfreq("Y")
def time_str(self, freq):
str(self.per)
@@ -151,7 +151,11 @@ def setup(self, size, freq, tz):
# tzlocal is cumbersomely slow, so skip to keep runtime in check
raise NotImplementedError
- arr = np.arange(10, dtype="i8").repeat(size // 10)
+ # we pick 2**55 because smaller values end up returning
+ # -1 from npy_datetimestruct_to_datetime with NPY_FR_Y frequency
+ # this artificially slows down functions since -1 is also the
+ # error sentinel
+ arr = np.arange(2**55, 2**55 + 10, dtype="i8").repeat(size // 10)
self.i8values = arr
def time_dt64arr_to_periodarr(self, size, freq, tz):
diff --git a/asv_bench/benchmarks/tslibs/timestamp.py b/asv_bench/benchmarks/tslibs/timestamp.py
index d7706a39dfae5..082220ee0dff2 100644
--- a/asv_bench/benchmarks/tslibs/timestamp.py
+++ b/asv_bench/benchmarks/tslibs/timestamp.py
@@ -136,10 +136,10 @@ def time_to_julian_date(self, tz):
self.ts.to_julian_date()
def time_floor(self, tz):
- self.ts.floor("5T")
+ self.ts.floor("5min")
def time_ceil(self, tz):
- self.ts.ceil("5T")
+ self.ts.ceil("5min")
class TimestampAcrossDst:
diff --git a/ci/condarc.yml b/ci/.condarc
similarity index 100%
rename from ci/condarc.yml
rename to ci/.condarc
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index aba42f3733a3f..e41f625e583c0 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -14,6 +14,8 @@
# $ ./ci/code_checks.sh single-docs # check single-page docs build warning-free
# $ ./ci/code_checks.sh notebooks # check execution of documentation notebooks
+set -uo pipefail
+
[[ -z "$1" || "$1" == "code" || "$1" == "doctests" || "$1" == "docstrings" || "$1" == "single-docs" || "$1" == "notebooks" ]] || \
{ echo "Unknown command $1. Usage: $0 [code|doctests|docstrings|single-docs|notebooks]"; exit 9999; }
@@ -63,16 +65,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
MSG='Partially validate docstrings (EX03)' ; echo $MSG
$BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX03 --ignore_functions \
- pandas.Series.loc \
- pandas.Series.iloc \
- pandas.Series.pop \
- pandas.Series.describe \
- pandas.Series.skew \
- pandas.Series.var \
- pandas.Series.last \
- pandas.Series.tz_convert \
- pandas.Series.tz_localize \
- pandas.Series.dt.month_name \
pandas.Series.dt.day_name \
pandas.Series.str.len \
pandas.Series.cat.set_categories \
@@ -186,9 +178,8 @@ fi
### SINGLE-PAGE DOCS ###
if [[ -z "$CHECK" || "$CHECK" == "single-docs" ]]; then
- python doc/make.py --warnings-are-errors --single pandas.Series.value_counts
- python doc/make.py --warnings-are-errors --single pandas.Series.str.split
- python doc/make.py clean
+ python doc/make.py --warnings-are-errors --no-browser --single pandas.Series.value_counts
+ python doc/make.py --warnings-are-errors --no-browser --single pandas.Series.str.split
fi
exit $RET
diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml
index ffa7732c604a0..45f114322015b 100644
--- a/ci/deps/actions-310.yaml
+++ b/ci/deps/actions-310.yaml
@@ -7,15 +7,15 @@ dependencies:
# build dependencies
- versioneer[toml]
- cython>=0.29.33
- - meson[ninja]=1.0.1
+ - meson[ninja]=1.2.1
- meson-python=0.13.1
# test dependencies
- pytest>=7.3.2
- pytest-cov
- pytest-xdist>=2.2.0
- - pytest-asyncio>=0.17.0
- pytest-localserver>=0.7.1
+ - pytest-qt>=4.2.0
- boto3
# required dependencies
@@ -24,40 +24,40 @@ dependencies:
- pytz
# optional dependencies
- - beautifulsoup4>=4.11.1
- - blosc>=1.21.0
- - bottleneck>=1.3.4
- - brotlipy>=0.7.0
- - fastparquet>=0.8.1
- - fsspec>=2022.05.0
+ - beautifulsoup4>=4.11.2
+ - blosc>=1.21.3
+ - bottleneck>=1.3.6
+ - fastparquet>=2022.12.0
+ - fsspec>=2022.11.0
- html5lib>=1.1
- hypothesis>=6.46.1
- - gcsfs>=2022.05.0
+ - gcsfs>=2022.11.0
- jinja2>=3.1.2
- - lxml>=4.8.0
- - matplotlib>=3.6.1
- - numba>=0.55.2
- - numexpr>=2.8.0
+ - lxml>=4.9.2
+ - matplotlib>=3.6.3
+ - numba>=0.56.4
+ - numexpr>=2.8.4
- odfpy>=1.4.1
- - qtpy>=2.2.0
- - openpyxl>=3.0.10
- - pandas-gbq>=0.17.5
- - psycopg2>=2.9.3
- - pyarrow>=7.0.0
+ - qtpy>=2.3.0
+ - openpyxl>=3.1.0
+ - psycopg2>=2.9.6
+ - pyarrow>=10.0.1
- pymysql>=1.0.2
- - pyreadstat>=1.1.5
- - pytables>=3.7.0
- - python-snappy>=0.6.1
- - pyxlsb>=1.0.9
- - s3fs>=2022.05.0
- - scipy>=1.8.1
- - sqlalchemy>=1.4.36
- - tabulate>=0.8.10
- - xarray>=2022.03.0
+ - pyqt>=5.15.9
+ - pyreadstat>=1.2.0
+ - pytables>=3.8.0
+ - python-calamine>=0.1.7
+ - pyxlsb>=1.0.10
+ - s3fs>=2022.11.0
+ - scipy>=1.10.0
+ - sqlalchemy>=2.0.0
+ - tabulate>=0.9.0
+ - xarray>=2022.12.0
- xlrd>=2.0.1
- - xlsxwriter>=3.0.3
- - zstandard>=0.17.0
+ - xlsxwriter>=3.0.5
+ - zstandard>=0.19.0
- pip:
- - pyqt5>=5.15.6
- - tzdata>=2022.1
+ - adbc-driver-postgresql>=0.8.0
+ - adbc-driver-sqlite>=0.8.0
+ - tzdata>=2022.7
diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml
index 5a6a26c2e1ad8..d6bf9ec7843de 100644
--- a/ci/deps/actions-311-downstream_compat.yaml
+++ b/ci/deps/actions-311-downstream_compat.yaml
@@ -8,15 +8,15 @@ dependencies:
# build dependencies
- versioneer[toml]
- cython>=0.29.33
- - meson[ninja]=1.0.1
+ - meson[ninja]=1.2.1
- meson-python=0.13.1
# test dependencies
- pytest>=7.3.2
- pytest-cov
- pytest-xdist>=2.2.0
- - pytest-asyncio>=0.17.0
- pytest-localserver>=0.7.1
+ - pytest-qt>=4.2.0
- boto3
# required dependencies
@@ -25,39 +25,38 @@ dependencies:
- pytz
# optional dependencies
- - beautifulsoup4>=4.11.1
- - blosc>=1.21.0
- - bottleneck>=1.3.4
- - brotlipy>=0.7.0
- - fastparquet>=0.8.1
- - fsspec>=2022.05.0
+ - beautifulsoup4>=4.11.2
+ - blosc>=1.21.3
+ - bottleneck>=1.3.6
+ - fastparquet>=2022.12.0
+ - fsspec>=2022.11.0
- html5lib>=1.1
- hypothesis>=6.46.1
- - gcsfs>=2022.05.0
+ - gcsfs>=2022.11.0
- jinja2>=3.1.2
- - lxml>=4.8.0
- - matplotlib>=3.6.1
- - numba>=0.55.2
- - numexpr>=2.8.0
+ - lxml>=4.9.2
+ - matplotlib>=3.6.3
+ - numba>=0.56.4
+ - numexpr>=2.8.4
- odfpy>=1.4.1
- - qtpy>=2.2.0
- - openpyxl>=3.0.10
- - pandas-gbq>=0.17.5
- - psycopg2>=2.9.3
- - pyarrow>=7.0.0
+ - qtpy>=2.3.0
+ - openpyxl>=3.1.0
+ - psycopg2>=2.9.6
+ - pyarrow>=10.0.1
- pymysql>=1.0.2
- - pyreadstat>=1.1.5
- - pytables>=3.7.0
- - python-snappy>=0.6.1
- - pyxlsb>=1.0.9
- - s3fs>=2022.05.0
- - scipy>=1.8.1
- - sqlalchemy>=1.4.36
- - tabulate>=0.8.10
- - xarray>=2022.03.0
+ - pyqt>=5.15.9
+ - pyreadstat>=1.2.0
+ - pytables>=3.8.0
+ - python-calamine>=0.1.7
+ - pyxlsb>=1.0.10
+ - s3fs>=2022.11.0
+ - scipy>=1.10.0
+ - sqlalchemy>=2.0.0
+ - tabulate>=0.9.0
+ - xarray>=2022.12.0
- xlrd>=2.0.1
- - xlsxwriter>=3.0.3
- - zstandard>=0.17.0
+ - xlsxwriter>=3.0.5
+ - zstandard>=0.19.0
# downstream packages
- botocore
@@ -73,6 +72,7 @@ dependencies:
- pyyaml
- py
- pip:
+ - adbc-driver-postgresql>=0.8.0
+ - adbc-driver-sqlite>=0.8.0
- dataframe-api-compat>=0.1.7
- - pyqt5>=5.15.6
- - tzdata>=2022.1
+ - tzdata>=2022.7
diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml
index 2cd4d5f3528f8..b62e8630f2059 100644
--- a/ci/deps/actions-311-numpydev.yaml
+++ b/ci/deps/actions-311-numpydev.yaml
@@ -6,8 +6,9 @@ dependencies:
# build dependencies
- versioneer[toml]
- - meson[ninja]=1.0.1
+ - meson[ninja]=1.2.1
- meson-python=0.13.1
+ - cython>=0.29.33
# test dependencies
- pytest>=7.3.2
@@ -17,7 +18,6 @@ dependencies:
# causes an InternalError within pytest
- pytest-xdist>=2.2.0, <3
- hypothesis>=6.46.1
- - pytest-asyncio>=0.17.0
# pandas dependencies
- python-dateutil
@@ -25,9 +25,7 @@ dependencies:
- pip
- pip:
- - "cython"
- "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple"
- "--pre"
- "numpy"
- - "scipy"
- - "tzdata>=2022.1"
+ - "tzdata>=2022.7"
diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml
index f24e866af0439..d84063ac2a9ba 100644
--- a/ci/deps/actions-311-pyarrownightly.yaml
+++ b/ci/deps/actions-311-pyarrownightly.yaml
@@ -6,7 +6,7 @@ dependencies:
# build dependencies
- versioneer[toml]
- - meson[ninja]=1.0.1
+ - meson[ninja]=1.2.1
- cython>=0.29.33
- meson-python=0.13.1
@@ -15,7 +15,6 @@ dependencies:
- pytest-cov
- pytest-xdist>=2.2.0
- hypothesis>=6.46.1
- - pytest-asyncio>=0.17.0
# required dependencies
- python-dateutil
@@ -24,7 +23,7 @@ dependencies:
- pip
- pip:
- - "tzdata>=2022.1"
+ - "tzdata>=2022.7"
- "--extra-index-url https://pypi.fury.io/arrow-nightlies/"
- "--prefer-binary"
- "--pre"
diff --git a/ci/deps/actions-311-sanitizers.yaml b/ci/deps/actions-311-sanitizers.yaml
new file mode 100644
index 0000000000000..f5f04c90bffad
--- /dev/null
+++ b/ci/deps/actions-311-sanitizers.yaml
@@ -0,0 +1,32 @@
+name: pandas-dev
+channels:
+ - conda-forge
+dependencies:
+ - python=3.11
+
+ # build dependencies
+ - versioneer[toml]
+ - cython>=0.29.33
+ - meson[ninja]=1.2.1
+ - meson-python=0.13.1
+
+ # test dependencies
+ - pytest>=7.3.2
+ - pytest-cov
+ - pytest-xdist>=2.2.0
+ - pytest-localserver>=0.7.1
+ - pytest-qt>=4.2.0
+ - boto3
+ - hypothesis>=6.46.1
+ - pyqt>=5.15.9
+
+ # required dependencies
+ - python-dateutil
+ - numpy
+ - pytz
+
+ # pandas dependencies
+ - pip
+
+ - pip:
+ - "tzdata>=2022.7"
diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml
index 9d60d734db5b3..d14686696e669 100644
--- a/ci/deps/actions-311.yaml
+++ b/ci/deps/actions-311.yaml
@@ -7,15 +7,15 @@ dependencies:
# build dependencies
- versioneer[toml]
- cython>=0.29.33
- - meson[ninja]=1.0.1
+ - meson[ninja]=1.2.1
- meson-python=0.13.1
# test dependencies
- pytest>=7.3.2
- pytest-cov
- pytest-xdist>=2.2.0
- - pytest-asyncio>=0.17.0
- pytest-localserver>=0.7.1
+ - pytest-qt>=4.2.0
- boto3
# required dependencies
@@ -24,40 +24,40 @@ dependencies:
- pytz
# optional dependencies
- - beautifulsoup4>=4.11.1
- - blosc>=1.21.0
- - bottleneck>=1.3.4
- - brotlipy>=0.7.0
- - fastparquet>=0.8.1
- - fsspec>=2022.05.0
+ - beautifulsoup4>=4.11.2
+ - blosc>=1.21.3
+ - bottleneck>=1.3.6
+ - fastparquet>=2022.12.0
+ - fsspec>=2022.11.0
- html5lib>=1.1
- hypothesis>=6.46.1
- - gcsfs>=2022.05.0
+ - gcsfs>=2022.11.0
- jinja2>=3.1.2
- - lxml>=4.8.0
- - matplotlib>=3.6.1
- - numba>=0.55.2
- - numexpr>=2.8.0
+ - lxml>=4.9.2
+ - matplotlib>=3.6.3
+ - numba>=0.56.4
+ - numexpr>=2.8.4
- odfpy>=1.4.1
- - qtpy>=2.2.0
- - openpyxl>=3.0.10
- - pandas-gbq>=0.17.5
- - psycopg2>=2.9.3
- - pyarrow>=7.0.0
+ - qtpy>=2.3.0
+ - pyqt>=5.15.9
+ - openpyxl>=3.1.0
+ - psycopg2>=2.9.6
+ - pyarrow>=10.0.1
- pymysql>=1.0.2
- - pyreadstat>=1.1.5
- # - pytables>=3.7.0, 3.8.0 is first version that supports 3.11
- - python-snappy>=0.6.1
- - pyxlsb>=1.0.9
- - s3fs>=2022.05.0
- - scipy>=1.8.1
- - sqlalchemy>=1.4.36
- - tabulate>=0.8.10
- - xarray>=2022.03.0
+ - pyreadstat>=1.2.0
+ - pytables>=3.8.0
+ - python-calamine>=0.1.7
+ - pyxlsb>=1.0.10
+ - s3fs>=2022.11.0
+ - scipy>=1.10.0
+ - sqlalchemy>=2.0.0
+ - tabulate>=0.9.0
+ - xarray>=2022.12.0
- xlrd>=2.0.1
- - xlsxwriter>=3.0.3
- - zstandard>=0.17.0
+ - xlsxwriter>=3.0.5
+ - zstandard>=0.19.0
- pip:
- - pyqt5>=5.15.6
- - tzdata>=2022.1
+ - adbc-driver-postgresql>=0.8.0
+ - adbc-driver-sqlite>=0.8.0
+ - tzdata>=2022.7
diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml
new file mode 100644
index 0000000000000..86aaf24b4e15c
--- /dev/null
+++ b/ci/deps/actions-312.yaml
@@ -0,0 +1,63 @@
+name: pandas-dev-312
+channels:
+ - conda-forge
+dependencies:
+ - python=3.12
+
+ # build dependencies
+ - versioneer[toml]
+ - cython>=0.29.33
+ - meson[ninja]=1.2.1
+ - meson-python=0.13.1
+
+ # test dependencies
+ - pytest>=7.3.2
+ - pytest-cov
+ - pytest-xdist>=2.2.0
+ - pytest-localserver>=0.7.1
+ - pytest-qt>=4.2.0
+ - boto3
+
+ # required dependencies
+ - python-dateutil
+ - numpy
+ - pytz
+
+ # optional dependencies
+ - beautifulsoup4>=4.11.2
+ - blosc>=1.21.3
+ - bottleneck>=1.3.6
+ - fastparquet>=2022.12.0
+ - fsspec>=2022.11.0
+ - html5lib>=1.1
+ - hypothesis>=6.46.1
+ - gcsfs>=2022.11.0
+ - jinja2>=3.1.2
+ - lxml>=4.9.2
+ - matplotlib>=3.6.3
+ # - numba>=0.56.4
+ - numexpr>=2.8.4
+ - odfpy>=1.4.1
+ - qtpy>=2.3.0
+ - pyqt>=5.15.9
+ - openpyxl>=3.1.0
+ - psycopg2>=2.9.6
+ - pyarrow>=10.0.1
+ - pymysql>=1.0.2
+ - pyreadstat>=1.2.0
+ # - pytables>=3.8.0
+ - python-calamine>=0.1.7
+ - pyxlsb>=1.0.10
+ - s3fs>=2022.11.0
+ - scipy>=1.10.0
+ - sqlalchemy>=2.0.0
+ - tabulate>=0.9.0
+ - xarray>=2022.12.0
+ - xlrd>=2.0.1
+ - xlsxwriter>=3.0.5
+ - zstandard>=0.19.0
+
+ - pip:
+ - adbc-driver-postgresql>=0.8.0
+ - adbc-driver-sqlite>=0.8.0
+ - tzdata>=2022.7
diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml
index 0e2fcf87c2d6e..7067048c4434d 100644
--- a/ci/deps/actions-39-minimum_versions.yaml
+++ b/ci/deps/actions-39-minimum_versions.yaml
@@ -9,15 +9,15 @@ dependencies:
# build dependencies
- versioneer[toml]
- cython>=0.29.33
- - meson[ninja]=1.0.1
+ - meson[ninja]=1.2.1
- meson-python=0.13.1
# test dependencies
- pytest>=7.3.2
- pytest-cov
- pytest-xdist>=2.2.0
- - pytest-asyncio>=0.17.0
- pytest-localserver>=0.7.1
+ - pytest-qt>=4.2.0
- boto3
# required dependencies
@@ -26,41 +26,41 @@ dependencies:
- pytz=2020.1
# optional dependencies
- - beautifulsoup4=4.11.1
- - blosc=1.21.0
- - bottleneck=1.3.4
- - brotlipy=0.7.0
- - fastparquet=0.8.1
- - fsspec=2022.05.0
+ - beautifulsoup4=4.11.2
+ - blosc=1.21.3
+ - bottleneck=1.3.6
+ - fastparquet=2022.12.0
+ - fsspec=2022.11.0
- html5lib=1.1
- hypothesis=6.46.1
- - gcsfs=2022.05.0
+ - gcsfs=2022.11.0
- jinja2=3.1.2
- - lxml=4.8.0
- - matplotlib=3.6.1
- - numba=0.55.2
- - numexpr=2.8.0
+ - lxml=4.9.2
+ - matplotlib=3.6.3
+ - numba=0.56.4
+ - numexpr=2.8.4
- odfpy=1.4.1
- - qtpy=2.2.0
- - openpyxl=3.0.10
- - pandas-gbq=0.17.5
- - psycopg2=2.9.3
- - pyarrow=7.0.0
+ - qtpy=2.3.0
+ - openpyxl=3.1.0
+ - psycopg2=2.9.6
+ - pyarrow=10.0.1
- pymysql=1.0.2
- - pyreadstat=1.1.5
- - pytables=3.7.0
- - python-snappy=0.6.1
- - pyxlsb=1.0.9
- - s3fs=2022.05.0
- - scipy=1.8.1
- - sqlalchemy=1.4.36
- - tabulate=0.8.10
- - xarray=2022.03.0
+ - pyqt=5.15.9
+ - pyreadstat=1.2.0
+ - pytables=3.8.0
+ - python-calamine=0.1.7
+ - pyxlsb=1.0.10
+ - s3fs=2022.11.0
+ - scipy=1.10.0
+ - sqlalchemy=2.0.0
+ - tabulate=0.9.0
+ - xarray=2022.12.0
- xlrd=2.0.1
- - xlsxwriter=3.0.3
- - zstandard=0.17.0
+ - xlsxwriter=3.0.5
+ - zstandard=0.19.0
- pip:
+ - adbc-driver-postgresql==0.8.0
+ - adbc-driver-sqlite==0.8.0
- dataframe-api-compat==0.1.7
- - pyqt5==5.15.6
- - tzdata==2022.1
+ - tzdata==2022.7
diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml
index 6ea0d41b947dc..31ee74174cd46 100644
--- a/ci/deps/actions-39.yaml
+++ b/ci/deps/actions-39.yaml
@@ -7,15 +7,15 @@ dependencies:
# build dependencies
- versioneer[toml]
- cython>=0.29.33
- - meson[ninja]=1.0.1
+ - meson[ninja]=1.2.1
- meson-python=0.13.1
# test dependencies
- pytest>=7.3.2
- pytest-cov
- pytest-xdist>=2.2.0
- - pytest-asyncio>=0.17.0
- pytest-localserver>=0.7.1
+ - pytest-qt>=4.2.0
- boto3
# required dependencies
@@ -24,40 +24,40 @@ dependencies:
- pytz
# optional dependencies
- - beautifulsoup4>=4.11.1
- - blosc>=1.21.0
- - bottleneck>=1.3.4
- - brotlipy>=0.7.0
- - fastparquet>=0.8.1
- - fsspec>=2022.05.0
+ - beautifulsoup4>=4.11.2
+ - blosc>=1.21.3
+ - bottleneck>=1.3.6
+ - fastparquet>=2022.12.0
+ - fsspec>=2022.11.0
- html5lib>=1.1
- hypothesis>=6.46.1
- - gcsfs>=2022.05.0
+ - gcsfs>=2022.11.0
- jinja2>=3.1.2
- - lxml>=4.8.0
- - matplotlib>=3.6.1
- - numba>=0.55.2
- - numexpr>=2.8.0
+ - lxml>=4.9.2
+ - matplotlib>=3.6.3
+ - numba>=0.56.4
+ - numexpr>=2.8.4
- odfpy>=1.4.1
- - qtpy>=2.2.0
- - openpyxl>=3.0.10
- - pandas-gbq>=0.17.5
- - psycopg2>=2.9.3
- - pyarrow>=7.0.0
+ - qtpy>=2.3.0
+ - openpyxl>=3.1.0
+ - psycopg2>=2.9.6
+ - pyarrow>=10.0.1
- pymysql>=1.0.2
- - pyreadstat>=1.1.5
- - pytables>=3.7.0
- - python-snappy>=0.6.1
- - pyxlsb>=1.0.9
- - s3fs>=2022.05.0
- - scipy>=1.8.1
- - sqlalchemy>=1.4.36
- - tabulate>=0.8.10
- - xarray>=2022.03.0
+ - pyqt>=5.15.9
+ - pyreadstat>=1.2.0
+ - pytables>=3.8.0
+ - python-calamine>=0.1.7
+ - pyxlsb>=1.0.10
+ - s3fs>=2022.11.0
+ - scipy>=1.10.0
+ - sqlalchemy>=2.0.0
+ - tabulate>=0.9.0
+ - xarray>=2022.12.0
- xlrd>=2.0.1
- - xlsxwriter>=3.0.3
- - zstandard>=0.17.0
+ - xlsxwriter>=3.0.5
+ - zstandard>=0.19.0
- pip:
- - pyqt5>=5.15.6
- - tzdata>=2022.1
+ - adbc-driver-postgresql>=0.8.0
+ - adbc-driver-sqlite>=0.8.0
+ - tzdata>=2022.7
diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml
index 035395d55eb3a..d9c8dd81b7c33 100644
--- a/ci/deps/actions-pypy-39.yaml
+++ b/ci/deps/actions-pypy-39.yaml
@@ -10,13 +10,12 @@ dependencies:
# build dependencies
- versioneer[toml]
- cython>=0.29.33
- - meson[ninja]=1.0.1
+ - meson[ninja]=1.2.1
- meson-python=0.13.1
# test dependencies
- pytest>=7.3.2
- pytest-cov
- - pytest-asyncio>=0.17.0
- pytest-xdist>=2.2.0
- hypothesis>=6.46.1
@@ -25,4 +24,4 @@ dependencies:
- python-dateutil
- pytz
- pip:
- - tzdata>=2022.1
+ - tzdata>=2022.7
diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml
index df4e8e285bd02..a19ffd485262d 100644
--- a/ci/deps/circle-310-arm64.yaml
+++ b/ci/deps/circle-310-arm64.yaml
@@ -7,15 +7,15 @@ dependencies:
# build dependencies
- versioneer[toml]
- cython>=0.29.33
- - meson[ninja]=1.0.1
+ - meson[ninja]=1.2.1
- meson-python=0.13.1
# test dependencies
- pytest>=7.3.2
- pytest-cov
- pytest-xdist>=2.2.0
- - pytest-asyncio>=0.17.0
- pytest-localserver>=0.7.1
+ - pytest-qt>=4.2.0
- boto3
# required dependencies
@@ -24,37 +24,38 @@ dependencies:
- pytz
# optional dependencies
- - beautifulsoup4>=4.11.1
- - blosc>=1.21.0
- - bottleneck>=1.3.4
- - brotlipy>=0.7.0
- - fastparquet>=0.8.1
- - fsspec>=2022.05.0
+ - beautifulsoup4>=4.11.2
+ - blosc>=1.21.3
+ - bottleneck>=1.3.6
+ - fastparquet>=2022.12.0
+ - fsspec>=2022.11.0
- html5lib>=1.1
- hypothesis>=6.46.1
- - gcsfs>=2022.05.0
+ - gcsfs>=2022.11.0
- jinja2>=3.1.2
- - lxml>=4.8.0
- - matplotlib>=3.6.1
- # test_numba_vs_cython segfaults with numba 0.57
- - numba>=0.55.2, <0.57.0
- - numexpr>=2.8.0
+ - lxml>=4.9.2
+ - matplotlib>=3.6.3
+ - numba>=0.56.4
+ - numexpr>=2.8.4
- odfpy>=1.4.1
- - qtpy>=2.2.0
- - openpyxl>=3.0.10
- - pandas-gbq>=0.17.5
- - psycopg2>=2.9.3
- - pyarrow>=7.0.0
+ - qtpy>=2.3.0
+ - openpyxl>=3.1.0
+ - psycopg2>=2.9.6
+ - pyarrow>=10.0.1
- pymysql>=1.0.2
- # - pyreadstat>=1.1.5 not available on ARM
- - pytables>=3.7.0
- - python-snappy>=0.6.1
- - pyxlsb>=1.0.9
- - s3fs>=2022.05.0
- - scipy>=1.8.1
- - sqlalchemy>=1.4.36
- - tabulate>=0.8.10
- - xarray>=2022.03.0
+ - pyqt>=5.15.9
+ - pyreadstat>=1.2.0
+ - pytables>=3.8.0
+ - python-calamine>=0.1.7
+ - pyxlsb>=1.0.10
+ - s3fs>=2022.11.0
+ - scipy>=1.10.0
+ - sqlalchemy>=2.0.0
+ - tabulate>=0.9.0
+ - xarray>=2022.12.0
- xlrd>=2.0.1
- - xlsxwriter>=3.0.3
- - zstandard>=0.17.0
+ - xlsxwriter>=3.0.5
+ - zstandard>=0.19.0
+ - pip:
+ - adbc-driver-postgresql>=0.8.0
+ - adbc-driver-sqlite>=0.8.0
diff --git a/ci/meta.yaml b/ci/meta.yaml
index 77f536a18a204..aac5593e493b7 100644
--- a/ci/meta.yaml
+++ b/ci/meta.yaml
@@ -38,7 +38,7 @@ requirements:
- numpy >=1.23.2 # [py>=311]
- python-dateutil >=2.8.2
- pytz >=2020.1
- - python-tzdata >=2022.1
+ - python-tzdata >=2022.7
test:
imports:
@@ -64,7 +64,6 @@ test:
requires:
- pip
- pytest >=7.3.2
- - pytest-asyncio >=0.17.0
- pytest-xdist >=2.2.0
- pytest-cov
- hypothesis >=6.46.1
diff --git a/doc/_templates/autosummary/class.rst b/doc/_templates/autosummary/class.rst
index a9c9bd2b6507f..79c2e37b0192f 100644
--- a/doc/_templates/autosummary/class.rst
+++ b/doc/_templates/autosummary/class.rst
@@ -1,33 +1,32 @@
-{% extends "!autosummary/class.rst" %}
+{{ fullname | escape | underline}}
-{% block methods %}
-{% if methods %}
+.. currentmodule:: {{ module }}
-..
- HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages.
- .. autosummary::
- :toctree:
- {% for item in all_methods %}
- {%- if not item.startswith('_') or item in ['__call__'] %}
- {{ name }}.{{ item }}
- {%- endif -%}
- {%- endfor %}
+.. autoclass:: {{ objname }}
-{% endif %}
-{% endblock %}
+ {% block methods %}
-{% block attributes %}
-{% if attributes %}
+ {% block attributes %}
+ {% if attributes %}
+ .. rubric:: {{ _('Attributes') }}
-..
- HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages.
.. autosummary::
- :toctree:
- {% for item in all_attributes %}
- {%- if not item.startswith('_') %}
- {{ name }}.{{ item }}
- {%- endif -%}
- {%- endfor %}
+ {% for item in attributes %}
+ {% if item in members and not item.startswith('_') %}
+ ~{{ name }}.{{ item }}
+ {% endif %}
+ {%- endfor %}
+ {% endif %}
+ {% endblock %}
+
+ {% if methods %}
+ .. rubric:: {{ _('Methods') }}
-{% endif %}
-{% endblock %}
+ .. autosummary::
+ {% for item in methods %}
+ {% if item in members and (not item.startswith('_') or item in ['__call__']) %}
+ ~{{ name }}.{{ item }}
+ {% endif %}
+ {%- endfor %}
+ {% endif %}
+ {% endblock %}
diff --git a/doc/cheatsheet/README.md b/doc/cheatsheet/README.md
new file mode 100644
index 0000000000000..6c33de104ed90
--- /dev/null
+++ b/doc/cheatsheet/README.md
@@ -0,0 +1,22 @@
+# Pandas Cheat Sheet
+
+The Pandas Cheat Sheet was created using Microsoft PowerPoint 2013.
+To create the PDF version, within PowerPoint, simply do a "Save As"
+and pick "PDF" as the format.
+
+This cheat sheet, originally written by Irv Lustig, [Princeton Consultants](https://www.princetonoptimization.com/), was inspired by the [RStudio Data Wrangling Cheatsheet](https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf).
+
+| Topic                  | PDF | PPT |
+|------------------------|-----|-----|
+| Pandas_Cheat_Sheet     |     |     |
+| Pandas_Cheat_Sheet_JA  |     |     |
+
+
+**Alternative**
+
+Alternatively, if you want to complement your learning, you can use the Pandas Cheat Sheets
+developed by [DataCamp](https://www.datacamp.com/) in "PDF", "Google Colab" and "Streamlit" formats.
+
+| Topic       | PDF | Streamlit | Google Colab |
+|-------------|-----|-----------|--------------|
+| Pandas      |     |           |              |
diff --git a/doc/cheatsheet/README.txt b/doc/cheatsheet/README.txt
deleted file mode 100644
index c57da38b31777..0000000000000
--- a/doc/cheatsheet/README.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-The Pandas Cheat Sheet was created using Microsoft Powerpoint 2013.
-To create the PDF version, within Powerpoint, simply do a "Save As"
-and pick "PDF" as the format.
-
-This cheat sheet was inspired by the RStudio Data Wrangling Cheatsheet[1], written by Irv Lustig, Princeton Consultants[2].
-
-[1]: https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf
-[2]: https://www.princetonoptimization.com/
diff --git a/doc/make.py b/doc/make.py
index 937b2638fb098..2583242786fc8 100755
--- a/doc/make.py
+++ b/doc/make.py
@@ -45,12 +45,14 @@ def __init__(
single_doc=None,
verbosity=0,
warnings_are_errors=False,
+ no_browser=False,
) -> None:
self.num_jobs = num_jobs
self.include_api = include_api
self.whatsnew = whatsnew
self.verbosity = verbosity
self.warnings_are_errors = warnings_are_errors
+ self.no_browser = no_browser
if single_doc:
single_doc = self._process_single_doc(single_doc)
@@ -100,7 +102,7 @@ def _process_single_doc(self, single_doc):
)
@staticmethod
- def _run_os(*args):
+ def _run_os(*args) -> None:
"""
Execute a command as a OS terminal.
@@ -123,14 +125,14 @@ def _sphinx_build(self, kind: str):
Parameters
----------
- kind : {'html', 'latex'}
+ kind : {'html', 'latex', 'linkcheck'}
Examples
--------
>>> DocBuilder(num_jobs=4)._sphinx_build('html')
"""
- if kind not in ("html", "latex"):
- raise ValueError(f"kind must be html or latex, not {kind}")
+ if kind not in ("html", "latex", "linkcheck"):
+ raise ValueError(f"kind must be html, latex or linkcheck, not {kind}")
cmd = ["sphinx-build", "-b", kind]
if self.num_jobs:
@@ -147,7 +149,7 @@ def _sphinx_build(self, kind: str):
]
return subprocess.call(cmd)
- def _open_browser(self, single_doc_html):
+ def _open_browser(self, single_doc_html) -> None:
"""
Open a browser tab showing single
"""
@@ -159,10 +161,10 @@ def _get_page_title(self, page):
Open the rst file `page` and extract its title.
"""
fname = os.path.join(SOURCE_PATH, f"{page}.rst")
- option_parser = docutils.frontend.OptionParser(
- components=(docutils.parsers.rst.Parser,)
+ doc = docutils.utils.new_document(
+ "",
+ docutils.frontend.get_default_settings(docutils.parsers.rst.Parser),
)
- doc = docutils.utils.new_document("", option_parser.get_default_values())
with open(fname, encoding="utf-8") as f:
data = f.read()
@@ -181,7 +183,7 @@ def _get_page_title(self, page):
return title.astext()
- def _add_redirects(self):
+ def _add_redirects(self) -> None:
"""
Create in the build directory an html file with a redirect,
for every row in REDIRECTS_FILE.
@@ -235,10 +237,11 @@ def html(self):
if ret_code == 0:
if self.single_doc_html is not None:
- self._open_browser(self.single_doc_html)
+ if not self.no_browser:
+ self._open_browser(self.single_doc_html)
else:
self._add_redirects()
- if self.whatsnew:
+ if self.whatsnew and not self.no_browser:
self._open_browser(os.path.join("whatsnew", "index.html"))
return ret_code
@@ -269,14 +272,14 @@ def latex_forced(self):
return self.latex(force=True)
@staticmethod
- def clean():
+ def clean() -> None:
"""
Clean documentation generated files.
"""
shutil.rmtree(BUILD_PATH, ignore_errors=True)
shutil.rmtree(os.path.join(SOURCE_PATH, "reference", "api"), ignore_errors=True)
- def zip_html(self):
+ def zip_html(self) -> None:
"""
Compress HTML documentation into a zip file.
"""
@@ -288,6 +291,12 @@ def zip_html(self):
os.chdir(dirname)
self._run_os("zip", zip_fname, "-r", "-q", *fnames)
+ def linkcheck(self):
+ """
+ Check for broken links in the documentation.
+ """
+ return self._sphinx_build("linkcheck")
+
def main():
cmds = [method for method in dir(DocBuilder) if not method.startswith("_")]
@@ -343,6 +352,12 @@ def main():
action="store_true",
help="fail if warnings are raised",
)
+ argparser.add_argument(
+ "--no-browser",
+ help="Don't open browser",
+ default=False,
+ action="store_true",
+ )
args = argparser.parse_args()
if args.command not in cmds:
@@ -368,6 +383,7 @@ def main():
args.single,
args.verbosity,
args.warnings_are_errors,
+ args.no_browser,
)
return getattr(builder, args.command)()
diff --git a/doc/redirects.csv b/doc/redirects.csv
index 97cd20b295e65..27b41da63c513 100644
--- a/doc/redirects.csv
+++ b/doc/redirects.csv
@@ -100,7 +100,6 @@ generated/pandas.api.extensions.register_series_accessor,../reference/api/pandas
generated/pandas.api.types.infer_dtype,../reference/api/pandas.api.types.infer_dtype
generated/pandas.api.types.is_bool_dtype,../reference/api/pandas.api.types.is_bool_dtype
generated/pandas.api.types.is_bool,../reference/api/pandas.api.types.is_bool
-generated/pandas.api.types.is_categorical_dtype,../reference/api/pandas.api.types.is_categorical_dtype
generated/pandas.api.types.is_complex_dtype,../reference/api/pandas.api.types.is_complex_dtype
generated/pandas.api.types.is_complex,../reference/api/pandas.api.types.is_complex
generated/pandas.api.types.is_datetime64_any_dtype,../reference/api/pandas.api.types.is_datetime64_any_dtype
@@ -127,7 +126,6 @@ generated/pandas.api.types.is_number,../reference/api/pandas.api.types.is_number
generated/pandas.api.types.is_numeric_dtype,../reference/api/pandas.api.types.is_numeric_dtype
generated/pandas.api.types.is_object_dtype,../reference/api/pandas.api.types.is_object_dtype
generated/pandas.api.types.is_period_dtype,../reference/api/pandas.api.types.is_period_dtype
-generated/pandas.api.types.is_period,../reference/api/pandas.api.types.is_period
generated/pandas.api.types.is_re_compilable,../reference/api/pandas.api.types.is_re_compilable
generated/pandas.api.types.is_re,../reference/api/pandas.api.types.is_re
generated/pandas.api.types.is_scalar,../reference/api/pandas.api.types.is_scalar
@@ -194,40 +192,39 @@ generated/pandas.core.groupby.DataFrameGroupBy.shift,../reference/api/pandas.cor
generated/pandas.core.groupby.DataFrameGroupBy.size,../reference/api/pandas.core.groupby.DataFrameGroupBy.size
generated/pandas.core.groupby.DataFrameGroupBy.skew,../reference/api/pandas.core.groupby.DataFrameGroupBy.skew
generated/pandas.core.groupby.DataFrameGroupBy.take,../reference/api/pandas.core.groupby.DataFrameGroupBy.take
-generated/pandas.core.groupby.GroupBy.agg,../reference/api/pandas.core.groupby.GroupBy.agg
-generated/pandas.core.groupby.GroupBy.aggregate,../reference/api/pandas.core.groupby.GroupBy.aggregate
-generated/pandas.core.groupby.GroupBy.all,../reference/api/pandas.core.groupby.GroupBy.all
-generated/pandas.core.groupby.GroupBy.any,../reference/api/pandas.core.groupby.GroupBy.any
-generated/pandas.core.groupby.GroupBy.apply,../reference/api/pandas.core.groupby.GroupBy.apply
-generated/pandas.core.groupby.GroupBy.bfill,../reference/api/pandas.core.groupby.GroupBy.bfill
-generated/pandas.core.groupby.GroupBy.count,../reference/api/pandas.core.groupby.GroupBy.count
-generated/pandas.core.groupby.GroupBy.cumcount,../reference/api/pandas.core.groupby.GroupBy.cumcount
-generated/pandas.core.groupby.GroupBy.ffill,../reference/api/pandas.core.groupby.GroupBy.ffill
-generated/pandas.core.groupby.GroupBy.first,../reference/api/pandas.core.groupby.GroupBy.first
-generated/pandas.core.groupby.GroupBy.get_group,../reference/api/pandas.core.groupby.GroupBy.get_group
-generated/pandas.core.groupby.GroupBy.groups,../reference/api/pandas.core.groupby.GroupBy.groups
-generated/pandas.core.groupby.GroupBy.head,../reference/api/pandas.core.groupby.GroupBy.head
-generated/pandas.core.groupby.GroupBy.indices,../reference/api/pandas.core.groupby.GroupBy.indices
-generated/pandas.core.groupby.GroupBy.__iter__,../reference/api/pandas.core.groupby.GroupBy.__iter__
-generated/pandas.core.groupby.GroupBy.last,../reference/api/pandas.core.groupby.GroupBy.last
-generated/pandas.core.groupby.GroupBy.max,../reference/api/pandas.core.groupby.GroupBy.max
-generated/pandas.core.groupby.GroupBy.mean,../reference/api/pandas.core.groupby.GroupBy.mean
-generated/pandas.core.groupby.GroupBy.median,../reference/api/pandas.core.groupby.GroupBy.median
-generated/pandas.core.groupby.GroupBy.min,../reference/api/pandas.core.groupby.GroupBy.min
-generated/pandas.core.groupby.GroupBy.ngroup,../reference/api/pandas.core.groupby.GroupBy.ngroup
-generated/pandas.core.groupby.GroupBy.nth,../reference/api/pandas.core.groupby.GroupBy.nth
-generated/pandas.core.groupby.GroupBy.ohlc,../reference/api/pandas.core.groupby.GroupBy.ohlc
-generated/pandas.core.groupby.GroupBy.pct_change,../reference/api/pandas.core.groupby.GroupBy.pct_change
-generated/pandas.core.groupby.GroupBy.pipe,../reference/api/pandas.core.groupby.GroupBy.pipe
-generated/pandas.core.groupby.GroupBy.prod,../reference/api/pandas.core.groupby.GroupBy.prod
-generated/pandas.core.groupby.GroupBy.rank,../reference/api/pandas.core.groupby.GroupBy.rank
-generated/pandas.core.groupby.GroupBy.sem,../reference/api/pandas.core.groupby.GroupBy.sem
-generated/pandas.core.groupby.GroupBy.size,../reference/api/pandas.core.groupby.GroupBy.size
-generated/pandas.core.groupby.GroupBy.std,../reference/api/pandas.core.groupby.GroupBy.std
-generated/pandas.core.groupby.GroupBy.sum,../reference/api/pandas.core.groupby.GroupBy.sum
-generated/pandas.core.groupby.GroupBy.tail,../reference/api/pandas.core.groupby.GroupBy.tail
-generated/pandas.core.groupby.GroupBy.transform,../reference/api/pandas.core.groupby.GroupBy.transform
-generated/pandas.core.groupby.GroupBy.var,../reference/api/pandas.core.groupby.GroupBy.var
+generated/pandas.core.groupby.GroupBy.agg,../reference/api/pandas.core.groupby.DataFrameGroupBy.agg
+generated/pandas.core.groupby.GroupBy.aggregate,../reference/api/pandas.core.groupby.DataFrameGroupBy.aggregate
+generated/pandas.core.groupby.GroupBy.all,../reference/api/pandas.core.groupby.DataFrameGroupBy.all
+generated/pandas.core.groupby.GroupBy.any,../reference/api/pandas.core.groupby.DataFrameGroupBy.any
+generated/pandas.core.groupby.GroupBy.apply,../reference/api/pandas.core.groupby.DataFrameGroupBy.apply
+generated/pandas.core.groupby.GroupBy.bfill,../reference/api/pandas.core.groupby.DataFrameGroupBy.bfill
+generated/pandas.core.groupby.GroupBy.count,../reference/api/pandas.core.groupby.DataFrameGroupBy.count
+generated/pandas.core.groupby.GroupBy.cumcount,../reference/api/pandas.core.groupby.DataFrameGroupBy.cumcount
+generated/pandas.core.groupby.GroupBy.ffill,../reference/api/pandas.core.groupby.DataFrameGroupBy.ffill
+generated/pandas.core.groupby.GroupBy.first,../reference/api/pandas.core.groupby.DataFrameGroupBy.first
+generated/pandas.core.groupby.GroupBy.get_group,../reference/api/pandas.core.groupby.DataFrameGroupBy.get_group
+generated/pandas.core.groupby.GroupBy.groups,../reference/api/pandas.core.groupby.DataFrameGroupBy.groups
+generated/pandas.core.groupby.GroupBy.head,../reference/api/pandas.core.groupby.DataFrameGroupBy.head
+generated/pandas.core.groupby.GroupBy.indices,../reference/api/pandas.core.groupby.DataFrameGroupBy.indices
+generated/pandas.core.groupby.GroupBy.__iter__,../reference/api/pandas.core.groupby.DataFrameGroupBy.__iter__
+generated/pandas.core.groupby.GroupBy.last,../reference/api/pandas.core.groupby.DataFrameGroupBy.last
+generated/pandas.core.groupby.GroupBy.max,../reference/api/pandas.core.groupby.DataFrameGroupBy.max
+generated/pandas.core.groupby.GroupBy.mean,../reference/api/pandas.core.groupby.DataFrameGroupBy.mean
+generated/pandas.core.groupby.GroupBy.median,../reference/api/pandas.core.groupby.DataFrameGroupBy.median
+generated/pandas.core.groupby.GroupBy.min,../reference/api/pandas.core.groupby.DataFrameGroupBy.min
+generated/pandas.core.groupby.GroupBy.ngroup,../reference/api/pandas.core.groupby.DataFrameGroupBy.ngroup
+generated/pandas.core.groupby.GroupBy.nth,../reference/api/pandas.core.groupby.DataFrameGroupBy.nth
+generated/pandas.core.groupby.GroupBy.ohlc,../reference/api/pandas.core.groupby.DataFrameGroupBy.ohlc
+generated/pandas.core.groupby.GroupBy.pct_change,../reference/api/pandas.core.groupby.DataFrameGroupBy.pct_change
+generated/pandas.core.groupby.GroupBy.pipe,../reference/api/pandas.core.groupby.DataFrameGroupBy.pipe
+generated/pandas.core.groupby.GroupBy.prod,../reference/api/pandas.core.groupby.DataFrameGroupBy.prod
+generated/pandas.core.groupby.GroupBy.rank,../reference/api/pandas.core.groupby.DataFrameGroupBy.rank
+generated/pandas.core.groupby.GroupBy.size,../reference/api/pandas.core.groupby.DataFrameGroupBy.size
+generated/pandas.core.groupby.GroupBy.std,../reference/api/pandas.core.groupby.DataFrameGroupBy.std
+generated/pandas.core.groupby.GroupBy.sum,../reference/api/pandas.core.groupby.DataFrameGroupBy.sum
+generated/pandas.core.groupby.GroupBy.tail,../reference/api/pandas.core.groupby.DataFrameGroupBy.tail
+generated/pandas.core.groupby.GroupBy.transform,../reference/api/pandas.core.groupby.DataFrameGroupBy.transform
+generated/pandas.core.groupby.GroupBy.var,../reference/api/pandas.core.groupby.DataFrameGroupBy.var
generated/pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing,../reference/api/pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing
generated/pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing,../reference/api/pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing
generated/pandas.core.groupby.SeriesGroupBy.nlargest,../reference/api/pandas.core.groupby.SeriesGroupBy.nlargest
@@ -238,7 +235,7 @@ generated/pandas.core.groupby.SeriesGroupBy.value_counts,../reference/api/pandas
generated/pandas.core.resample.Resampler.aggregate,../reference/api/pandas.core.resample.Resampler.aggregate
generated/pandas.core.resample.Resampler.apply,../reference/api/pandas.core.resample.Resampler.apply
generated/pandas.core.resample.Resampler.asfreq,../reference/api/pandas.core.resample.Resampler.asfreq
-generated/pandas.core.resample.Resampler.backfill,../reference/api/pandas.core.resample.Resampler.backfill
+generated/pandas.core.resample.Resampler.backfill,../reference/api/pandas.core.resample.Resampler.bfill
generated/pandas.core.resample.Resampler.bfill,../reference/api/pandas.core.resample.Resampler.bfill
generated/pandas.core.resample.Resampler.count,../reference/api/pandas.core.resample.Resampler.count
generated/pandas.core.resample.Resampler.ffill,../reference/api/pandas.core.resample.Resampler.ffill
@@ -257,7 +254,6 @@ generated/pandas.core.resample.Resampler.min,../reference/api/pandas.core.resamp
generated/pandas.core.resample.Resampler.nearest,../reference/api/pandas.core.resample.Resampler.nearest
generated/pandas.core.resample.Resampler.nunique,../reference/api/pandas.core.resample.Resampler.nunique
generated/pandas.core.resample.Resampler.ohlc,../reference/api/pandas.core.resample.Resampler.ohlc
-generated/pandas.core.resample.Resampler.pad,../reference/api/pandas.core.resample.Resampler.pad
generated/pandas.core.resample.Resampler.pipe,../reference/api/pandas.core.resample.Resampler.pipe
generated/pandas.core.resample.Resampler.prod,../reference/api/pandas.core.resample.Resampler.prod
generated/pandas.core.resample.Resampler.quantile,../reference/api/pandas.core.resample.Resampler.quantile
@@ -709,7 +705,7 @@ generated/pandas.Index.summary,../reference/api/pandas.Index.summary
generated/pandas.Index.symmetric_difference,../reference/api/pandas.Index.symmetric_difference
generated/pandas.Index.take,../reference/api/pandas.Index.take
generated/pandas.Index.T,../reference/api/pandas.Index.T
-generated/pandas.Index.to_flat_index,../reference/api/pandas.Index.to_flat_index
+generated/pandas.Index.to_flat_index,../reference/api/pandas.MultiIndex.to_flat_index
generated/pandas.Index.to_frame,../reference/api/pandas.Index.to_frame
generated/pandas.Index.to_list,../reference/api/pandas.Index.to_list
generated/pandas.Index.tolist,../reference/api/pandas.Index.tolist
@@ -754,7 +750,8 @@ generated/pandas.Interval.overlaps,../reference/api/pandas.Interval.overlaps
generated/pandas.interval_range,../reference/api/pandas.interval_range
generated/pandas.Interval.right,../reference/api/pandas.Interval.right
generated/pandas.io.formats.style.Styler.apply,../reference/api/pandas.io.formats.style.Styler.apply
-generated/pandas.io.formats.style.Styler.applymap,../reference/api/pandas.io.formats.style.Styler.applymap
+generated/pandas.io.formats.style.Styler.applymap,../reference/api/pandas.io.formats.style.Styler.map
+generated/pandas.io.formats.style.Styler.applymap_index,../reference/api/pandas.io.formats.style.Styler.map_index
generated/pandas.io.formats.style.Styler.background_gradient,../reference/api/pandas.io.formats.style.Styler.background_gradient
generated/pandas.io.formats.style.Styler.bar,../reference/api/pandas.io.formats.style.Styler.bar
generated/pandas.io.formats.style.Styler.clear,../reference/api/pandas.io.formats.style.Styler.clear
@@ -1385,3 +1382,69 @@ generated/pandas.wide_to_long,../reference/api/pandas.wide_to_long
# Cached searches
reference/api/pandas.DataFrame.from_csv,pandas.read_csv
+
+# GroupBy -> DataFrameGroupBy
+reference/api/pandas.core.groupby.GroupBy.__iter__,pandas.core.groupby.DataFrameGroupBy.__iter__
+reference/api/pandas.core.groupby.GroupBy.agg,pandas.core.groupby.DataFrameGroupBy.agg
+reference/api/pandas.core.groupby.GroupBy.aggregate,pandas.core.groupby.DataFrameGroupBy.aggregate
+reference/api/pandas.core.groupby.GroupBy.all,pandas.core.groupby.DataFrameGroupBy.all
+reference/api/pandas.core.groupby.GroupBy.any,pandas.core.groupby.DataFrameGroupBy.any
+reference/api/pandas.core.groupby.GroupBy.apply,pandas.core.groupby.DataFrameGroupBy.apply
+reference/api/pandas.core.groupby.GroupBy.bfill,pandas.core.groupby.DataFrameGroupBy.bfill
+reference/api/pandas.core.groupby.GroupBy.count,pandas.core.groupby.DataFrameGroupBy.count
+reference/api/pandas.core.groupby.GroupBy.cumcount,pandas.core.groupby.DataFrameGroupBy.cumcount
+reference/api/pandas.core.groupby.GroupBy.cummax,pandas.core.groupby.DataFrameGroupBy.cummax
+reference/api/pandas.core.groupby.GroupBy.cummin,pandas.core.groupby.DataFrameGroupBy.cummin
+reference/api/pandas.core.groupby.GroupBy.cumprod,pandas.core.groupby.DataFrameGroupBy.cumprod
+reference/api/pandas.core.groupby.GroupBy.cumsum,pandas.core.groupby.DataFrameGroupBy.cumsum
+reference/api/pandas.core.groupby.GroupBy.ffill,pandas.core.groupby.DataFrameGroupBy.ffill
+reference/api/pandas.core.groupby.GroupBy.first,pandas.core.groupby.DataFrameGroupBy.first
+reference/api/pandas.core.groupby.GroupBy.get_group,pandas.core.groupby.DataFrameGroupBy.get_group
+reference/api/pandas.core.groupby.GroupBy.groups,pandas.core.groupby.DataFrameGroupBy.groups
+reference/api/pandas.core.groupby.GroupBy.head,pandas.core.groupby.DataFrameGroupBy.head
+reference/api/pandas.core.groupby.GroupBy.indices,pandas.core.groupby.DataFrameGroupBy.indices
+reference/api/pandas.core.groupby.GroupBy.last,pandas.core.groupby.DataFrameGroupBy.last
+reference/api/pandas.core.groupby.GroupBy.max,pandas.core.groupby.DataFrameGroupBy.max
+reference/api/pandas.core.groupby.GroupBy.mean,pandas.core.groupby.DataFrameGroupBy.mean
+reference/api/pandas.core.groupby.GroupBy.median,pandas.core.groupby.DataFrameGroupBy.median
+reference/api/pandas.core.groupby.GroupBy.min,pandas.core.groupby.DataFrameGroupBy.min
+reference/api/pandas.core.groupby.GroupBy.ngroup,pandas.core.groupby.DataFrameGroupBy.ngroup
+reference/api/pandas.core.groupby.GroupBy.nth,pandas.core.groupby.DataFrameGroupBy.nth
+reference/api/pandas.core.groupby.GroupBy.ohlc,pandas.core.groupby.DataFrameGroupBy.ohlc
+reference/api/pandas.core.groupby.GroupBy.pct_change,pandas.core.groupby.DataFrameGroupBy.pct_change
+reference/api/pandas.core.groupby.GroupBy.pipe,pandas.core.groupby.DataFrameGroupBy.pipe
+reference/api/pandas.core.groupby.GroupBy.prod,pandas.core.groupby.DataFrameGroupBy.prod
+reference/api/pandas.core.groupby.GroupBy.rank,pandas.core.groupby.DataFrameGroupBy.rank
+reference/api/pandas.core.groupby.GroupBy.sem,pandas.core.groupby.DataFrameGroupBy.sem
+reference/api/pandas.core.groupby.GroupBy.size,pandas.core.groupby.DataFrameGroupBy.size
+reference/api/pandas.core.groupby.GroupBy.std,pandas.core.groupby.DataFrameGroupBy.std
+reference/api/pandas.core.groupby.GroupBy.sum,pandas.core.groupby.DataFrameGroupBy.sum
+reference/api/pandas.core.groupby.GroupBy.tail,pandas.core.groupby.DataFrameGroupBy.tail
+reference/api/pandas.core.groupby.GroupBy.transform,pandas.core.groupby.DataFrameGroupBy.transform
+reference/api/pandas.core.groupby.GroupBy.var,pandas.core.groupby.DataFrameGroupBy.var
+
+# Renamed or alias doc page was removed
+reference/api/pandas.DataFrame.subtract,pandas.DataFrame.sub
+reference/api/pandas.DataFrame.multiply,pandas.DataFrame.mul
+reference/api/pandas.DataFrame.divide,pandas.DataFrame.div
+reference/api/pandas.Series.subtract,pandas.Series.sub
+reference/api/pandas.Series.multiply,pandas.Series.mul
+reference/api/pandas.Series.divide,pandas.Series.div
+reference/api/pandas.Series.tolist,pandas.Series.to_list
+reference/api/pandas.Series.transpose,pandas.Series.T
+reference/api/pandas.Index.transpose,pandas.Index.T
+reference/api/pandas.Index.notnull,pandas.Index.notna
+reference/api/pandas.Index.tolist,pandas.Index.to_list
+reference/api/pandas.arrays.PandasArray,pandas.arrays.NumpyExtensionArray
+reference/api/pandas.core.groupby.DataFrameGroupBy.backfill,pandas.core.groupby.DataFrameGroupBy.bfill
+reference/api/pandas.core.groupby.GroupBy.backfill,pandas.core.groupby.DataFrameGroupBy.bfill
+reference/api/pandas.core.resample.Resampler.backfill,pandas.core.resample.Resampler.bfill
+reference/api/pandas.io.formats.style.Styler.applymap,pandas.io.formats.style.Styler.map
+reference/api/pandas.io.formats.style.Styler.applymap_index,pandas.io.formats.style.Styler.map_index
+
+# EWM -> ExponentialMovingWindow
+reference/api/pandas.core.window.ewm.EWM.corr,pandas.core.window.ewm.ExponentialMovingWindow.corr
+reference/api/pandas.core.window.ewm.EWM.cov,pandas.core.window.ewm.ExponentialMovingWindow.cov
+reference/api/pandas.core.window.ewm.EWM.mean,pandas.core.window.ewm.ExponentialMovingWindow.mean
+reference/api/pandas.core.window.ewm.EWM.std,pandas.core.window.ewm.ExponentialMovingWindow.std
+reference/api/pandas.core.window.ewm.EWM.var,pandas.core.window.ewm.ExponentialMovingWindow.var
diff --git a/doc/scripts/eval_performance.py b/doc/scripts/eval_performance.py
index 559689ef5915b..dded8362a4cda 100644
--- a/doc/scripts/eval_performance.py
+++ b/doc/scripts/eval_performance.py
@@ -64,7 +64,7 @@ def bench(mn=3, mx=7, num=100, engines=("python", "numexpr"), verbose=False):
return ev, qu
-def plot_perf(df, engines, title, filename=None):
+def plot_perf(df, engines, title, filename=None) -> None:
from matplotlib.pyplot import figure
sns.set()
diff --git a/doc/source/conf.py b/doc/source/conf.py
index 0d69e030f913a..be6150d4e54ba 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -14,6 +14,7 @@
import inspect
import logging
import os
+import re
import sys
import warnings
@@ -57,7 +58,6 @@
"numpydoc",
"sphinx_copybutton",
"sphinx_design",
- "sphinx_toggleprompt",
"sphinx.ext.autodoc",
"sphinx.ext.autosummary",
"sphinx.ext.coverage",
@@ -76,7 +76,6 @@
# to ensure that include files (partial pages) aren't built, exclude them
# https://github.com/sphinx-doc/sphinx/issues/1965#issuecomment-124732907
"**/includes/**",
- "**/api/pandas.Series.dt.rst",
]
try:
import nbconvert
@@ -130,6 +129,8 @@
autodoc_typehints = "none"
# numpydoc
+numpydoc_show_class_members = False
+numpydoc_show_inherited_class_members = False
numpydoc_attributes_as_param_list = False
# matplotlib plot directive
@@ -161,7 +162,7 @@
# General information about the project.
project = "pandas"
# We have our custom "pandas_footer.html" template, using copyright for the current year
-copyright = f"{datetime.now().year}"
+copyright = f"{datetime.now().year},"
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
@@ -229,24 +230,31 @@
# further. For a list of options available for each theme, see the
# documentation.
-switcher_version = version
if ".dev" in version:
switcher_version = "dev"
elif "rc" in version:
switcher_version = version.split("rc", maxsplit=1)[0] + " (rc)"
+else:
+ # only keep major.minor version number to match versions.json
+ switcher_version = ".".join(version.split(".")[:2])
html_theme_options = {
"external_links": [],
"footer_start": ["pandas_footer", "sphinx-version"],
"github_url": "https://github.com/pandas-dev/pandas",
"twitter_url": "https://twitter.com/pandas_dev",
- "analytics": {"google_analytics_id": "G-5RE31C1RNW"},
+ "analytics": {
+ "plausible_analytics_domain": "pandas.pydata.org",
+ "plausible_analytics_url": "https://views.scientific-python.org/js/script.js",
+ },
"logo": {"image_dark": "https://pandas.pydata.org/static/img/pandas_white.svg"},
+ "navbar_align": "left",
"navbar_end": ["version-switcher", "theme-switcher", "navbar-icon-links"],
"switcher": {
"json_url": "https://pandas.pydata.org/versions.json",
"version_match": switcher_version,
},
+ "show_version_warning_banner": True,
"icon_links": [
{
"name": "Mastodon",
@@ -452,7 +460,6 @@
"dateutil": ("https://dateutil.readthedocs.io/en/latest/", None),
"matplotlib": ("https://matplotlib.org/stable/", None),
"numpy": ("https://numpy.org/doc/stable/", None),
- "pandas-gbq": ("https://pandas-gbq.readthedocs.io/en/latest/", None),
"py": ("https://pylib.readthedocs.io/en/latest/", None),
"python": ("https://docs.python.org/3/", None),
"scipy": ("https://docs.scipy.org/doc/scipy/", None),
@@ -499,7 +506,7 @@ class AccessorDocumenter(MethodDocumenter):
# lower than MethodDocumenter so this is not chosen for normal methods
priority = 0.6
- def format_signature(self):
+ def format_signature(self) -> str:
# this method gives an error/warning for the accessors, therefore
# overriding it (accessor has no arguments)
return ""
@@ -629,7 +636,7 @@ def get_items(self, names):
# based on numpy doc/source/conf.py
-def linkcode_resolve(domain, info):
+def linkcode_resolve(domain, info) -> str | None:
"""
Determine the URL corresponding to Python object
"""
@@ -691,12 +698,12 @@ def linkcode_resolve(domain, info):
# remove the docstring of the flags attribute (inherited from numpy ndarray)
# because these give doc build errors (see GH issue 5331)
-def remove_flags_docstring(app, what, name, obj, options, lines):
+def remove_flags_docstring(app, what, name, obj, options, lines) -> None:
if what == "attribute" and name.endswith(".flags"):
del lines[:]
-def process_class_docstrings(app, what, name, obj, options, lines):
+def process_class_docstrings(app, what, name, obj, options, lines) -> None:
"""
For those classes for which we use ::
@@ -748,7 +755,7 @@ def process_class_docstrings(app, what, name, obj, options, lines):
]
-def process_business_alias_docstrings(app, what, name, obj, options, lines):
+def process_business_alias_docstrings(app, what, name, obj, options, lines) -> None:
"""
Starting with sphinx 3.4, the "autodoc-process-docstring" event also
gets called for alias classes. This results in numpydoc adding the
@@ -771,7 +778,7 @@ def process_business_alias_docstrings(app, what, name, obj, options, lines):
suppress_warnings.append("ref.ref")
-def rstjinja(app, docname, source):
+def rstjinja(app, docname, source) -> None:
"""
Render our pages as a jinja template for fancy templating goodness.
"""
@@ -784,7 +791,7 @@ def rstjinja(app, docname, source):
source[0] = rendered
-def setup(app):
+def setup(app) -> None:
app.connect("source-read", rstjinja)
app.connect("autodoc-process-docstring", remove_flags_docstring)
app.connect("autodoc-process-docstring", process_class_docstrings)
@@ -794,3 +801,49 @@ def setup(app):
app.add_autodocumenter(AccessorMethodDocumenter)
app.add_autodocumenter(AccessorCallableDocumenter)
app.add_directive("autosummary", PandasAutosummary)
+
+
+# Ignore list for broken links, found in CI run checks for broken-linkcheck.yml
+
+linkcheck_ignore = [
+ "^http://$",
+ "^https://$",
+ *[
+ re.escape(link)
+ for link in [
+ "http://scatterci.github.io/pydata/pandas",
+ "http://specs.frictionlessdata.io/json-table-schema/",
+ "https://cloud.google.com/bigquery/docs/access-control#roles",
+ "https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query",
+ "https://crates.io/crates/calamine",
+ "https://devguide.python.org/setup/#macos",
+ "https://en.wikipedia.org/wiki/Imputation_statistics",
+ "https://en.wikipedia.org/wiki/Imputation_(statistics",
+ "https://github.com/noatamir/pandas-dev",
+ "https://github.com/pandas-dev/pandas/blob/main/pandas/plotting/__init__.py#L1",
+ "https://github.com/pandas-dev/pandas/blob/v0.20.2/pandas/core/generic.py#L568",
+ "https://github.com/pandas-dev/pandas/blob/v0.20.2/pandas/core/frame.py#L1495",
+ "https://github.com/pandas-dev/pandas/issues/174151",
+ "https://gitpod.io/#https://github.com/USERNAME/pandas",
+ "https://manishamde.github.io/blog/2013/03/07/pandas-and-python-top-10/",
+ "https://matplotlib.org/api/axes_api.html#matplotlib.axes.Axes.table",
+ "https://nipunbatra.github.io/blog/visualisation/2013/05/01/aggregation-timeseries.html",
+ "https://nbviewer.ipython.org/gist/metakermit/5720498",
+ "https://numpy.org/doc/stable/user/basics.byteswapping.html",
+ "https://pandas-gbq.readthedocs.io/en/latest/changelog.html#changelog-0-8-0",
+ "https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking",
+ "https://pandas.pydata.org/pandas-docs/stable/ecosystem.html",
+ "https://sqlalchemy.readthedocs.io/en/latest/dialects/index.html",
+ "https://support.sas.com/documentation/cdl/en/lrdict/64316/HTML/default/viewer.htm#a000245912.htm",
+ "https://support.sas.com/documentation/cdl/en/lrdict/64316/HTML/default/viewer.htm#a000214639.htm",
+ "https://support.sas.com/documentation/cdl/en/lrdict/64316/HTML/default/viewer.htm#a002283942.htm",
+ "https://support.sas.com/documentation/cdl/en/lrdict/64316/HTML/default/viewer.htm#a000245965.htm",
+ "https://support.sas.com/documentation/cdl/en/imlug/66845/HTML/default/viewer.htm#imlug_langref_sect455.htm",
+ "https://support.sas.com/documentation/cdl/en/lrdict/64316/HTML/default/viewer.htm#a002284668.htm",
+ "https://support.sas.com/documentation/cdl/en/lrdict/64316/HTML/default/viewer.htm#a002978282.htm",
+ "https://wesmckinney.com/blog/update-on-upcoming-pandas-v0-10-new-file-parser-other-performance-wins/",
+ "https://visualstudio.microsoft.com/downloads/#build-tools-for-visual-studio-2022",
+ "pandas.zip",
+ ]
+ ],
+]
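For readers unfamiliar with this option: Sphinx's linkcheck builder treats each ``linkcheck_ignore`` entry as a regular expression matched against the start of a URL, which is why the literal URLs above are wrapped in ``re.escape``. A minimal sketch of that (assumed) matching behavior:

.. code-block:: python

    import re

    # Assumed behavior: a URL is skipped when any ignore pattern matches
    # its beginning (re.match), so literal URLs must be regex-escaped.
    ignore = [
        "^http://$",
        re.escape("https://en.wikipedia.org/wiki/Imputation_(statistics"),
    ]

    def is_ignored(url: str) -> bool:
        return any(re.match(pattern, url) for pattern in ignore)

    is_ignored("http://")                                                # True
    is_ignored("https://en.wikipedia.org/wiki/Imputation_(statistics")  # True
    is_ignored("https://pandas.pydata.org")                             # False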
diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst
index 247f5cb0cb62d..82af8122a6bbd 100644
--- a/doc/source/development/contributing.rst
+++ b/doc/source/development/contributing.rst
@@ -311,6 +311,7 @@ If using :ref:`mamba `, run:
.. code-block:: shell
git checkout main
+ git fetch upstream
git merge upstream/main
mamba activate pandas-dev
mamba env update -f environment.yml --prune
@@ -320,6 +321,7 @@ If using :ref:`pip ` , do:
.. code-block:: shell
git checkout main
+ git fetch upstream
git merge upstream/main
# activate the virtual environment based on your platform
python -m pip install --upgrade -r requirements-dev.txt
diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst
index c06c0f8703d11..be8249cd3a287 100644
--- a/doc/source/development/contributing_codebase.rst
+++ b/doc/source/development/contributing_codebase.rst
@@ -39,7 +39,7 @@ Pre-commit
Additionally, :ref:`Continuous Integration ` will run code formatting checks
like ``black``, ``ruff``,
-``isort``, and ``cpplint`` and more using `pre-commit hooks `_.
+``isort``, and ``clang-format`` and more using `pre-commit hooks `_.
Any warnings from these checks will cause the :ref:`Continuous Integration ` to fail; therefore,
it is helpful to run the check yourself before submitting code. This
can be done by installing ``pre-commit`` (which should already have happened if you followed the instructions
@@ -456,6 +456,12 @@ be located.
- tests.io
+ .. note::
+
+ This includes ``to_string`` but excludes ``__repr__``, which is
+ tested in ``tests.frame.test_repr`` and ``tests.series.test_repr``.
+ Other classes often have a ``test_formats`` file.
+
C) Otherwise
This test likely belongs in one of:
@@ -528,7 +534,7 @@ If a test is known to fail but the manner in which it fails
is not meant to be captured, use ``pytest.mark.xfail`` It is common to use this method for a test that
exhibits buggy behavior or a non-implemented feature. If
the failing test has flaky behavior, use the argument ``strict=False``. This
-will make it so pytest does not fail if the test happens to pass.
+will make it so pytest does not fail if the test happens to pass. Using ``strict=False`` is highly undesirable; please use it only as a last resort.
Prefer the decorator ``@pytest.mark.xfail`` and the argument ``pytest.param``
over usage within a test so that the test is appropriately marked during the
@@ -540,7 +546,7 @@ xfail during the testing phase. To do so, use the ``request`` fixture:
def test_xfail(request):
mark = pytest.mark.xfail(raises=TypeError, reason="Indicate why here")
- request.node.add_marker(mark)
+ request.applymarker(mark)
xfail is not to be used for tests involving failure due to invalid user arguments.
For these tests, we need to verify the correct exception type and error message
@@ -754,7 +760,7 @@ install pandas) by typing::
your installation is probably fine and you can start contributing!
Often it is worth running only a subset of tests first around your changes before running the
-entire suite (tip: you can use the [pandas-coverage app](https://pandas-coverage-12d2130077bc.herokuapp.com/))
+entire suite (tip: you can use the `pandas-coverage app `_)
to find out which tests hit the lines of code you've modified, and then run only those).
The easiest way to do this is with::
diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst
index 51d0edf1859c5..325c902dd4f9e 100644
--- a/doc/source/development/contributing_environment.rst
+++ b/doc/source/development/contributing_environment.rst
@@ -44,8 +44,9 @@ and consult the ``Linux`` instructions below.
**macOS**
To use the :ref:`mamba `-based compilers, you will need to install the
-Developer Tools using ``xcode-select --install``. Otherwise
-information about compiler installation can be found here:
+Developer Tools using ``xcode-select --install``.
+
+If you prefer to use a different compiler, general information can be found here:
https://devguide.python.org/setup/#macos
**Linux**
@@ -86,12 +87,12 @@ Before we begin, please:
Option 1: using mamba (recommended)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-* Install `mamba `_
+* Install miniforge to get `mamba `_
* Make sure your mamba is up to date (``mamba update mamba``)
+* Create and activate the ``pandas-dev`` mamba environment using the following commands:
.. code-block:: none
- # Create and activate the build environment
mamba env create --file environment.yml
mamba activate pandas-dev
@@ -214,7 +215,7 @@ due to limitations in setuptools.
The newer build system, invokes the meson backend through pip (via a `PEP 517 `_ build).
It automatically uses all available cores on your CPU, and also avoids the need for manual rebuilds by
-rebuilding automatically whenever pandas is imported(with an editable install).
+rebuilding automatically whenever pandas is imported (with an editable install).
For these reasons, you should compile pandas with meson.
Because the meson build system is newer, you may find bugs/minor issues as it matures. You can report these bugs
@@ -228,6 +229,14 @@ To compile pandas with meson, run::
# If you do not want to see this, omit everything after --no-build-isolation
python -m pip install -ve . --no-build-isolation --config-settings editable-verbose=true
+.. note::
+ The version number is pulled from the latest repository tag. Be sure to fetch the latest tags from upstream
+ before building::
+
+ # set the upstream repository, if not done already, and fetch the latest tags
+ git remote add upstream https://github.com/pandas-dev/pandas.git
+ git fetch upstream --tags
+
**Build options**
It is possible to pass options from the pip frontend to the meson backend if you would like to configure your
@@ -265,6 +274,8 @@ uses to import the extension from the build folder, which may cause errors such
You will need to repeat this step each time the C extensions change, for example
if you modified any file in ``pandas/_libs`` or if you did a fetch and merge from ``upstream/main``.
+**Checking the build**
+
At this point you should be able to import pandas from your locally built version::
$ python
@@ -272,6 +283,12 @@ At this point you should be able to import pandas from your locally built versio
>>> print(pandas.__version__) # note: the exact output may differ
2.0.0.dev0+880.g2b9e661fbb.dirty
+
+At this point you may want to try
+`running the test suite `_.
+
+**Keeping up to date with the latest build**
+
When building pandas with meson, importing pandas will automatically trigger a rebuild, even when C/Cython files are modified.
By default, no output will be produced by this rebuild (the import will just take longer). If you would like to see meson's
output when importing pandas, you can set the environment variable ``MESONPY_EDTIABLE_VERBOSE``. For example, this would be::
diff --git a/doc/source/development/debugging_extensions.rst b/doc/source/development/debugging_extensions.rst
index 9ac4cf4083475..d63ecb3157cff 100644
--- a/doc/source/development/debugging_extensions.rst
+++ b/doc/source/development/debugging_extensions.rst
@@ -14,19 +14,49 @@ For Python developers with limited or no C/C++ experience this can seem a daunti
2. `Fundamental Python Debugging Part 2 - Python Extensions `_
3. `Fundamental Python Debugging Part 3 - Cython Extensions `_
-Generating debug builds
------------------------
+Debugging locally
+-----------------
By default building pandas from source will generate a release build. To generate a development build you can type::
pip install -ve . --no-build-isolation --config-settings=builddir="debug" --config-settings=setup-args="-Dbuildtype=debug"
+.. note::
+
+ conda environments update CFLAGS/CPPFLAGS with flags that are geared towards generating releases. If using conda, you may need to set ``CFLAGS="$CFLAGS -O0"`` and ``CPPFLAGS="$CPPFLAGS -O0"`` to ensure optimizations are turned off for debugging
+
By specifying ``builddir="debug"`` all of the targets will be built and placed in the debug directory relative to the project root. This helps to keep your debug and release artifacts separate; you are of course able to choose a different directory name or omit altogether if you do not care to separate build types.
+Using Docker
+------------
+
+To simplify the debugging process, pandas has created a Docker image with a debug build of Python and the gdb/Cython debuggers pre-installed. You may either ``docker pull pandas/pandas-debug`` to get access to this image or build it from the ``tooling/debug`` folder locally.
+
+You can then mount your pandas repository into this image via:
+
+.. code-block:: sh
+
+ docker run --rm -it -w /data -v ${PWD}:/data pandas/pandas-debug
+
+Inside the image, you can use meson to build/install pandas and place the build artifacts into a ``debug`` folder with the following command:
+
+.. code-block:: sh
+
+ python -m pip install -ve . --no-build-isolation --config-settings=builddir="debug" --config-settings=setup-args="-Dbuildtype=debug"
+
+If you plan to use cygdb, the files required by that application are placed within the build folder, so you have to ``cd`` to the build folder first and then start that application.
+
+.. code-block:: sh
+
+ cd debug
+ cygdb
+
+Within the debugger you can use `cygdb commands `_ to navigate cython extensions.
+
Editor support
--------------
-The meson build system generates a `compilation database `_ automatically and places it in the build directory. Many language servers and IDEs can use this information to provide code-completion, go-to-defintion and error checking support as you type.
+The meson build system generates a `compilation database `_ automatically and places it in the build directory. Many language servers and IDEs can use this information to provide code-completion, go-to-definition and error checking support as you type.
How each language server / IDE chooses to look for the compilation database may vary. When in doubt you may want to create a symlink at the root of the project that points to the compilation database in your build directory. Assuming you used *debug* as your directory name, you can run::
diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst
index f74eacb6b861d..e67829b8805eb 100644
--- a/doc/source/development/extending.rst
+++ b/doc/source/development/extending.rst
@@ -99,7 +99,7 @@ The interface consists of two classes.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
A :class:`pandas.api.extensions.ExtensionDtype` is similar to a ``numpy.dtype`` object. It describes the
-data type. Implementors are responsible for a few unique items like the name.
+data type. Implementers are responsible for a few unique items like the name.
One particularly important item is the ``type`` property. This should be the
class that is the scalar type for your data. For example, if you were writing an
diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst
index b49c9644e1b2a..c7803d8401e4e 100644
--- a/doc/source/development/maintaining.rst
+++ b/doc/source/development/maintaining.rst
@@ -44,6 +44,9 @@ reading.
Issue triage
------------
+Triage is an important first step in addressing issues reported by the community, and even
+partial contributions are a great way to help maintain pandas. Only remove the "Needs Triage"
+tag once all of the steps below have been completed.
Here's a typical workflow for triaging a newly opened issue.
@@ -67,9 +70,9 @@ Here's a typical workflow for triaging a newly opened issue.
3. **Is this a duplicate issue?**
We have many open issues. If a new issue is clearly a duplicate, label the
- new issue as "Duplicate" assign the milestone "No Action", and close the issue
- with a link to the original issue. Make sure to still thank the reporter, and
- encourage them to chime in on the original issue, and perhaps try to fix it.
+ new issue as "Duplicate" and close the issue with a link to the original issue.
+ Make sure to still thank the reporter, and encourage them to chime in on the
+ original issue, and perhaps try to fix it.
If the new issue provides relevant information, such as a better or slightly
different example, add it to the original issue as a comment or an edit to
@@ -90,6 +93,10 @@ Here's a typical workflow for triaging a newly opened issue.
If a reproducible example is provided, but you see a simplification,
edit the original post with your simpler reproducible example.
+ Ensure the issue exists on the main branch and that it has the "Needs Triage" tag
+ until all steps have been completed. Add a comment to the issue once you have
+ verified it exists on the main branch, so others know it has been confirmed.
+
5. **Is this a clearly defined feature request?**
Generally, pandas prefers to discuss and design new features in issues, before
@@ -97,8 +104,9 @@ Here's a typical workflow for triaging a newly opened issue.
for the new feature. Having them write a full docstring is a good way to
pin down specifics.
- We'll need a discussion from several pandas maintainers before deciding whether
- the proposal is in scope for pandas.
+ Tag new feature requests with "Needs Discussion", as we'll need a discussion
+ from several pandas maintainers before deciding whether the proposal is in
+ scope for pandas.
6. **Is this a usage question?**
@@ -117,10 +125,6 @@ Here's a typical workflow for triaging a newly opened issue.
If the issue is clearly defined and the fix seems relatively straightforward,
label the issue as "Good first issue".
- Typically, new issues will be assigned the "Contributions welcome" milestone,
- unless it's know that this issue should be addressed in a specific release (say
- because it's a large regression).
-
Once you have completed the above, make sure to remove the "needs triage" label.
.. _maintaining.regressions:
@@ -445,9 +449,13 @@ which will be triggered when the tag is pushed.
git tag -a v1.5.0.dev0 -m "DEV: Start 1.5.0"
git push upstream main --follow-tags
-3. Build the source distribution (git must be in the tag commit)::
+3. Download the source distribution and wheels from the `wheel staging area `_.
+ Be careful to make sure that no wheels are missing (e.g. due to failed builds).
+
+ Running ``scripts/download_wheels.sh`` with the version whose wheels and sdist you want to download should do the trick.
+ This script will make a ``dist`` folder inside your clone of pandas and put the downloaded wheels and sdist there::
- ./setup.py sdist --formats=gztar --quiet
+ scripts/download_wheels.sh
4. Create a `new GitHub release `_:
@@ -459,23 +467,19 @@ which will be triggered when the tag is pushed.
- Set as the latest release: Leave checked, unless releasing a patch release for an older version
(e.g. releasing 1.4.5 after 1.5 has been released)
-5. The GitHub release will after some hours trigger an
+5. Upload wheels to PyPI::
+
+ twine upload pandas/dist/pandas-*.{whl,tar.gz} --skip-existing
+
+6. The GitHub release will after some hours trigger an
`automated conda-forge PR `_.
+ (If you don't want to wait, you can open an issue titled ``@conda-forge-admin, please update version`` to trigger the bot.)
Merge it once the CI is green, and it will generate the conda-forge packages.
+
In case a manual PR needs to be done, the version, sha256 and build fields are the
ones that usually need to be changed. If anything else in the recipe has changed since
the last release, those changes should be available in ``ci/meta.yaml``.
-6. Packages for supported versions in PyPI are built automatically from our CI.
- Once all packages are build download all wheels from the
- `Anaconda repository >`_
- where our CI published them to the ``dist/`` directory in your local pandas copy.
- You can use the script ``scripts/download_wheels.sh`` to download all wheels at once.
-
-7. Upload wheels to PyPI::
-
- twine upload pandas/dist/pandas-*.{whl,tar.gz} --skip-existing
-
Post-Release
````````````
diff --git a/doc/source/getting_started/comparison/comparison_with_sql.rst b/doc/source/getting_started/comparison/comparison_with_sql.rst
index 7a83d50416186..daa528c7d408a 100644
--- a/doc/source/getting_started/comparison/comparison_with_sql.rst
+++ b/doc/source/getting_started/comparison/comparison_with_sql.rst
@@ -107,7 +107,7 @@ methods.
.. ipython:: python
frame = pd.DataFrame(
- {"col1": ["A", "B", np.NaN, "C", "D"], "col2": ["F", np.NaN, "G", "H", "I"]}
+ {"col1": ["A", "B", np.nan, "C", "D"], "col2": ["F", np.nan, "G", "H", "I"]}
)
frame
@@ -164,16 +164,16 @@ The pandas equivalent would be:
tips.groupby("sex").size()
-Notice that in the pandas code we used :meth:`~pandas.core.groupby.DataFrameGroupBy.size` and not
-:meth:`~pandas.core.groupby.DataFrameGroupBy.count`. This is because
-:meth:`~pandas.core.groupby.DataFrameGroupBy.count` applies the function to each column, returning
+Notice that in the pandas code we used :meth:`.DataFrameGroupBy.size` and not
+:meth:`.DataFrameGroupBy.count`. This is because
+:meth:`.DataFrameGroupBy.count` applies the function to each column, returning
the number of ``NOT NULL`` records within each.
.. ipython:: python
tips.groupby("sex").count()
-Alternatively, we could have applied the :meth:`~pandas.core.groupby.DataFrameGroupBy.count` method
+Alternatively, we could have applied the :meth:`.DataFrameGroupBy.count` method
to an individual column:
.. ipython:: python
@@ -181,7 +181,7 @@ to an individual column:
tips.groupby("sex")["total_bill"].count()
Multiple functions can also be applied at once. For instance, say we'd like to see how tip amount
-differs by day of the week - :meth:`~pandas.core.groupby.DataFrameGroupBy.agg` allows you to pass a dictionary
+differs by day of the week - :meth:`.DataFrameGroupBy.agg` allows you to pass a dictionary
to your grouped DataFrame, indicating which functions to apply to specific columns.
.. code-block:: sql
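As a quick illustration of the dictionary form described above, here is a hedged sketch using made-up data (not the tips dataset loaded earlier in the guide):

.. code-block:: python

    import pandas as pd

    # Hypothetical tips-like frame; the dict maps column names to aggregations.
    tips = pd.DataFrame(
        {
            "day": ["Sun", "Sun", "Mon"],
            "tip": [3.0, 2.0, 4.0],
            "total_bill": [21.0, 18.0, 25.0],
        }
    )
    tips.groupby("day").agg({"tip": "mean", "total_bill": "count"})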
diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
index 1896dffa9a105..b9f7d64d4b2f8 100644
--- a/doc/source/getting_started/install.rst
+++ b/doc/source/getting_started/install.rst
@@ -21,7 +21,7 @@ Instructions for installing :ref:`from source `,
Python version support
----------------------
-Officially Python 3.9, 3.10 and 3.11.
+Officially Python 3.9, 3.10, 3.11 and 3.12.
Installing pandas
-----------------
@@ -206,6 +206,7 @@ Package Minimum support
`NumPy `__ 1.22.4
`python-dateutil `__ 2.8.2
`pytz `__ 2020.1
+`tzdata `__ 2022.7
================================================================ ==========================
.. _install.optional_dependencies:
@@ -238,22 +239,22 @@ Installable with ``pip install "pandas[performance]"``
===================================================== ================== ================== ===================================================================================================================================================================================
Dependency Minimum Version pip extra Notes
===================================================== ================== ================== ===================================================================================================================================================================================
-`numexpr `__ 2.8.0 performance Accelerates certain numerical operations by using multiple cores as well as smart chunking and caching to achieve large speedups
-`bottleneck `__ 1.3.4 performance Accelerates certain types of ``nan`` by using specialized cython routines to achieve large speedup.
-`numba `__ 0.55.2 performance Alternative execution engine for operations that accept ``engine="numba"`` using a JIT compiler that translates Python functions to optimized machine code using the LLVM compiler.
+`numexpr `__ 2.8.4 performance Accelerates certain numerical operations by using multiple cores as well as smart chunking and caching to achieve large speedups
+`bottleneck `__ 1.3.6 performance Accelerates certain types of ``nan`` by using specialized cython routines to achieve large speedup.
+`numba `__ 0.56.4 performance Alternative execution engine for operations that accept ``engine="numba"`` using a JIT compiler that translates Python functions to optimized machine code using the LLVM compiler.
===================================================== ================== ================== ===================================================================================================================================================================================
Visualization
^^^^^^^^^^^^^
-Installable with ``pip install "pandas[plot, output_formatting]"``.
+Installable with ``pip install "pandas[plot, output-formatting]"``.
========================= ================== ================== =============================================================
Dependency Minimum Version pip extra Notes
========================= ================== ================== =============================================================
-matplotlib 3.6.1 plot Plotting library
-Jinja2 3.1.2 output_formatting Conditional formatting with DataFrame.style
-tabulate 0.8.10 output_formatting Printing in Markdown-friendly format (see `tabulate`_)
+matplotlib 3.6.3 plot Plotting library
+Jinja2 3.1.2 output-formatting Conditional formatting with DataFrame.style
+tabulate 0.9.0 output-formatting Printing in Markdown-friendly format (see `tabulate`_)
========================= ================== ================== =============================================================
Computation
@@ -264,8 +265,8 @@ Installable with ``pip install "pandas[computation]"``.
========================= ================== =============== =============================================================
Dependency Minimum Version pip extra Notes
========================= ================== =============== =============================================================
-SciPy 1.8.1 computation Miscellaneous statistical functions
-xarray 2022.03.0 computation pandas-like API for N-dimensional data
+SciPy 1.10.0 computation Miscellaneous statistical functions
+xarray 2022.12.0 computation pandas-like API for N-dimensional data
========================= ================== =============== =============================================================
Excel files
@@ -277,9 +278,10 @@ Installable with ``pip install "pandas[excel]"``.
Dependency Minimum Version pip extra Notes
========================= ================== =============== =============================================================
xlrd 2.0.1 excel Reading Excel
-xlsxwriter 3.0.3 excel Writing Excel
-openpyxl 3.0.10 excel Reading / writing for xlsx files
-pyxlsb 1.0.9 excel Reading for xlsb files
+xlsxwriter 3.0.5 excel Writing Excel
+openpyxl 3.1.0 excel Reading / writing for xlsx files
+pyxlsb 1.0.10 excel Reading for xlsb files
+python-calamine 0.1.7 excel Reading for xls/xlsx/xlsb/ods files
========================= ================== =============== =============================================================
HTML
@@ -290,9 +292,9 @@ Installable with ``pip install "pandas[html]"``.
========================= ================== =============== =============================================================
Dependency Minimum Version pip extra Notes
========================= ================== =============== =============================================================
-BeautifulSoup4 4.11.1 html HTML parser for read_html
+BeautifulSoup4 4.11.2 html HTML parser for read_html
html5lib 1.1 html HTML parser for read_html
-lxml 4.8.0 html HTML parser for read_html
+lxml 4.9.2 html HTML parser for read_html
========================= ================== =============== =============================================================
One of the following combinations of libraries is needed to use the
@@ -327,22 +329,24 @@ Installable with ``pip install "pandas[xml]"``.
========================= ================== =============== =============================================================
Dependency Minimum Version pip extra Notes
========================= ================== =============== =============================================================
-lxml 4.8.0 xml XML parser for read_xml and tree builder for to_xml
+lxml 4.9.2 xml XML parser for read_xml and tree builder for to_xml
========================= ================== =============== =============================================================
SQL databases
^^^^^^^^^^^^^
-Installable with ``pip install "pandas[postgresql, mysql, sql-other]"``.
+Traditional drivers are installable with ``pip install "pandas[postgresql, mysql, sql-other]"``.
========================= ================== =============== =============================================================
Dependency Minimum Version pip extra Notes
========================= ================== =============== =============================================================
-SQLAlchemy 1.4.36 postgresql, SQL support for databases other than sqlite
+SQLAlchemy 2.0.0 postgresql, SQL support for databases other than sqlite
mysql,
sql-other
-psycopg2 2.9.3 postgresql PostgreSQL engine for sqlalchemy
+psycopg2 2.9.6 postgresql PostgreSQL engine for sqlalchemy
pymysql 1.0.2 mysql MySQL engine for sqlalchemy
+adbc-driver-postgresql 0.8.0 postgresql ADBC Driver for PostgreSQL
+adbc-driver-sqlite 0.8.0 sql-other ADBC Driver for SQLite
========================= ================== =============== =============================================================
Other data sources
@@ -353,12 +357,12 @@ Installable with ``pip install "pandas[hdf5, parquet, feather, spss, excel]"``
========================= ================== ================ =============================================================
Dependency Minimum Version pip extra Notes
========================= ================== ================ =============================================================
-PyTables 3.7.0 hdf5 HDF5-based reading / writing
-blosc 1.21.0 hdf5 Compression for HDF5; only available on ``conda``
+PyTables 3.8.0 hdf5 HDF5-based reading / writing
+blosc 1.21.3 hdf5 Compression for HDF5; only available on ``conda``
zlib hdf5 Compression for HDF5
-fastparquet 0.8.1 - Parquet reading / writing (pyarrow is default)
-pyarrow 7.0.0 parquet, feather Parquet, ORC, and feather reading / writing
-pyreadstat 1.1.5 spss SPSS files (.sav) reading
+fastparquet 2022.12.0 - Parquet reading / writing (pyarrow is default)
+pyarrow 10.0.1 parquet, feather Parquet, ORC, and feather reading / writing
+pyreadstat 1.2.0 spss SPSS files (.sav) reading
odfpy 1.4.1 excel Open document format (.odf, .ods, .odt) reading / writing
========================= ================== ================ =============================================================
@@ -378,11 +382,11 @@ Installable with ``pip install "pandas[fss, aws, gcp]"``
========================= ================== =============== =============================================================
Dependency Minimum Version pip extra Notes
========================= ================== =============== =============================================================
-fsspec 2022.05.0 fss, gcp, aws Handling files aside from simple local and HTTP (required
+fsspec 2022.11.0 fss, gcp, aws Handling files aside from simple local and HTTP (required
dependency of s3fs, gcsfs).
-gcsfs 2022.05.0 gcp Google Cloud Storage access
-pandas-gbq 0.17.5 gcp Google Big Query access
-s3fs 2022.05.0 aws Amazon S3 access
+gcsfs 2022.11.0 gcp Google Cloud Storage access
+pandas-gbq 0.19.0 gcp Google Big Query access
+s3fs 2022.11.0 aws Amazon S3 access
========================= ================== =============== =============================================================
Clipboard
@@ -393,8 +397,8 @@ Installable with ``pip install "pandas[clipboard]"``.
========================= ================== =============== =============================================================
Dependency Minimum Version pip extra Notes
========================= ================== =============== =============================================================
-PyQt4/PyQt5 5.15.6 clipboard Clipboard I/O
-qtpy 2.2.0 clipboard Clipboard I/O
+PyQt4/PyQt5 5.15.9 clipboard Clipboard I/O
+qtpy 2.3.0 clipboard Clipboard I/O
========================= ================== =============== =============================================================
.. note::
@@ -411,9 +415,7 @@ Installable with ``pip install "pandas[compression]"``
========================= ================== =============== =============================================================
Dependency Minimum Version pip extra Notes
========================= ================== =============== =============================================================
-brotli 0.7.0 compression Brotli compression
-python-snappy 0.6.1 compression Snappy compression
-Zstandard 0.17.0 compression Zstandard compression
+Zstandard 0.19.0 compression Zstandard compression
========================= ================== =============== =============================================================
Consortium Standard
diff --git a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst
index 2dcc8b0abe3b8..caaff3557ae40 100644
--- a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst
+++ b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst
@@ -106,9 +106,9 @@ between square brackets ``[]``.
.. note::
- If you are familiar to Python
+ If you are familiar with Python
:ref:`dictionaries `, the selection of a
- single column is very similar to selection of dictionary values based on
+ single column is very similar to the selection of dictionary values based on
the key.
You can create a ``Series`` from scratch as well:
diff --git a/doc/source/getting_started/intro_tutorials/07_reshape_table_layout.rst b/doc/source/getting_started/intro_tutorials/07_reshape_table_layout.rst
index 6a0b59b26350c..e4b34e3af57bf 100644
--- a/doc/source/getting_started/intro_tutorials/07_reshape_table_layout.rst
+++ b/doc/source/getting_started/intro_tutorials/07_reshape_table_layout.rst
@@ -266,7 +266,7 @@ For more information about :meth:`~DataFrame.pivot_table`, see the user guide se
::
- air_quality.groupby(["parameter", "location"]).mean()
+ air_quality.groupby(["parameter", "location"])[["value"]].mean()
.. raw:: html
diff --git a/doc/source/getting_started/intro_tutorials/09_timeseries.rst b/doc/source/getting_started/intro_tutorials/09_timeseries.rst
index 470b3908802b2..b0530087e5b84 100644
--- a/doc/source/getting_started/intro_tutorials/09_timeseries.rst
+++ b/doc/source/getting_started/intro_tutorials/09_timeseries.rst
@@ -295,7 +295,7 @@ Aggregate the current hourly time series values to the monthly maximum value in
.. ipython:: python
- monthly_max = no_2.resample("M").max()
+ monthly_max = no_2.resample("ME").max()
monthly_max
A very powerful method on time series data with a datetime index, is the
diff --git a/doc/source/getting_started/tutorials.rst b/doc/source/getting_started/tutorials.rst
index 1220c915c3cbc..4393c3716bdad 100644
--- a/doc/source/getting_started/tutorials.rst
+++ b/doc/source/getting_started/tutorials.rst
@@ -115,7 +115,6 @@ Various tutorials
* `Statistical Data Analysis in Python, tutorial videos, by Christopher Fonnesbeck from SciPy 2013 `_
* `Financial analysis in Python, by Thomas Wiecki `_
* `Intro to pandas data structures, by Greg Reda `_
-* `Pandas and Python: Top 10, by Manish Amde `_
* `Pandas DataFrames Tutorial, by Karlijn Willems `_
* `A concise tutorial with real life examples `_
* `430+ Searchable Pandas recipes by Isshin Inada `_
diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst
index 41ddbd048e6c5..fe65364896f54 100644
--- a/doc/source/reference/arrays.rst
+++ b/doc/source/reference/arrays.rst
@@ -134,11 +134,6 @@ is the missing value for datetime data.
Timestamp
-.. autosummary::
- :toctree: api/
-
- NaT
-
Properties
~~~~~~~~~~
.. autosummary::
@@ -257,11 +252,6 @@ is the missing value for timedelta data.
Timedelta
-.. autosummary::
- :toctree: api/
-
- NaT
-
Properties
~~~~~~~~~~
.. autosummary::
@@ -465,7 +455,6 @@ pandas provides this through :class:`arrays.IntegerArray`.
UInt16Dtype
UInt32Dtype
UInt64Dtype
- NA
.. _api.arrays.float_na:
@@ -484,7 +473,6 @@ Nullable float
Float32Dtype
Float64Dtype
- NA
.. _api.arrays.categorical:
@@ -621,7 +609,6 @@ with a bool :class:`numpy.ndarray`.
:template: autosummary/class_without_autosummary.rst
BooleanDtype
- NA
.. Dtype attributes which are manually listed in their docstrings: including
diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst
index eb2529716b55e..e412793a328a3 100644
--- a/doc/source/reference/extensions.rst
+++ b/doc/source/reference/extensions.rst
@@ -34,11 +34,13 @@ objects.
api.extensions.ExtensionArray._accumulate
api.extensions.ExtensionArray._concat_same_type
+ api.extensions.ExtensionArray._explode
api.extensions.ExtensionArray._formatter
api.extensions.ExtensionArray._from_factorized
api.extensions.ExtensionArray._from_sequence
api.extensions.ExtensionArray._from_sequence_of_strings
api.extensions.ExtensionArray._hash_pandas_object
+ api.extensions.ExtensionArray._pad_or_backfill
api.extensions.ExtensionArray._reduce
api.extensions.ExtensionArray._values_for_argsort
api.extensions.ExtensionArray._values_for_factorize
@@ -47,6 +49,7 @@ objects.
api.extensions.ExtensionArray.copy
api.extensions.ExtensionArray.view
api.extensions.ExtensionArray.dropna
+ api.extensions.ExtensionArray.duplicated
api.extensions.ExtensionArray.equals
api.extensions.ExtensionArray.factorize
api.extensions.ExtensionArray.fillna
@@ -54,7 +57,6 @@ objects.
api.extensions.ExtensionArray.interpolate
api.extensions.ExtensionArray.isin
api.extensions.ExtensionArray.isna
- api.extensions.ExtensionArray.pad_or_backfill
api.extensions.ExtensionArray.ravel
api.extensions.ExtensionArray.repeat
api.extensions.ExtensionArray.searchsorted
diff --git a/doc/source/reference/general_functions.rst b/doc/source/reference/general_functions.rst
index 02b0bf5d13dde..e93514de5f762 100644
--- a/doc/source/reference/general_functions.rst
+++ b/doc/source/reference/general_functions.rst
@@ -73,6 +73,13 @@ Top-level evaluation
eval
+Datetime formats
+~~~~~~~~~~~~~~~~
+.. autosummary::
+ :toctree: api/
+
+ tseries.api.guess_datetime_format
+
Hashing
~~~~~~~
.. autosummary::
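A small sketch of the newly documented helper (the exact return values depend on the pandas version; treat these as illustrative):

.. code-block:: python

    from pandas.tseries.api import guess_datetime_format

    guess_datetime_format("2023-01-31")                 # '%Y-%m-%d'
    guess_datetime_format("31-01-2023", dayfirst=True)  # '%d-%m-%Y'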
diff --git a/doc/source/reference/index.rst b/doc/source/reference/index.rst
index 6d3ce3d31f005..7da02f7958416 100644
--- a/doc/source/reference/index.rst
+++ b/doc/source/reference/index.rst
@@ -53,6 +53,7 @@ are mentioned in the documentation.
options
extensions
testing
+ missing_value
.. This is to prevent warnings in the doc build. We don't want to encourage
.. these methods.
diff --git a/doc/source/reference/indexing.rst b/doc/source/reference/indexing.rst
index 25e5b3b46b4f3..fa6105761df0a 100644
--- a/doc/source/reference/indexing.rst
+++ b/doc/source/reference/indexing.rst
@@ -489,3 +489,5 @@ Methods
PeriodIndex.asfreq
PeriodIndex.strftime
PeriodIndex.to_timestamp
+ PeriodIndex.from_fields
+ PeriodIndex.from_ordinals
diff --git a/doc/source/reference/missing_value.rst b/doc/source/reference/missing_value.rst
new file mode 100644
index 0000000000000..3bf22aef765d1
--- /dev/null
+++ b/doc/source/reference/missing_value.rst
@@ -0,0 +1,24 @@
+{{ header }}
+
+.. _api.missing_value:
+
+==============
+Missing values
+==============
+.. currentmodule:: pandas
+
+NA is the way to represent missing values for nullable dtypes (see below):
+
+.. autosummary::
+ :toctree: api/
+ :template: autosummary/class_without_autosummary.rst
+
+ NA
+
+NaT is the missing value for timedelta and datetime data (see below):
+
+.. autosummary::
+ :toctree: api/
+ :template: autosummary/class_without_autosummary.rst
+
+ NaT
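For orientation, a brief sketch of how these two sentinels show up in practice (illustrative data only):

.. code-block:: python

    import pandas as pd

    # pd.NA is the missing value for nullable dtypes such as "Int64".
    pd.Series([1, None], dtype="Int64")
    # pd.NaT marks missing datetime/timedelta values.
    pd.Series([pd.Timestamp("2023-01-01"), None])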
diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst
index 5a43e5796d1d9..a4ea0ec396ceb 100644
--- a/doc/source/reference/series.rst
+++ b/doc/source/reference/series.rst
@@ -177,6 +177,7 @@ Reindexing / selection / label manipulation
:toctree: api/
Series.align
+ Series.case_when
Series.drop
Series.droplevel
Series.drop_duplicates
@@ -273,6 +274,19 @@ pandas provides dtype-specific methods under various accessors.
These are separate namespaces within :class:`Series` that only apply
to specific data types.
+.. autosummary::
+ :toctree: api/
+ :nosignatures:
+ :template: autosummary/accessor.rst
+
+ Series.str
+ Series.cat
+ Series.dt
+ Series.sparse
+ DataFrame.sparse
+ Index.str
+
+
=========================== =================================
Data Type Accessor
=========================== =================================
@@ -314,6 +328,7 @@ Datetime properties
Series.dt.weekday
Series.dt.dayofyear
Series.dt.day_of_year
+ Series.dt.days_in_month
Series.dt.quarter
Series.dt.is_month_start
Series.dt.is_month_end
@@ -327,6 +342,7 @@ Datetime properties
Series.dt.tz
Series.dt.freq
Series.dt.unit
+ Series.dt.normalize
Datetime methods
^^^^^^^^^^^^^^^^
@@ -456,22 +472,6 @@ strings and apply several methods to it. These can be accessed like
Series.str.isdecimal
Series.str.get_dummies
-..
- The following is needed to ensure the generated pages are created with the
- correct template (otherwise they would be created in the Series/Index class page)
-
-..
- .. autosummary::
- :toctree: api/
- :template: autosummary/accessor.rst
-
- Series.str
- Series.cat
- Series.dt
- Series.sparse
- DataFrame.sparse
- Index.str
-
.. _api.series.cat:
Categorical accessor
@@ -526,6 +526,46 @@ Sparse-dtype specific methods and attributes are provided under the
Series.sparse.from_coo
Series.sparse.to_coo
+
+.. _api.series.list:
+
+List accessor
+~~~~~~~~~~~~~
+
+Arrow list-dtype specific methods and attributes are provided under the
+``Series.list`` accessor.
+
+.. autosummary::
+ :toctree: api/
+ :template: autosummary/accessor_method.rst
+
+ Series.list.flatten
+ Series.list.len
+ Series.list.__getitem__
+
+
+.. _api.series.struct:
+
+Struct accessor
+~~~~~~~~~~~~~~~
+
+Arrow struct-dtype specific methods and attributes are provided under the
+``Series.struct`` accessor.
+
+.. autosummary::
+ :toctree: api/
+ :template: autosummary/accessor_attribute.rst
+
+ Series.struct.dtypes
+
+.. autosummary::
+ :toctree: api/
+ :template: autosummary/accessor_method.rst
+
+ Series.struct.field
+ Series.struct.explode
+
+
.. _api.series.flags:
Flags
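A minimal sketch of the new Arrow-backed accessors listed above (assumes ``pyarrow`` is installed; the data is made up for illustration):

.. code-block:: python

    import pandas as pd
    import pyarrow as pa

    # List accessor: requires an ArrowDtype list column.
    s = pd.Series([[1, 2, 3], [4]], dtype=pd.ArrowDtype(pa.list_(pa.int64())))
    s.list.len()       # length of each list
    s.list[0]          # first element of each list
    s.list.flatten()   # one row per list element

    # Struct accessor: requires an ArrowDtype struct column.
    t = pd.Series(
        [{"project": "pandas", "version": 2}, {"project": "numpy", "version": 1}],
        dtype=pd.ArrowDtype(
            pa.struct([("project", pa.string()), ("version", pa.int64())])
        ),
    )
    t.struct.field("project")  # extract a single field as a Series
    t.struct.explode()         # expand all fields into a DataFrame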
diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst
index 1a891dca839e3..c8e67710c85a9 100644
--- a/doc/source/user_guide/10min.rst
+++ b/doc/source/user_guide/10min.rst
@@ -451,7 +451,7 @@ Merge
Concat
~~~~~~
-pandas provides various facilities for easily combining together :class:`Series`` and
+pandas provides various facilities for easily combining together :class:`Series` and
:class:`DataFrame` objects with various kinds of set logic for the indexes
and relational algebra functionality in the case of join / merge-type
operations.
@@ -525,7 +525,7 @@ See the :ref:`Grouping section `.
df
Grouping by a column label, selecting column labels, and then applying the
-:meth:`~pandas.core.groupby.DataFrameGroupBy.sum` function to the resulting
+:meth:`.DataFrameGroupBy.sum` function to the resulting
groups:
.. ipython:: python
@@ -610,7 +610,7 @@ financial applications. See the :ref:`Time Series section `.
.. ipython:: python
- rng = pd.date_range("1/1/2012", periods=100, freq="S")
+ rng = pd.date_range("1/1/2012", periods=100, freq="s")
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts.resample("5Min").sum()
diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst
index 682fa4c9b4fcc..453536098cfbb 100644
--- a/doc/source/user_guide/advanced.rst
+++ b/doc/source/user_guide/advanced.rst
@@ -976,7 +976,7 @@ of :ref:`frequency aliases ` with datetime-like inter
pd.interval_range(start=pd.Timestamp("2017-01-01"), periods=4, freq="W")
- pd.interval_range(start=pd.Timedelta("0 days"), periods=3, freq="9H")
+ pd.interval_range(start=pd.Timedelta("0 days"), periods=3, freq="9h")
Additionally, the ``closed`` parameter can be used to specify which side(s) the intervals
are closed on. Intervals are closed on the right side by default.
diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst
index 2e299da5e5794..f7d89110e6c8f 100644
--- a/doc/source/user_guide/basics.rst
+++ b/doc/source/user_guide/basics.rst
@@ -269,7 +269,7 @@ using ``fillna`` if you wish).
.. ipython:: python
df2 = df.copy()
- df2["three"]["a"] = 1.0
+ df2.loc["a", "three"] = 1.0
df
df2
df + df2
@@ -408,20 +408,6 @@ raise a ValueError:
pd.Series(['foo', 'bar', 'baz']) == pd.Series(['foo'])
-Note that this is different from the NumPy behavior where a comparison can
-be broadcast:
-
-.. ipython:: python
-
- np.array([1, 2, 3]) == np.array([2])
-
-or it can return False if broadcasting can not be done:
-
-.. ipython:: python
- :okwarning:
-
- np.array([1, 2, 3]) == np.array([1, 2])
-
Combining overlapping data sets
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -2021,7 +2007,7 @@ documentation sections for more on each type.
| | | | | ``'Int64'``, ``'UInt8'``, ``'UInt16'``,|
| | | | | ``'UInt32'``, ``'UInt64'`` |
+-------------------------------------------------+---------------------------+--------------------+-------------------------------+----------------------------------------+
-| ``nullable float`` | :class:`Float64Dtype`, ...| (none) | :class:`arrays.FloatingArray` | ``'Float32'``, ``'Float64'`` |
+| :ref:`nullable float ` | :class:`Float64Dtype`, ...| (none) | :class:`arrays.FloatingArray` | ``'Float32'``, ``'Float64'`` |
+-------------------------------------------------+---------------------------+--------------------+-------------------------------+----------------------------------------+
| :ref:`Strings ` | :class:`StringDtype` | :class:`str` | :class:`arrays.StringArray` | ``'string'`` |
+-------------------------------------------------+---------------------------+--------------------+-------------------------------+----------------------------------------+
@@ -2275,23 +2261,6 @@ non-conforming elements intermixed that you want to represent as missing:
m = ["apple", pd.Timedelta("1day")]
pd.to_timedelta(m, errors="coerce")
-The ``errors`` parameter has a third option of ``errors='ignore'``, which will simply return the passed in data if it
-encounters any errors with the conversion to a desired data type:
-
-.. ipython:: python
- :okwarning:
-
- import datetime
-
- m = ["apple", datetime.datetime(2016, 3, 2)]
- pd.to_datetime(m, errors="ignore")
-
- m = ["apple", 2, 3]
- pd.to_numeric(m, errors="ignore")
-
- m = ["apple", pd.Timedelta("1day")]
- pd.to_timedelta(m, errors="ignore")
-
In addition to object conversion, :meth:`~pandas.to_numeric` provides another argument ``downcast``, which gives the
option of downcasting the newly (or already) numeric data to a smaller dtype, which can conserve memory:
diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst
index 9efa7df3ff669..8fb991dca02db 100644
--- a/doc/source/user_guide/categorical.rst
+++ b/doc/source/user_guide/categorical.rst
@@ -647,7 +647,7 @@ Pivot tables:
raw_cat = pd.Categorical(["a", "a", "b", "b"], categories=["a", "b", "c"])
df = pd.DataFrame({"A": raw_cat, "B": ["c", "d", "c", "d"], "values": [1, 2, 3, 4]})
- pd.pivot_table(df, values="values", index=["A", "B"])
+ pd.pivot_table(df, values="values", index=["A", "B"], observed=False)
Data munging
------------
@@ -832,9 +832,6 @@ The following table summarizes the results of merging ``Categoricals``:
| category (int) | category (float) | False | float (dtype is inferred) |
+-------------------+------------------------+----------------------+-----------------------------+
-See also the section on :ref:`merge dtypes` for notes about
-preserving merge dtypes and performance.
-
.. _categorical.union:
Unioning
diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst
index 66ee571d6b5a5..b1a6aa8753be1 100644
--- a/doc/source/user_guide/cookbook.rst
+++ b/doc/source/user_guide/cookbook.rst
@@ -459,14 +459,14 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
df
# List the size of the animals with the highest weight.
- df.groupby("animal").apply(lambda subf: subf["size"][subf["weight"].idxmax()])
+ df.groupby("animal").apply(lambda subf: subf["size"][subf["weight"].idxmax()], include_groups=False)
`Using get_group
`__
.. ipython:: python
- gb = df.groupby(["animal"])
+ gb = df.groupby("animal")
gb.get_group("cat")
`Apply to different items in a group
@@ -482,7 +482,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
return pd.Series(["L", avg_weight, True], index=["size", "weight", "adult"])
- expected_df = gb.apply(GrowUp)
+ expected_df = gb.apply(GrowUp, include_groups=False)
expected_df
`Expanding apply
@@ -771,7 +771,7 @@ To create year and month cross tabulation:
df = pd.DataFrame(
{"value": np.random.randn(36)},
- index=pd.date_range("2011-01-01", freq="M", periods=36),
+ index=pd.date_range("2011-01-01", freq="ME", periods=36),
)
pd.pivot_table(
@@ -794,12 +794,12 @@ Apply
index=["I", "II", "III"],
)
- def make_df(ser):
- new_vals = [pd.Series(value, name=name) for name, value in ser.items()]
- return pd.DataFrame(new_vals)
-
- df_orgz = pd.concat({ind: row.pipe(make_df) for ind, row in df.iterrows()})
+ def SeriesFromSubList(aList):
+ return pd.Series(aList)
+ df_orgz = pd.concat(
+ {ind: row.apply(SeriesFromSubList) for ind, row in df.iterrows()}
+ )
df_orgz
`Rolling apply with a DataFrame returning a Series
diff --git a/doc/source/user_guide/copy_on_write.rst b/doc/source/user_guide/copy_on_write.rst
index 59bdb1926895f..a083297925007 100644
--- a/doc/source/user_guide/copy_on_write.rst
+++ b/doc/source/user_guide/copy_on_write.rst
@@ -6,11 +6,17 @@
Copy-on-Write (CoW)
*******************
+.. note::
+
+ Copy-on-Write will become the default in pandas 3.0. We recommend
+ :ref:`turning it on now `
+ to benefit from all improvements.
+
Copy-on-Write was first introduced in version 1.5.0. Starting from version 2.0 most of the
-optimizations that become possible through CoW are implemented and supported. A complete list
-can be found at :ref:`Copy-on-Write optimizations `.
+optimizations that become possible through CoW are implemented and supported. All possible
+optimizations are supported starting from pandas 2.1.
-We expect that CoW will be enabled by default in version 3.0.
+CoW will be enabled by default in version 3.0.
CoW will lead to more predictable behavior since it is not possible to update more than
one object with one statement, e.g. indexing operations or methods won't have side-effects. Additionally, through
@@ -20,7 +26,7 @@ Previous behavior
-----------------
pandas indexing behavior is tricky to understand. Some operations return views while
-other return copies. Depending on the result of the operation, mutation one object
+other return copies. Depending on the result of the operation, mutating one object
might accidentally mutate another:
.. ipython:: python
@@ -46,6 +52,105 @@ it explicitly disallows this. With CoW enabled, ``df`` is unchanged:
The following sections will explain what this means and how it impacts existing
applications.
+.. _copy_on_write.migration_guide:
+
+Migrating to Copy-on-Write
+--------------------------
+
+Copy-on-Write will be the default and only mode in pandas 3.0. This means that users
+need to migrate their code to be compliant with CoW rules.
+
+The default mode in pandas will raise warnings for certain cases that will actively
+change behavior and thus change the behavior the user intended.
+
+We added another mode, e.g.
+
+.. code-block:: python
+
+ pd.options.mode.copy_on_write = "warn"
+
+that will warn for every operation that will change behavior with CoW. We expect this mode
+to be very noisy, since many cases that we don't expect will influence users will
+also emit a warning. We recommend checking this mode and analyzing the warnings, but it is
+not necessary to address all of these warnings. The first two items of the following list
+are the only cases that need to be addressed to make existing code work with CoW.
+
+The following few items describe the user visible changes:
+
+**Chained assignment will never work**
+
+``loc`` should be used as an alternative. Check the
+:ref:`chained assignment section ` for more details.
+
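+A minimal sketch of the replacement pattern (the column names here are only
+illustrative):
+
+.. code-block:: python
+
+    df = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
+    # instead of the chained form df["foo"][df["bar"] > 5] = 100
+    df.loc[df["bar"] > 5, "foo"] = 100
+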
+**Accessing the underlying array of a pandas object will return a read-only view**
+
+
+.. ipython:: python
+
+ ser = pd.Series([1, 2, 3])
+ ser.to_numpy()
+
+This example returns a NumPy array that is a view of the Series object. This view can
+be modified and thus also modify the pandas object. This is not compliant with CoW
+rules. The returned array is set to non-writeable to protect against this behavior.
+Creating a copy of this array allows modification. You can also make the array
+writeable again if you don't care about the pandas object anymore.
+
+See the section about :ref:`read-only NumPy arrays `
+for more details.
+
+**Only one pandas object is updated at once**
+
+The following code snippet updates both ``df`` and ``subset`` without CoW:
+
+.. ipython:: python
+
+ df = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
+ subset = df["foo"]
+ subset.iloc[0] = 100
+ df
+
+This won't be possible anymore with CoW, since the CoW rules explicitly forbid this.
+This includes updating a single column as a :class:`Series` and relying on the change
+propagating back to the parent :class:`DataFrame`.
+This statement can be rewritten into a single statement with ``loc`` or ``iloc`` if
+this behavior is necessary. :meth:`DataFrame.where` is another suitable alternative
+for this case.
+
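+A minimal sketch of such a rewrite, updating the parent directly in a single
+``loc`` statement (reusing the columns from the example above):
+
+.. code-block:: python
+
+    df = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
+    df.loc[df.index[0], "foo"] = 100  # one statement, only df is updated
+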
+Updating a column selected from a :class:`DataFrame` with an inplace method will
+also not work anymore.
+
+.. ipython:: python
+ :okwarning:
+
+ df = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
+ df["foo"].replace(1, 5, inplace=True)
+ df
+
+This is another form of chained assignment. This can generally be rewritten in 2
+different forms:
+
+.. ipython:: python
+
+ df = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
+ df.replace({"foo": {1: 5}}, inplace=True)
+ df
+
+A different alternative would be to not use ``inplace``:
+
+.. ipython:: python
+
+ df = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
+ df["foo"] = df["foo"].replace(1, 5)
+ df
+
+**Constructors now copy NumPy arrays by default**
+
+The Series and DataFrame constructors will now copy NumPy arrays by default when not
+otherwise specified. This was changed to avoid mutating a pandas object when the
+NumPy array is changed inplace outside of pandas. You can set ``copy=False`` to
+avoid this copy.
+
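+Assuming Copy-on-Write semantics, a minimal sketch of the new constructor default
+(with ``copy=False`` the data would be shared instead):
+
+.. code-block:: python
+
+    arr = np.array([1, 2, 3])
+    ser = pd.Series(arr)  # the array is copied by default
+    arr[0] = 100          # ser is unaffected by this mutation
+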
Description
-----------
@@ -123,6 +228,8 @@ CoW triggers a copy when ``df`` is changed to avoid mutating ``view`` as well:
df
view
+.. _copy_on_write_chained_assignment:
+
Chained Assignment
------------------
@@ -130,6 +237,7 @@ Chained assignment references a technique where an object is updated through
two subsequent indexing operations, e.g.
.. ipython:: python
+ :okwarning:
with pd.option_context("mode.copy_on_write", False):
df = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
@@ -154,6 +262,79 @@ With copy on write this can be done by using ``loc``.
df.loc[df["bar"] > 5, "foo"] = 100
+.. _copy_on_write_read_only_na:
+
+Read-only NumPy arrays
+----------------------
+
+Accessing the underlying NumPy array of a DataFrame will return a read-only array if the array
+shares data with the initial DataFrame.
+
+The array is a copy if the initial DataFrame consists of more than one array:
+
+
+.. ipython:: python
+
+ df = pd.DataFrame({"a": [1, 2], "b": [1.5, 2.5]})
+ df.to_numpy()
+
+The array shares data with the DataFrame if the DataFrame consists of only one NumPy array:
+
+.. ipython:: python
+
+ df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+ df.to_numpy()
+
+This array is read-only, which means that it can't be modified inplace:
+
+.. ipython:: python
+ :okexcept:
+
+ arr = df.to_numpy()
+ arr[0, 0] = 100
+
+The same holds true for a Series, since a Series always consists of a single array.
+
+There are two potential solutions to this:
+
+- Trigger a copy manually if you want to avoid updating DataFrames that share memory with your array.
+- Make the array writeable. This is a more performant solution but circumvents Copy-on-Write rules, so
+ it should be used with caution.
+
+.. ipython:: python
+
+ arr = df.to_numpy()
+ arr.flags.writeable = True
+ arr[0, 0] = 100
+ arr
+
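+The first option, taking a copy manually, could look like this (a minimal sketch):
+
+.. code-block:: python
+
+    arr = df.to_numpy().copy()  # detached from df, so the copy is writeable
+    arr[0, 0] = 100
+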
+Patterns to avoid
+-----------------
+
+A defensive copy is performed whenever two objects share the same data and
+you modify one of them inplace.
+
+.. ipython:: python
+
+ df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+ df2 = df.reset_index(drop=True)
+ df2.iloc[0, 0] = 100
+
+This creates two objects that share data and thus the setitem operation will trigger a
+copy. This is not necessary if the initial object ``df`` isn't needed anymore.
+Simply reassigning to the same variable will invalidate the reference that is
+held by the object.
+
+.. ipython:: python
+
+ df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+ df = df.reset_index(drop=True)
+ df.iloc[0, 0] = 100
+
+No copy is necessary in this example.
+Creating multiple references keeps unnecessary references alive
+and thus will hurt performance with Copy-on-Write.
+
.. _copy_on_write.optimizations:
Copy-on-Write optimizations
@@ -161,63 +342,14 @@ Copy-on-Write optimizations
A new lazy copy mechanism that defers the copy until the object in question is modified
and only if this object shares data with another object. This mechanism was added to
-following methods:
-
- - :meth:`DataFrame.reset_index` / :meth:`Series.reset_index`
- - :meth:`DataFrame.set_index`
- - :meth:`DataFrame.set_axis` / :meth:`Series.set_axis`
- - :meth:`DataFrame.set_flags` / :meth:`Series.set_flags`
- - :meth:`DataFrame.rename_axis` / :meth:`Series.rename_axis`
- - :meth:`DataFrame.reindex` / :meth:`Series.reindex`
- - :meth:`DataFrame.reindex_like` / :meth:`Series.reindex_like`
- - :meth:`DataFrame.assign`
- - :meth:`DataFrame.drop`
- - :meth:`DataFrame.dropna` / :meth:`Series.dropna`
- - :meth:`DataFrame.select_dtypes`
- - :meth:`DataFrame.align` / :meth:`Series.align`
- - :meth:`Series.to_frame`
- - :meth:`DataFrame.rename` / :meth:`Series.rename`
- - :meth:`DataFrame.add_prefix` / :meth:`Series.add_prefix`
- - :meth:`DataFrame.add_suffix` / :meth:`Series.add_suffix`
- - :meth:`DataFrame.drop_duplicates` / :meth:`Series.drop_duplicates`
- - :meth:`DataFrame.droplevel` / :meth:`Series.droplevel`
- - :meth:`DataFrame.reorder_levels` / :meth:`Series.reorder_levels`
- - :meth:`DataFrame.between_time` / :meth:`Series.between_time`
- - :meth:`DataFrame.filter` / :meth:`Series.filter`
- - :meth:`DataFrame.head` / :meth:`Series.head`
- - :meth:`DataFrame.tail` / :meth:`Series.tail`
- - :meth:`DataFrame.isetitem`
- - :meth:`DataFrame.pipe` / :meth:`Series.pipe`
- - :meth:`DataFrame.pop` / :meth:`Series.pop`
- - :meth:`DataFrame.replace` / :meth:`Series.replace`
- - :meth:`DataFrame.shift` / :meth:`Series.shift`
- - :meth:`DataFrame.sort_index` / :meth:`Series.sort_index`
- - :meth:`DataFrame.sort_values` / :meth:`Series.sort_values`
- - :meth:`DataFrame.squeeze` / :meth:`Series.squeeze`
- - :meth:`DataFrame.swapaxes`
- - :meth:`DataFrame.swaplevel` / :meth:`Series.swaplevel`
- - :meth:`DataFrame.take` / :meth:`Series.take`
- - :meth:`DataFrame.to_timestamp` / :meth:`Series.to_timestamp`
- - :meth:`DataFrame.to_period` / :meth:`Series.to_period`
- - :meth:`DataFrame.truncate`
- - :meth:`DataFrame.iterrows`
- - :meth:`DataFrame.tz_convert` / :meth:`Series.tz_localize`
- - :meth:`DataFrame.fillna` / :meth:`Series.fillna`
- - :meth:`DataFrame.interpolate` / :meth:`Series.interpolate`
- - :meth:`DataFrame.ffill` / :meth:`Series.ffill`
- - :meth:`DataFrame.bfill` / :meth:`Series.bfill`
- - :meth:`DataFrame.where` / :meth:`Series.where`
- - :meth:`DataFrame.infer_objects` / :meth:`Series.infer_objects`
- - :meth:`DataFrame.astype` / :meth:`Series.astype`
- - :meth:`DataFrame.convert_dtypes` / :meth:`Series.convert_dtypes`
- - :meth:`DataFrame.join`
- - :meth:`DataFrame.eval`
- - :func:`concat`
- - :func:`merge`
+methods that don't require a copy of the underlying data. Popular examples are :meth:`DataFrame.drop` for ``axis=1``
+and :meth:`DataFrame.rename`.
These methods return views when Copy-on-Write is enabled, which provides a significant
performance improvement compared to the regular execution.
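+
+For instance, with Copy-on-Write enabled, a rename is effectively free until one of the
+two objects is modified (a minimal sketch):
+
+.. code-block:: python
+
+    pd.options.mode.copy_on_write = True
+
+    df = pd.DataFrame({"a": [1, 2, 3]})
+    df2 = df.rename(columns=str.upper)  # no copy of the underlying data yet
+    df2.iloc[0, 0] = 100                # the copy happens here; df is unchanged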
+.. _copy_on_write_enabling:
+
How to enable CoW
-----------------
diff --git a/doc/source/user_guide/dsintro.rst b/doc/source/user_guide/dsintro.rst
index d60532f5f4027..d1e981ee1bbdc 100644
--- a/doc/source/user_guide/dsintro.rst
+++ b/doc/source/user_guide/dsintro.rst
@@ -308,7 +308,7 @@ The row and column labels can be accessed respectively by accessing the
From dict of ndarrays / lists
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The ndarrays must all be the same length. If an index is passed, it must
+All ndarrays must share the same length. If an index is passed, it must
also be the same length as the arrays. If no index is passed, the
result will be ``range(n)``, where ``n`` is the array length.
diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst
index 2ddc3e709be85..8c510173819e0 100644
--- a/doc/source/user_guide/enhancingperf.rst
+++ b/doc/source/user_guide/enhancingperf.rst
@@ -183,8 +183,8 @@ can be improved by passing an ``np.ndarray``.
...: return s * dx
...: cpdef np.ndarray[double] apply_integrate_f(np.ndarray col_a, np.ndarray col_b,
...: np.ndarray col_N):
- ...: assert (col_a.dtype == np.float_
- ...: and col_b.dtype == np.float_ and col_N.dtype == np.int_)
+ ...: assert (col_a.dtype == np.float64
+ ...: and col_b.dtype == np.float64 and col_N.dtype == np.dtype(int))
...: cdef Py_ssize_t i, n = len(col_N)
...: assert (len(col_a) == len(col_b) == n)
...: cdef np.ndarray[double] res = np.empty(n)
@@ -453,7 +453,7 @@ by evaluate arithmetic and boolean expression all at once for large :class:`~pan
:func:`~pandas.eval` is many orders of magnitude slower for
smaller expressions or objects than plain Python. A good rule of thumb is
to only use :func:`~pandas.eval` when you have a
- :class:`~pandas.core.frame.DataFrame` with more than 10,000 rows.
+ :class:`.DataFrame` with more than 10,000 rows.
Supported syntax
~~~~~~~~~~~~~~~~
diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst
index 67106df328361..99c85ac66623d 100644
--- a/doc/source/user_guide/gotchas.rst
+++ b/doc/source/user_guide/gotchas.rst
@@ -327,7 +327,7 @@ present in the more domain-specific statistical programming language `R
``numpy.unsignedinteger`` | ``uint8, uint16, uint32, uint64``
``numpy.object_`` | ``object_``
``numpy.bool_`` | ``bool_``
- ``numpy.character`` | ``string_, unicode_``
+ ``numpy.character`` | ``bytes_, str_``
The R language, by contrast, only has a handful of built-in data types:
``integer``, ``numeric`` (floating-point), ``character``, and
@@ -379,7 +379,7 @@ constructors using something similar to the following:
.. ipython:: python
x = np.array(list(range(10)), ">i4") # big endian
- newx = x.byteswap().newbyteorder() # force native byteorder
+ newx = x.byteswap().view(x.dtype.newbyteorder()) # force native byteorder
s = pd.Series(newx)
See `the NumPy documentation on byte order
diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst
index 75c816f66d5e4..11863f8aead31 100644
--- a/doc/source/user_guide/groupby.rst
+++ b/doc/source/user_guide/groupby.rst
@@ -13,10 +13,8 @@ steps:
* **Applying** a function to each group independently.
* **Combining** the results into a data structure.
-Out of these, the split step is the most straightforward. In fact, in many
-situations we may wish to split the data set into groups and do something with
-those groups. In the apply step, we might wish to do one of the
-following:
+Out of these, the split step is the most straightforward. In the apply step, we
+might wish to do one of the following:
* **Aggregation**: compute a summary statistic (or statistics) for each
group. Some examples:
@@ -53,9 +51,7 @@ of the above three categories.
function.
-Since the set of object instance methods on pandas data structures is generally
-rich and expressive, we often simply want to invoke, say, a DataFrame function
-on each group. The name GroupBy should be quite familiar to those who have used
+The name GroupBy should be quite familiar to those who have used
a SQL-based tool (or ``itertools``), in which you can write code like:
.. code-block:: sql
@@ -65,7 +61,7 @@ a SQL-based tool (or ``itertools``), in which you can write code like:
GROUP BY Column1, Column2
We aim to make operations like this natural and easy to express using
-pandas. We'll address each area of GroupBy functionality then provide some
+pandas. We'll address each area of GroupBy functionality, then provide some
non-trivial examples / use cases.
See the :ref:`cookbook` for some advanced strategies.
@@ -134,6 +130,7 @@ We could naturally group by either the ``A`` or ``B`` columns, or both:
.. ipython:: python
grouped = df.groupby("A")
+ grouped = df.groupby("B")
grouped = df.groupby(["A", "B"])
.. note::
@@ -170,9 +167,11 @@ output of aggregation functions will only contain unique index values:
.. ipython:: python
- lst = [1, 2, 3, 1, 2, 3]
+ index = [1, 2, 3, 1, 2, 3]
- s = pd.Series([1, 2, 3, 10, 20, 30], lst)
+ s = pd.Series([1, 2, 3, 10, 20, 30], index=index)
+
+ s
grouped = s.groupby(level=0)
@@ -211,9 +210,9 @@ For example, the groups created by ``groupby()`` below are in the order they app
.. ipython:: python
df3 = pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]})
- df3.groupby(["X"]).get_group("A")
+ df3.groupby("X").get_group("A")
- df3.groupby(["X"]).get_group("B")
+ df3.groupby(["X"]).get_group(("B",))
.. _groupby.dropna:
@@ -256,8 +255,8 @@ above example we have:
df.groupby("A").groups
df.T.groupby(get_letter_type).groups
-Calling the standard Python ``len`` function on the GroupBy object just returns
-the length of the ``groups`` dict, so it is largely just a convenience:
+Calling the standard Python ``len`` function on the GroupBy object returns
+the number of groups, which is the same as the length of the ``groups`` dictionary:
.. ipython:: python
@@ -268,7 +267,7 @@ the length of the ``groups`` dict, so it is largely just a convenience:
.. _groupby.tabcompletion:
-``GroupBy`` will tab complete column names (and other attributes):
+``GroupBy`` will tab complete column names, GroupBy operations, and other attributes:
.. ipython:: python
@@ -420,6 +419,12 @@ This is mainly syntactic sugar for the alternative, which is much more verbose:
Additionally, this method avoids recomputing the internal grouping information
derived from the passed key.
+You can also include the grouping columns if you want to operate on them.
+
+.. ipython:: python
+
+ grouped[["A", "B"]].sum()
+
.. _groupby.iterating-label:
Iterating through groups
@@ -452,7 +457,7 @@ Selecting a group
-----------------
A single group can be selected using
-:meth:`~pandas.core.groupby.DataFrameGroupBy.get_group`:
+:meth:`.DataFrameGroupBy.get_group`:
.. ipython:: python
@@ -499,7 +504,7 @@ Built-in aggregation methods
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Many common aggregations are built-in to GroupBy objects as methods. Of the methods
-listed below, those with a ``*`` do *not* have a Cython-optimized implementation.
+listed below, those with a ``*`` do *not* have an efficient, GroupBy-specific implementation.
.. csv-table::
:header: "Method", "Description"
@@ -511,8 +516,8 @@ listed below, those with a ``*`` do *not* have a Cython-optimized implementation
:meth:`~.DataFrameGroupBy.count`;Compute the number of non-NA values in the groups
:meth:`~.DataFrameGroupBy.cov` * ;Compute the covariance of the groups
:meth:`~.DataFrameGroupBy.first`;Compute the first occurring value in each group
- :meth:`~.DataFrameGroupBy.idxmax` *;Compute the index of the maximum value in each group
- :meth:`~.DataFrameGroupBy.idxmin` *;Compute the index of the minimum value in each group
+ :meth:`~.DataFrameGroupBy.idxmax`;Compute the index of the maximum value in each group
+ :meth:`~.DataFrameGroupBy.idxmin`;Compute the index of the minimum value in each group
:meth:`~.DataFrameGroupBy.last`;Compute the last occurring value in each group
:meth:`~.DataFrameGroupBy.max`;Compute the maximum value in each group
:meth:`~.DataFrameGroupBy.mean`;Compute the mean of each group
@@ -535,16 +540,16 @@ Some examples:
df.groupby("A")[["C", "D"]].max()
df.groupby(["A", "B"]).mean()
-Another simple aggregation example is to compute the size of each group.
+Another aggregation example is to compute the size of each group.
This is included in GroupBy as the ``size`` method. It returns a Series whose
-index are the group names and whose values are the sizes of each group.
+index consists of the group names and the values are the sizes of each group.
.. ipython:: python
grouped = df.groupby(["A", "B"])
grouped.size()
-While the :meth:`~.DataFrameGroupBy.describe` method is not itself a reducer, it
+While the :meth:`.DataFrameGroupBy.describe` method is not itself a reducer, it
can be used to conveniently produce a collection of summary statistics about each of
the groups.
@@ -553,7 +558,7 @@ the groups.
grouped.describe()
Another aggregation example is to compute the number of unique values of each group.
-This is similar to the ``value_counts`` function, except that it only counts the
+This is similar to the :meth:`.DataFrameGroupBy.value_counts` function, except that it only counts the
number of unique values.
.. ipython:: python
@@ -566,11 +571,11 @@ number of unique values.
.. note::
Aggregation functions **will not** return the groups that you are aggregating over
- as named *columns*, when ``as_index=True``, the default. The grouped columns will
+ as named *columns* when ``as_index=True``, the default. The grouped columns will
be the **indices** of the returned object.
- Passing ``as_index=False`` **will** return the groups that you are aggregating over, if they are
- named **indices** or *columns*.
+ Passing ``as_index=False`` **will** return the groups that you are aggregating over as
+ named columns, regardless of whether they are named **indices** or *columns* in the inputs.
.. _groupby.aggregate.agg:
@@ -596,7 +601,7 @@ Any reduction method that pandas implements can be passed as a string to
grouped.agg("sum")
The result of the aggregation will have the group names as the
-new index along the grouped axis. In the case of multiple keys, the result is a
+new index. In the case of multiple keys, the result is a
:ref:`MultiIndex ` by default. As mentioned above, this can be
changed by using the ``as_index`` option:
@@ -650,16 +655,17 @@ different dtypes, then a common dtype will be determined in the same way as ``Da
Applying multiple functions at once
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-With grouped ``Series`` you can also pass a list or dict of functions to do
-aggregation with, outputting a DataFrame:
+On a grouped ``Series``, you can pass a list or dict of functions to
+:meth:`SeriesGroupBy.agg`, outputting a DataFrame:
.. ipython:: python
grouped = df.groupby("A")
grouped["C"].agg(["sum", "mean", "std"])
-On a grouped ``DataFrame``, you can pass a list of functions to apply to each
-column, which produces an aggregated result with a hierarchical index:
+On a grouped ``DataFrame``, you can pass a list of functions to
+:meth:`DataFrameGroupBy.agg` to aggregate each
+column, which produces an aggregated result with a hierarchical column index:
.. ipython:: python
@@ -824,8 +830,7 @@ A common use of a transformation is to add the result back into the original Dat
Built-in transformation methods
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The following methods on GroupBy act as transformations. Of these methods, only
-``fillna`` does not have a Cython-optimized implementation.
+The following methods on GroupBy act as transformations.
.. csv-table::
:header: "Method", "Description"
@@ -840,15 +845,14 @@ The following methods on GroupBy act as transformations. Of these methods, only
:meth:`~.DataFrameGroupBy.cumsum`;Compute the cumulative sum within each group
:meth:`~.DataFrameGroupBy.diff`;Compute the difference between adjacent values within each group
:meth:`~.DataFrameGroupBy.ffill`;Forward fill NA values within each group
- :meth:`~.DataFrameGroupBy.fillna`;Fill NA values within each group
:meth:`~.DataFrameGroupBy.pct_change`;Compute the percent change between adjacent values within each group
:meth:`~.DataFrameGroupBy.rank`;Compute the rank of each value within each group
:meth:`~.DataFrameGroupBy.shift`;Shift values up or down within each group
In addition, passing any built-in aggregation method as a string to
:meth:`~.DataFrameGroupBy.transform` (see the next section) will broadcast the result
-across the group, producing a transformed result. If the aggregation method is
-Cython-optimized, this will be performant as well.
+across the group, producing a transformed result. If the aggregation method has an efficient
+implementation, this will be performant as well.
.. _groupby.transformation.transform:
@@ -890,7 +894,7 @@ also accept User-Defined Functions (UDFs). The UDF must:
the built-in methods.
All of the examples in this section can be made more performant by calling
- built-in methods instead of using ``transform``.
+ built-in methods instead of using UDFs.
See :ref:`below for examples `.
.. versionchanged:: 2.0.0
@@ -921,7 +925,7 @@ Suppose we wish to standardize the data within each group:
We would expect the result to now have mean 0 and standard deviation 1 within
-each group, which we can easily check:
+each group (up to floating-point error), which we can easily check:
.. ipython:: python
@@ -995,18 +999,18 @@ using a UDF is commented out and the faster alternative appears below.
.. ipython:: python
- # ts.groupby(lambda x: x.year).transform(
+ # result = ts.groupby(lambda x: x.year).transform(
# lambda x: (x - x.mean()) / x.std()
# )
grouped = ts.groupby(lambda x: x.year)
result = (ts - grouped.transform("mean")) / grouped.transform("std")
- # ts.groupby(lambda x: x.year).transform(lambda x: x.max() - x.min())
+ # result = ts.groupby(lambda x: x.year).transform(lambda x: x.max() - x.min())
grouped = ts.groupby(lambda x: x.year)
result = grouped.transform("max") - grouped.transform("min")
# grouped = data_df.groupby(key)
- # grouped.transform(lambda x: x.fillna(x.mean()))
+ # result = grouped.transform(lambda x: x.fillna(x.mean()))
grouped = data_df.groupby(key)
result = data_df.fillna(grouped.transform("mean"))
@@ -1053,14 +1057,14 @@ missing values with the ``ffill()`` method.
).set_index("date")
df_re
- df_re.groupby("group").resample("1D").ffill()
+ df_re.groupby("group").resample("1D", include_groups=False).ffill()
.. _groupby.filter:
Filtration
----------
-A filtration is a GroupBy operation the subsets the original grouping object. It
+A filtration is a GroupBy operation that subsets the original grouping object. It
may either filter out entire groups, part of groups, or both. Filtrations return
a filtered version of the calling object, including the grouping columns when provided.
In the following example, ``class`` is included in the result.
@@ -1085,8 +1089,8 @@ Filtrations will respect subsetting the columns of the GroupBy object.
Built-in filtrations
~~~~~~~~~~~~~~~~~~~~
-The following methods on GroupBy act as filtrations. All these methods have a
-Cython-optimized implementation.
+The following methods on GroupBy act as filtrations. All these methods have an
+efficient, GroupBy-specific implementation.
.. csv-table::
:header: "Method", "Description"
@@ -1207,6 +1211,19 @@ The dimension of the returned result can also change:
grouped.apply(f)
+``apply`` on a Series can operate on a returned value from the applied function
+that is itself a Series, and possibly upcast the result to a DataFrame:
+
+.. ipython:: python
+
+ def f(x):
+ return pd.Series([x, x ** 2], index=["x", "x^2"])
+
+
+ s = pd.Series(np.random.rand(5))
+ s
+ s.apply(f)
+
Similar to :ref:`groupby.aggregate.agg`, the resulting dtype will reflect that of the
apply function. If the results from different groups have different dtypes, then
a common dtype will be determined in the same way as ``DataFrame`` construction.
@@ -1219,13 +1236,13 @@ the argument ``group_keys`` which defaults to ``True``. Compare
.. ipython:: python
- df.groupby("A", group_keys=True).apply(lambda x: x)
+ df.groupby("A", group_keys=True).apply(lambda x: x, include_groups=False)
with
.. ipython:: python
- df.groupby("A", group_keys=False).apply(lambda x: x)
+ df.groupby("A", group_keys=False).apply(lambda x: x, include_groups=False)
Numba Accelerated Routines
@@ -1250,8 +1267,8 @@ will be passed into ``values``, and the group index will be passed into ``index`
Other useful features
---------------------
-Exclusion of "nuisance" columns
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Exclusion of non-numeric columns
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Again consider the example DataFrame we've been looking at:
@@ -1261,8 +1278,8 @@ Again consider the example DataFrame we've been looking at:
Suppose we wish to compute the standard deviation grouped by the ``A``
column. There is a slight problem, namely that we don't care about the data in
-column ``B`` because it is not numeric. We refer to these non-numeric columns as
-"nuisance" columns. You can avoid nuisance columns by specifying ``numeric_only=True``:
+column ``B`` because it is not numeric. You can avoid non-numeric columns by
+specifying ``numeric_only=True``:
.. ipython:: python
@@ -1289,17 +1306,8 @@ is only needed over one column (here ``colname``), it may be filtered
],
}
)
-
- # Decimal columns can be sum'd explicitly by themselves...
df_dec.groupby(["id"])[["dec_column"]].sum()
- # ...but cannot be combined with standard data types or they will be excluded
- df_dec.groupby(["id"])[["int_column", "dec_column"]].sum()
-
- # Use .agg function to aggregate over standard and "nuisance" data types
- # at the same time
- df_dec.groupby(["id"]).agg({"int_column": "sum", "dec_column": "sum"})
-
.. _groupby.observed:
Handling of (un)observed Categorical values
@@ -1331,35 +1339,55 @@ The returned dtype of the grouped will *always* include *all* of the categories
s = (
pd.Series([1, 1, 1])
- .groupby(pd.Categorical(["a", "a", "a"], categories=["a", "b"]), observed=False)
+ .groupby(pd.Categorical(["a", "a", "a"], categories=["a", "b"]), observed=True)
.count()
)
s.index.dtype
.. _groupby.missing:
-NA and NaT group handling
-~~~~~~~~~~~~~~~~~~~~~~~~~
+NA group handling
+~~~~~~~~~~~~~~~~~
+
+By ``NA``, we are referring to any ``NA`` values, including
+:class:`NA`, ``NaN``, ``NaT``, and ``None``. If there are any ``NA`` values in the
+grouping key, by default these will be excluded. In other words, any
+"``NA`` group" will be dropped. You can include NA groups by specifying ``dropna=False``.
-If there are any NaN or NaT values in the grouping key, these will be
-automatically excluded. In other words, there will never be an "NA group" or
-"NaT group". This was not the case in older versions of pandas, but users were
-generally discarding the NA group anyway (and supporting it was an
-implementation headache).
+.. ipython:: python
+
+ df = pd.DataFrame({"key": [1.0, 1.0, np.nan, 2.0, np.nan], "A": [1, 2, 3, 4, 5]})
+ df
+
+ df.groupby("key", dropna=True).sum()
+
+ df.groupby("key", dropna=False).sum()
Grouping with ordered factors
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Categorical variables represented as instances of pandas's ``Categorical`` class
-can be used as group keys. If so, the order of the levels will be preserved:
+can be used as group keys. If so, the order of the levels will be preserved. When
+``observed=False`` and ``sort=False``, any unobserved categories will be at the
+end of the result in order.
.. ipython:: python
- data = pd.Series(np.random.randn(100))
+ days = pd.Categorical(
+ values=["Wed", "Mon", "Thu", "Mon", "Wed", "Sat"],
+ categories=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"],
+ )
+ data = pd.DataFrame(
+ {
+ "day": days,
+ "workers": [3, 4, 1, 4, 2, 2],
+ }
+ )
+ data
- factor = pd.qcut(data, [0, 0.25, 0.5, 0.75, 1.0])
+ data.groupby("day", observed=False, sort=True).sum()
- data.groupby(factor, observed=False).mean()
+ data.groupby("day", observed=False, sort=False).sum()
.. _groupby.specify:
@@ -1397,19 +1425,20 @@ Groupby a specific column with the desired frequency. This is like resampling.
.. ipython:: python
- df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"])[["Quantity"]].sum()
+ df.groupby([pd.Grouper(freq="1ME", key="Date"), "Buyer"])[["Quantity"]].sum()
When ``freq`` is specified, the object returned by ``pd.Grouper`` will be an
-instance of ``pandas.api.typing.TimeGrouper``. You have an ambiguous specification
-in that you have a named index and a column that could be potential groupers.
+instance of ``pandas.api.typing.TimeGrouper``. When there is a column and index
+with the same name, you can use ``key`` to group by the column and ``level``
+to group by the index.
.. ipython:: python
df = df.set_index("Date")
df["Date"] = df.index + pd.offsets.MonthEnd(2)
- df.groupby([pd.Grouper(freq="6M", key="Date"), "Buyer"])[["Quantity"]].sum()
+ df.groupby([pd.Grouper(freq="6ME", key="Date"), "Buyer"])[["Quantity"]].sum()
- df.groupby([pd.Grouper(freq="6M", level="Date"), "Buyer"])[["Quantity"]].sum()
+ df.groupby([pd.Grouper(freq="6ME", level="Date"), "Buyer"])[["Quantity"]].sum()
Taking the first rows of each group
@@ -1512,7 +1541,7 @@ Enumerate groups
To see the ordering of the groups (as opposed to the order of rows
within a group given by ``cumcount``) you can use
-:meth:`~pandas.core.groupby.DataFrameGroupBy.ngroup`.
+:meth:`.DataFrameGroupBy.ngroup`.
@@ -1594,7 +1623,7 @@ code more readable. First we set the data:
)
df.head(2)
-Now, to find prices per store/product, we can simply do:
+We now find the prices per store/product.
.. ipython:: python
@@ -1624,24 +1653,12 @@ object as a parameter into the function you specify.
Examples
--------
-Regrouping by factor
-~~~~~~~~~~~~~~~~~~~~
-
-Regroup columns of a DataFrame according to their sum, and sum the aggregated ones.
-
-.. ipython:: python
-
- df = pd.DataFrame({"a": [1, 0, 0], "b": [0, 1, 0], "c": [1, 0, 0], "d": [2, 3, 4]})
- df
- dft = df.T
- dft.groupby(dft.sum()).sum()
-
.. _groupby.multicolumn_factorization:
Multi-column factorization
~~~~~~~~~~~~~~~~~~~~~~~~~~
-By using :meth:`~pandas.core.groupby.DataFrameGroupBy.ngroup`, we can extract
+By using :meth:`.DataFrameGroupBy.ngroup`, we can extract
information about the groups in a way similar to :func:`factorize` (as described
further in the :ref:`reshaping API `) but which applies
naturally to multiple columns of mixed type and different
@@ -1670,7 +1687,7 @@ Resampling produces new hypothetical samples (resamples) from already existing o
In order for resample to work on indices that are non-datetimelike, the following procedure can be utilized.
-In the following examples, **df.index // 5** returns a binary array which is used to determine what gets selected for the groupby operation.
+In the following examples, **df.index // 5** returns an integer array which is used to determine what gets selected for the groupby operation.
.. note::
@@ -1709,7 +1726,7 @@ column index name will be used as the name of the inserted column:
result = {"b_sum": x["b"].sum(), "c_mean": x["c"].mean()}
return pd.Series(result, name="metrics")
- result = df.groupby("a").apply(compute_metrics)
+ result = df.groupby("a").apply(compute_metrics, include_groups=False)
result
diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst
index 52bc43f52b1d3..4954ee1538697 100644
--- a/doc/source/user_guide/indexing.rst
+++ b/doc/source/user_guide/indexing.rst
@@ -62,6 +62,8 @@ of multi-axis indexing.
* A boolean array (any ``NA`` values will be treated as ``False``).
* A ``callable`` function with one argument (the calling Series or DataFrame) and
that returns valid output for indexing (one of the above).
+ * A tuple of row (and column) indices whose elements are one of the
+ above inputs.
See more at :ref:`Selection by Label `.
@@ -78,6 +80,8 @@ of multi-axis indexing.
* A boolean array (any ``NA`` values will be treated as ``False``).
* A ``callable`` function with one argument (the calling Series or DataFrame) and
that returns valid output for indexing (one of the above).
+ * A tuple of row (and column) indices whose elements are one of the
+ above inputs.
See more at :ref:`Selection by Position `,
:ref:`Advanced Indexing ` and :ref:`Advanced
@@ -85,6 +89,12 @@ of multi-axis indexing.
* ``.loc``, ``.iloc``, and also ``[]`` indexing can accept a ``callable`` as indexer. See more at :ref:`Selection By Callable `.
+ .. note::
+
+ Destructuring tuple keys into row (and column) indexes occurs
+ *before* callables are applied, so you cannot return a tuple from
+ a callable to index both rows and columns.
+
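+For illustration, a tuple key is unpacked into separate row and column indexers
+(the frame below is purely a hypothetical example):
+
+.. code-block:: python
+
+    df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["x", "y", "z"])
+    df.loc[(["x", "z"], "A")]  # equivalent to df.loc[["x", "z"], "A"]
+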
Getting values from an object with multi-axes selection uses the following
notation (using ``.loc`` as an example, but the following applies to ``.iloc`` as
well). Any of the axes accessors may be the null slice ``:``. Axes left out of
@@ -450,6 +460,8 @@ The ``.iloc`` attribute is the primary access method. The following are valid in
* A slice object with ints ``1:7``.
* A boolean array.
* A ``callable``, see :ref:`Selection By Callable `.
+* A tuple of row (and column) indexes, whose elements are one of the
+ above types.
.. ipython:: python
@@ -553,6 +565,12 @@ Selection by callable
``.loc``, ``.iloc``, and also ``[]`` indexing can accept a ``callable`` as indexer.
The ``callable`` must be a function with one argument (the calling Series or DataFrame) that returns valid output for indexing.
+.. note::
+
+ For ``.iloc`` indexing, returning a tuple from the callable is
+ not supported, since tuple destructuring for row and column indexes
+ occurs *before* applying callables.
+
.. ipython:: python
df1 = pd.DataFrame(np.random.randn(6, 4),
@@ -1709,6 +1727,22 @@ You can assign a custom index to the ``index`` attribute:
Returning a view versus a copy
------------------------------
+.. warning::
+
+ :ref:`Copy-on-Write `
+ will become the new default in pandas 3.0. This means that chained indexing will
+ never work. As a consequence, the ``SettingWithCopyWarning`` won't be necessary
+ anymore.
+ See :ref:`this section `
+ for more context.
+ We recommend turning Copy-on-Write on to leverage the improvements with
+
+ .. code-block:: python
+
+    pd.options.mode.copy_on_write = True
+
+ even before pandas 3.0 is available.
+
When setting values in a pandas object, care must be taken to avoid what is called
``chained indexing``. Here is an example.
@@ -1747,6 +1781,22 @@ faster, and allows one to index *both* axes if so desired.
Why does assignment fail when using chained indexing?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. warning::
+
+ :ref:`Copy-on-Write `
+ will become the new default in pandas 3.0. This means that chained indexing will
+ never work. As a consequence, the ``SettingWithCopyWarning`` won't be necessary
+ anymore.
+ See :ref:`this section `
+ for more context.
+ We recommend turning Copy-on-Write on to leverage the improvements with
+
+ .. code-block:: python
+
+    pd.options.mode.copy_on_write = True
+
+ even before pandas 3.0 is available.
+
The problem in the previous section is just a performance issue. What's up with
the ``SettingWithCopy`` warning? We don't **usually** throw warnings around when
you do something that might cost a few extra milliseconds!
@@ -1803,6 +1853,22 @@ Yikes!
Evaluation order matters
~~~~~~~~~~~~~~~~~~~~~~~~
+.. warning::
+
+ :ref:`Copy-on-Write `
+ will become the new default in pandas 3.0. This means that chained indexing will
+ never work. As a consequence, the ``SettingWithCopyWarning`` won't be necessary
+ anymore.
+ See :ref:`this section `
+ for more context.
+ We recommend turning Copy-on-Write on to leverage the improvements with
+
+ .. code-block:: python
+
+    pd.options.mode.copy_on_write = True
+
+ even before pandas 3.0 is available.
+
When you use chained indexing, the order and type of the indexing operation
partially determine whether the result is a slice into the original object, or
a copy of the slice.
@@ -1837,7 +1903,7 @@ This however is operating on a copy and will not work.
:okwarning:
:okexcept:
- with option_context('mode.chained_assignment','warn'):
+ with pd.option_context('mode.chained_assignment','warn'):
dfb[dfb['a'].str.startswith('o')]['c'] = 42
A chained assignment can also crop up in setting in a mixed dtype frame.
@@ -1879,7 +1945,7 @@ Last, the subsequent example will **not** work at all, and so should be avoided:
:okwarning:
:okexcept:
- with option_context('mode.chained_assignment','raise'):
+ with pd.option_context('mode.chained_assignment','raise'):
dfd.loc[0]['a'] = 1111
.. warning::
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 6e352c52cd60e..b3ad23e0d4104 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -81,6 +81,9 @@ delim_whitespace : boolean, default False
If this option is set to ``True``, nothing should be passed in for the
``delimiter`` parameter.
+ .. deprecated:: 2.2.0
+    Use ``sep="\\s+"`` instead.
+
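+ For example (a minimal sketch; the file name is hypothetical):
+
+ .. code-block:: python
+
+    pd.read_csv("data.txt", sep=r"\s+")  # instead of delim_whitespace=True
+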
Column and index locations and names
++++++++++++++++++++++++++++++++++++
@@ -836,6 +839,7 @@ order) and the new column names will be the concatenation of the component
column names:
.. ipython:: python
+ :okwarning:
data = (
"KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
@@ -856,6 +860,7 @@ By default the parser removes the component date columns, but you can choose
to retain them via the ``keep_date_col`` keyword:
.. ipython:: python
+ :okwarning:
df = pd.read_csv(
"tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]], keep_date_col=True
@@ -871,6 +876,7 @@ single column.
You can also use a dict to specify custom name columns:
.. ipython:: python
+ :okwarning:
date_spec = {"nominal": [1, 2], "actual": [1, 3]}
df = pd.read_csv("tmp.csv", header=None, parse_dates=date_spec)
@@ -883,6 +889,7 @@ data columns:
.. ipython:: python
+ :okwarning:
date_spec = {"nominal": [1, 2], "actual": [1, 3]}
df = pd.read_csv(
@@ -902,6 +909,10 @@ data columns:
for your data to store datetimes in this format, load times will be
significantly faster, ~20x has been observed.
+.. deprecated:: 2.2.0
+ Combining date columns inside read_csv is deprecated. Use ``pd.to_datetime``
+ on the relevant result columns instead.
+
Date parsing functions
++++++++++++++++++++++
@@ -1490,9 +1501,9 @@ rows will skip the intervening rows.
.. ipython:: python
- from pandas._testing import makeCustomDataframe as mkdf
-
- df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
+ mi_idx = pd.MultiIndex.from_arrays([[1, 2, 3, 4], list("abcd")], names=list("ab"))
+ mi_col = pd.MultiIndex.from_arrays([[1, 2], list("ab")], names=list("cd"))
+ df = pd.DataFrame(np.ones((4, 2)), index=mi_idx, columns=mi_col)
df.to_csv("mi.csv")
print(open("mi.csv").read())
pd.read_csv("mi.csv", header=[0, 1, 2, 3], index_col=[0, 1])
@@ -1811,8 +1822,8 @@ Writing JSON
A ``Series`` or ``DataFrame`` can be converted to a valid JSON string. Use ``to_json``
with optional parameters:
-* ``path_or_buf`` : the pathname or buffer to write the output
- This can be ``None`` in which case a JSON string is returned
+* ``path_or_buf`` : the pathname or buffer to write the output.
+ This can be ``None`` in which case a JSON string is returned.
* ``orient`` :
``Series``:
@@ -2332,7 +2343,7 @@ A few notes on the generated table schema:
.. ipython:: python
- s_per = pd.Series(1, index=pd.period_range("2016", freq="A-DEC", periods=4))
+ s_per = pd.Series(1, index=pd.period_range("2016", freq="Y-DEC", periods=4))
build_table_schema(s_per)
* Categoricals use the ``any`` type and an ``enum`` constraint listing
@@ -2448,7 +2459,7 @@ Read a URL with no options:
.. code-block:: ipython
- In [320]: "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list"
+ In [320]: url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list"
In [321]: pd.read_html(url)
Out[321]:
[ Bank NameBank CityCity StateSt ... Acquiring InstitutionAI Closing DateClosing FundFund
@@ -2619,7 +2630,7 @@ columns to strings.
.. code-block:: python
- url_mcc = "https://en.wikipedia.org/wiki/Mobile_country_code"
+ url_mcc = "https://en.wikipedia.org/wiki/Mobile_country_code?oldid=899173761"
dfs = pd.read_html(
url_mcc,
match="Telekom Albania",
@@ -2701,7 +2712,7 @@ in the method ``to_string`` described above.
.. note::
Not all of the possible options for ``DataFrame.to_html`` are shown here for
- brevity's sake. See :func:`~pandas.core.frame.DataFrame.to_html` for the
+ brevity's sake. See :func:`.DataFrame.to_html` for the
full set of options.
.. note::
@@ -3453,26 +3464,22 @@ Excel files
The :func:`~pandas.read_excel` method can read Excel 2007+ (``.xlsx``) files
using the ``openpyxl`` Python module. Excel 2003 (``.xls``) files
can be read using ``xlrd``. Binary Excel (``.xlsb``)
-files can be read using ``pyxlsb``.
+files can be read using ``pyxlsb``. All formats can be read
+using the :ref:`calamine` engine.
The :meth:`~DataFrame.to_excel` instance method is used for
saving a ``DataFrame`` to Excel. Generally the semantics are
similar to working with :ref:`csv` data.
See the :ref:`cookbook` for some advanced strategies.
-.. warning::
-
- The `xlrd `__ package is now only for reading
- old-style ``.xls`` files.
+.. note::
- Before pandas 1.3.0, the default argument ``engine=None`` to :func:`~pandas.read_excel`
- would result in using the ``xlrd`` engine in many cases, including new
- Excel 2007+ (``.xlsx``) files. pandas will now default to using the
- `openpyxl `__ engine.
+ When ``engine=None``, the following logic will be used to determine the engine:
- It is strongly encouraged to install ``openpyxl`` to read Excel 2007+
- (``.xlsx``) files.
- **Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.**
- This is no longer supported, switch to using ``openpyxl`` instead.
+ - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
+ then `odf `_ will be used.
+ - Otherwise if ``path_or_buffer`` is an xls format, ``xlrd`` will be used.
+ - Otherwise if ``path_or_buffer`` is in xlsb format, ``pyxlsb`` will be used.
+ - Otherwise ``openpyxl`` will be used.
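+
+ To bypass this inference, the engine can also be passed explicitly
+ (a minimal sketch; the file name is hypothetical):
+
+ .. code-block:: python
+
+    pd.read_excel("path_to_file.ods", engine="odf")
+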
.. _io.excel_reader:
@@ -3494,6 +3501,9 @@ using internally.
* For the engine odf, pandas is using :func:`odf.opendocument.load` to read in (``.ods``) files.
+* For the engine calamine, pandas is using :func:`python_calamine.load_workbook`
+ to read in (``.xlsx``), (``.xlsm``), (``.xls``), (``.xlsb``), (``.ods``) files.
+
.. code-block:: python
# Returns a DataFrame
@@ -3935,7 +3945,8 @@ The :func:`~pandas.read_excel` method can also read binary Excel files
using the ``pyxlsb`` module. The semantics and features for reading
binary Excel files mostly match what can be done for `Excel files`_ using
``engine='pyxlsb'``. ``pyxlsb`` does not recognize datetime types
-in files and will return floats instead.
+in files and will return floats instead (you can use :ref:`calamine`
+if you need to recognize datetime types).
.. code-block:: python
@@ -3947,6 +3958,20 @@ in files and will return floats instead.
Currently pandas only supports *reading* binary Excel files. Writing
is not implemented.
+.. _io.calamine:
+
+Calamine (Excel and ODS files)
+------------------------------
+
+The :func:`~pandas.read_excel` method can read Excel files (``.xlsx``, ``.xlsm``, ``.xls``, ``.xlsb``)
+and OpenDocument spreadsheets (``.ods``) using the ``python-calamine`` module.
+This module is a binding for the Rust library `calamine `__
+and is faster than other engines in most cases. The optional dependency ``python-calamine`` needs to be installed.
+
+.. code-block:: python
+
+ # Returns a DataFrame
+ pd.read_excel("path_to_file.xlsb", engine="calamine")
.. _io.clipboard:
@@ -4220,7 +4245,7 @@ similar to how ``read_csv`` and ``to_csv`` work.
.. ipython:: python
df_tl = pd.DataFrame({"A": list(range(5)), "B": list(range(5))})
- df_tl.to_hdf("store_tl.h5", "table", append=True)
+ df_tl.to_hdf("store_tl.h5", key="table", append=True)
pd.read_hdf("store_tl.h5", "table", where=["index>2"])
.. ipython:: python
@@ -4243,12 +4268,12 @@ HDFStore will by default not drop rows that are all missing. This behavior can b
)
df_with_missing
- df_with_missing.to_hdf("file.h5", "df_with_missing", format="table", mode="w")
+ df_with_missing.to_hdf("file.h5", key="df_with_missing", format="table", mode="w")
pd.read_hdf("file.h5", "df_with_missing")
df_with_missing.to_hdf(
- "file.h5", "df_with_missing", format="table", mode="w", dropna=True
+ "file.h5", key="df_with_missing", format="table", mode="w", dropna=True
)
pd.read_hdf("file.h5", "df_with_missing")
@@ -4278,7 +4303,7 @@ This format is specified by default when using ``put`` or ``to_hdf`` or by ``for
.. ipython:: python
:okexcept:
- pd.DataFrame(np.random.randn(10, 2)).to_hdf("test_fixed.h5", "df")
+ pd.DataFrame(np.random.randn(10, 2)).to_hdf("test_fixed.h5", key="df")
pd.read_hdf("test_fixed.h5", "df", where="index>5")
.. ipython:: python
@@ -4881,7 +4906,7 @@ unspecified columns of the given DataFrame. The argument ``selector``
defines which table is the selector table (which you can make queries from).
The argument ``dropna`` will drop rows from the input ``DataFrame`` to ensure
tables are synchronized. This means that if a row for one of the tables
-being written to is entirely ``np.NaN``, that row will be dropped from all tables.
+being written to is entirely ``np.nan``, that row will be dropped from all tables.
If ``dropna`` is False, **THE USER IS RESPONSIBLE FOR SYNCHRONIZING THE TABLES**.
Remember that entirely ``np.Nan`` rows are not written to the HDFStore, so if
@@ -5352,7 +5377,6 @@ See the documentation for `pyarrow `__ an
Write to a parquet file.
.. ipython:: python
- :okwarning:
df.to_parquet("example_pa.parquet", engine="pyarrow")
df.to_parquet("example_fp.parquet", engine="fastparquet")
@@ -5360,7 +5384,6 @@ Write to a parquet file.
Read from a parquet file.
.. ipython:: python
- :okwarning:
result = pd.read_parquet("example_fp.parquet", engine="fastparquet")
result = pd.read_parquet("example_pa.parquet", engine="pyarrow")
@@ -5370,7 +5393,6 @@ Read from a parquet file.
By setting the ``dtype_backend`` argument you can control the default dtypes used for the resulting DataFrame.
.. ipython:: python
- :okwarning:
result = pd.read_parquet("example_pa.parquet", engine="pyarrow", dtype_backend="pyarrow")
@@ -5384,7 +5406,6 @@ By setting the ``dtype_backend`` argument you can control the default dtypes use
Read only certain columns of a parquet file.
.. ipython:: python
- :okwarning:
result = pd.read_parquet(
"example_fp.parquet",
@@ -5413,7 +5434,6 @@ Serializing a ``DataFrame`` to parquet may include the implicit index as one or
more columns in the output file. Thus, this code:
.. ipython:: python
- :okwarning:
df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
df.to_parquet("test.parquet", engine="pyarrow")
@@ -5430,7 +5450,6 @@ If you want to omit a dataframe's indexes when writing, pass ``index=False`` to
:func:`~pandas.DataFrame.to_parquet`:
.. ipython:: python
- :okwarning:
df.to_parquet("test.parquet", index=False)
@@ -5453,7 +5472,6 @@ Partitioning Parquet files
Parquet supports partitioning of data based on the values of one or more columns.
.. ipython:: python
- :okwarning:
df = pd.DataFrame({"a": [0, 0, 1, 1], "b": [0, 1, 0, 1]})
df.to_parquet(path="test", engine="pyarrow", partition_cols=["a"], compression=None)
@@ -5519,14 +5537,12 @@ ORC format, :func:`~pandas.read_orc` and :func:`~pandas.DataFrame.to_orc`. This
Write to an orc file.
.. ipython:: python
- :okwarning:
df.to_orc("example_pa.orc", engine="pyarrow")
Read from an orc file.
.. ipython:: python
- :okwarning:
result = pd.read_orc("example_pa.orc")
@@ -5555,9 +5571,23 @@ SQL queries
-----------
The :mod:`pandas.io.sql` module provides a collection of query wrappers to both
-facilitate data retrieval and to reduce dependency on DB-specific API. Database abstraction
-is provided by SQLAlchemy if installed. In addition you will need a driver library for
-your database. Examples of such drivers are `psycopg2 `__
+facilitate data retrieval and to reduce dependency on DB-specific API.
+
+Where available, users may first want to opt for `Apache Arrow ADBC
+`_ drivers. These drivers
+should provide the best performance, null handling, and type detection.
+
+ .. versionadded:: 2.2.0
+
+ Added native support for ADBC drivers
+
+For a full list of ADBC drivers and their development status, see the `ADBC Driver
+Implementation Status `_
+documentation.
+
+Where an ADBC driver is not available or may be missing functionality,
+users should opt for installing SQLAlchemy alongside their database driver library.
+Examples of such drivers are `psycopg2 `__
for PostgreSQL or `pymysql `__ for MySQL.
For `SQLite `__ this is
included in Python's standard library by default.
@@ -5590,6 +5620,18 @@ In the following example, we use the `SQlite
engine. You can use a temporary SQLite database where data are stored in
"memory".
+To connect using an ADBC driver you will want to install the ``adbc_driver_sqlite`` using your
+package manager. Once installed, you can use the DBAPI interface provided by the ADBC driver
+to connect to your database.
+
+.. code-block:: python
+
+ import adbc_driver_sqlite.dbapi as sqlite_dbapi
+
+ # Create the connection
+ with sqlite_dbapi.connect("sqlite:///:memory:") as conn:
+ df = pd.read_sql_table("data", conn)
+
To connect with SQLAlchemy you use the :func:`create_engine` function to create an engine
object from database URI. You only need to create the engine once per database you are
connecting to.
@@ -5665,9 +5707,74 @@ writes ``data`` to the database in batches of 1000 rows at a time:
SQL data types
++++++++++++++
-:func:`~pandas.DataFrame.to_sql` will try to map your data to an appropriate
-SQL data type based on the dtype of the data. When you have columns of dtype
-``object``, pandas will try to infer the data type.
+Ensuring consistent data type management across SQL databases is challenging.
+Not every SQL database offers the same types, and even when they do the implementation
+of a given type can vary in ways that have subtle effects on how types can be
+preserved.
+
+For the best odds at preserving database types users are advised to use
+ADBC drivers when available. The Arrow type system offers a wider array of
+types that more closely match database types than the historical pandas/NumPy
+type system. To illustrate, note this (non-exhaustive) listing of types
+available in different databases and pandas backends:
+
++-----------------+-----------------------+----------------+---------+
+|numpy/pandas |arrow |postgres |sqlite |
++=================+=======================+================+=========+
+|int16/Int16 |int16 |SMALLINT |INTEGER |
++-----------------+-----------------------+----------------+---------+
+|int32/Int32 |int32 |INTEGER |INTEGER |
++-----------------+-----------------------+----------------+---------+
+|int64/Int64 |int64 |BIGINT |INTEGER |
++-----------------+-----------------------+----------------+---------+
+|float32 |float32 |REAL |REAL |
++-----------------+-----------------------+----------------+---------+
+|float64 |float64 |DOUBLE PRECISION|REAL |
++-----------------+-----------------------+----------------+---------+
+|object |string |TEXT |TEXT |
++-----------------+-----------------------+----------------+---------+
+|bool |``bool_`` |BOOLEAN | |
++-----------------+-----------------------+----------------+---------+
+|datetime64[ns] |timestamp(us) |TIMESTAMP | |
++-----------------+-----------------------+----------------+---------+
+|datetime64[ns,tz]|timestamp(us,tz) |TIMESTAMPTZ | |
++-----------------+-----------------------+----------------+---------+
+| |date32 |DATE | |
++-----------------+-----------------------+----------------+---------+
+| |month_day_nano_interval|INTERVAL | |
++-----------------+-----------------------+----------------+---------+
+| |binary |BINARY |BLOB |
++-----------------+-----------------------+----------------+---------+
+| |decimal128 |DECIMAL [#f1]_ | |
++-----------------+-----------------------+----------------+---------+
+| |list |ARRAY [#f1]_ | |
++-----------------+-----------------------+----------------+---------+
+| |struct |COMPOSITE TYPE | |
+| | | [#f1]_ | |
++-----------------+-----------------------+----------------+---------+
+
+.. rubric:: Footnotes
+
+.. [#f1] Not implemented as of writing, but theoretically possible
+
+If you are interested in preserving database types as best as possible
+throughout the lifecycle of your DataFrame, you are encouraged to
+leverage the ``dtype_backend="pyarrow"`` argument of :func:`~pandas.read_sql`
+
+.. code-block:: ipython
+
+ # for roundtripping
+ with pg_dbapi.connect(uri) as conn:
+ df2 = pd.read_sql("pandas_table", conn, dtype_backend="pyarrow")
+
+This will prevent your data from being converted to the traditional pandas/NumPy
+type system, which often converts SQL types in ways that make them impossible to
+round-trip.
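+
+As a rough illustration of the difference (reusing the ``pg_dbapi`` module and ``uri``
+assumed in the snippet above), one could compare the dtypes produced by the two read paths:
+
+.. code-block:: ipython
+
+   # Default read: values are inferred into the NumPy-backed pandas types
+   with pg_dbapi.connect(uri) as conn:
+       df_numpy = pd.read_sql("pandas_table", conn)
+
+   # Arrow-backed read: database types are preserved as ArrowDtype columns
+   with pg_dbapi.connect(uri) as conn:
+       df_arrow = pd.read_sql("pandas_table", conn, dtype_backend="pyarrow")
+
+   df_numpy.dtypes
+   df_arrow.dtypes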
+
+In case an ADBC driver is not available, :func:`~pandas.DataFrame.to_sql`
+will try to map your data to an appropriate SQL data type based on the dtype of
+the data. When you have columns of dtype ``object``, pandas will try to infer
+the data type.
You can always override the default type by specifying the desired SQL type of
any of the columns by using the ``dtype`` argument. This argument needs a
@@ -5686,7 +5793,9 @@ default ``Text`` type for string columns:
Due to the limited support for timedelta's in the different database
flavors, columns with type ``timedelta64`` will be written as integer
- values as nanoseconds to the database and a warning will be raised.
+ values as nanoseconds to the database and a warning will be raised. The only
+ exception to this is when using the ADBC PostgreSQL driver, in which case a
+ timedelta will be written to the database as an ``INTERVAL``.
.. note::
@@ -5701,7 +5810,7 @@ default ``Text`` type for string columns:
Datetime data types
'''''''''''''''''''
-Using SQLAlchemy, :func:`~pandas.DataFrame.to_sql` is capable of writing
+Using ADBC or SQLAlchemy, :func:`~pandas.DataFrame.to_sql` is capable of writing
datetime data that is timezone naive or timezone aware. However, the resulting
data stored in the database ultimately depends on the supported data type
for datetime data of the database system being used.
@@ -5792,7 +5901,7 @@ table name and optionally a subset of columns to read.
.. note::
In order to use :func:`~pandas.read_sql_table`, you **must** have the
- SQLAlchemy optional dependency installed.
+ ADBC driver or SQLAlchemy optional dependency installed.
.. ipython:: python
@@ -5800,7 +5909,8 @@ table name and optionally a subset of columns to read.
.. note::
- Note that pandas infers column dtypes from query outputs, and not by looking
+ ADBC drivers will map database types directly back to arrow types. For other drivers
+ note that pandas infers column dtypes from query outputs, and not by looking
up data types in the physical database schema. For example, assume ``userid``
is an integer column in a table. Then, intuitively, ``select userid ...`` will
return integer-valued series, while ``select cast(userid as text) ...`` will
@@ -6001,7 +6111,7 @@ Stata format
Writing to stata format
'''''''''''''''''''''''
-The method :func:`~pandas.core.frame.DataFrame.to_stata` will write a DataFrame
+The method :func:`.DataFrame.to_stata` will write a DataFrame
into a .dta file. The format version of this file is always 115 (Stata 12).
.. ipython:: python
@@ -6041,7 +6151,7 @@ outside of this range, the variable is cast to ``int16``.
.. warning::
:class:`~pandas.io.stata.StataWriter` and
- :func:`~pandas.core.frame.DataFrame.to_stata` only support fixed width
+ :func:`.DataFrame.to_stata` only support fixed width
strings containing up to 244 characters, a limitation imposed by the version
115 dta file format. Attempting to write *Stata* dta files with strings
longer than 244 characters raises a ``ValueError``.
@@ -6321,7 +6431,7 @@ The following test functions will be used below to compare the performance of se
def test_hdf_fixed_write(df):
- df.to_hdf("test_fixed.hdf", "test", mode="w")
+ df.to_hdf("test_fixed.hdf", key="test", mode="w")
def test_hdf_fixed_read():
@@ -6329,7 +6439,7 @@ The following test functions will be used below to compare the performance of se
def test_hdf_fixed_write_compress(df):
- df.to_hdf("test_fixed_compress.hdf", "test", mode="w", complib="blosc")
+ df.to_hdf("test_fixed_compress.hdf", key="test", mode="w", complib="blosc")
def test_hdf_fixed_read_compress():
@@ -6337,7 +6447,7 @@ The following test functions will be used below to compare the performance of se
def test_hdf_table_write(df):
- df.to_hdf("test_table.hdf", "test", mode="w", format="table")
+ df.to_hdf("test_table.hdf", key="test", mode="w", format="table")
def test_hdf_table_read():
@@ -6346,7 +6456,7 @@ The following test functions will be used below to compare the performance of se
def test_hdf_table_write_compress(df):
df.to_hdf(
- "test_table_compress.hdf", "test", mode="w", complib="blosc", format="table"
+ "test_table_compress.hdf", key="test", mode="w", complib="blosc", format="table"
)
diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst
index 10793a6973f8a..c9c8478a719f0 100644
--- a/doc/source/user_guide/merging.rst
+++ b/doc/source/user_guide/merging.rst
@@ -15,27 +15,27 @@
Merge, join, concatenate and compare
************************************
-pandas provides various facilities for easily combining together Series or
-DataFrame with various kinds of set logic for the indexes
-and relational algebra functionality in the case of join / merge-type
-operations.
+pandas provides various methods for combining and comparing :class:`Series` or
+:class:`DataFrame`.
-In addition, pandas also provides utilities to compare two Series or DataFrame
-and summarize their differences.
+* :func:`~pandas.concat`: Merge multiple :class:`Series` or :class:`DataFrame` objects along a shared index or column
+* :meth:`DataFrame.join`: Merge multiple :class:`DataFrame` objects along the columns
+* :meth:`DataFrame.combine_first`: Update missing values with non-missing values in the same location
+* :func:`~pandas.merge`: Combine two :class:`Series` or :class:`DataFrame` objects with SQL-style joining
+* :func:`~pandas.merge_ordered`: Combine two :class:`Series` or :class:`DataFrame` objects along an ordered axis
+* :func:`~pandas.merge_asof`: Combine two :class:`Series` or :class:`DataFrame` objects by near instead of exact matching keys
+* :meth:`Series.compare` and :meth:`DataFrame.compare`: Show differences in values between two :class:`Series` or :class:`DataFrame` objects
.. _merging.concat:
-Concatenating objects
----------------------
-
-The :func:`~pandas.concat` function (in the main pandas namespace) does all of
-the heavy lifting of performing concatenation operations along an axis while
-performing optional set logic (union or intersection) of the indexes (if any) on
-the other axes. Note that I say "if any" because there is only a single possible
-axis of concatenation for Series.
+:func:`~pandas.concat`
+----------------------
-Before diving into all of the details of ``concat`` and what it can do, here is
-a simple example:
+The :func:`~pandas.concat` function concatenates an arbitrary number of
+:class:`Series` or :class:`DataFrame` objects along an axis while
+performing optional set logic (union or intersection) of the indexes on
+the other axes. Like ``numpy.concatenate``, :func:`~pandas.concat`
+takes a list or dict of homogeneously-typed objects and concatenates them.
.. ipython:: python
@@ -71,6 +71,7 @@ a simple example:
frames = [df1, df2, df3]
result = pd.concat(frames)
+ result
.. ipython:: python
:suppress:
@@ -79,81 +80,12 @@ a simple example:
p.plot(frames, result, labels=["df1", "df2", "df3"], vertical=True);
plt.close("all");
-Like its sibling function on ndarrays, ``numpy.concatenate``, ``pandas.concat``
-takes a list or dict of homogeneously-typed objects and concatenates them with
-some configurable handling of "what to do with the other axes":
-
-::
-
- pd.concat(
- objs,
- axis=0,
- join="outer",
- ignore_index=False,
- keys=None,
- levels=None,
- names=None,
- verify_integrity=False,
- copy=True,
- )
-
-* ``objs`` : a sequence or mapping of Series or DataFrame objects. If a
- dict is passed, the sorted keys will be used as the ``keys`` argument, unless
- it is passed, in which case the values will be selected (see below). Any None
- objects will be dropped silently unless they are all None in which case a
- ValueError will be raised.
-* ``axis`` : {0, 1, ...}, default 0. The axis to concatenate along.
-* ``join`` : {'inner', 'outer'}, default 'outer'. How to handle indexes on
- other axis(es). Outer for union and inner for intersection.
-* ``ignore_index`` : boolean, default False. If True, do not use the index
- values on the concatenation axis. The resulting axis will be labeled 0, ...,
- n - 1. This is useful if you are concatenating objects where the
- concatenation axis does not have meaningful indexing information. Note
- the index values on the other axes are still respected in the join.
-* ``keys`` : sequence, default None. Construct hierarchical index using the
- passed keys as the outermost level. If multiple levels passed, should
- contain tuples.
-* ``levels`` : list of sequences, default None. Specific levels (unique values)
- to use for constructing a MultiIndex. Otherwise they will be inferred from the
- keys.
-* ``names`` : list, default None. Names for the levels in the resulting
- hierarchical index.
-* ``verify_integrity`` : boolean, default False. Check whether the new
- concatenated axis contains duplicates. This can be very expensive relative
- to the actual data concatenation.
-* ``copy`` : boolean, default True. If False, do not copy data unnecessarily.
-
-Without a little bit of context many of these arguments don't make much sense.
-Let's revisit the above example. Suppose we wanted to associate specific keys
-with each of the pieces of the chopped up DataFrame. We can do this using the
-``keys`` argument:
-
-.. ipython:: python
-
- result = pd.concat(frames, keys=["x", "y", "z"])
-
-.. ipython:: python
- :suppress:
-
- @savefig merging_concat_keys.png
- p.plot(frames, result, labels=["df1", "df2", "df3"], vertical=True)
- plt.close("all");
-
-As you can see (if you've read the rest of the documentation), the resulting
-object's index has a :ref:`hierarchical index `. This
-means that we can now select out each chunk by key:
-
-.. ipython:: python
-
- result.loc["y"]
-
-It's not a stretch to see how this can be very useful. More detail on this
-functionality below.
-
.. note::
- It is worth noting that :func:`~pandas.concat` makes a full copy of the data, and that constantly
- reusing this function can create a significant performance hit. If you need
- to use the operation over several datasets, use a list comprehension.
+
+ :func:`~pandas.concat` makes a full copy of the data, and iteratively
+ reusing :func:`~pandas.concat` can create unnecessary copies. Collect all
+ :class:`DataFrame` or :class:`Series` objects in a list before using
+ :func:`~pandas.concat`.
.. code-block:: python
@@ -162,26 +94,20 @@ functionality below.
.. note::
- When concatenating DataFrames with named axes, pandas will attempt to preserve
+ When concatenating :class:`DataFrame` with named axes, pandas will attempt to preserve
these index/column names whenever possible. In the case where all inputs share a
common name, this name will be assigned to the result. When the input names do
not all agree, the result will be unnamed. The same is true for :class:`MultiIndex`,
but the logic is applied separately on a level-by-level basis.
-Set logic on the other axes
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Joining logic of the resulting axis
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-When gluing together multiple DataFrames, you have a choice of how to handle
-the other axes (other than the one being concatenated). This can be done in
-the following two ways:
+The ``join`` keyword specifies how to handle axis values that don't exist in the first
+:class:`DataFrame`.
-* Take the union of them all, ``join='outer'``. This is the default
- option as it results in zero information loss.
-* Take the intersection, ``join='inner'``.
-
-Here is an example of each of these methods. First, the default ``join='outer'``
-behavior:
+``join='outer'`` takes the union of all axis values
.. ipython:: python
@@ -194,6 +120,7 @@ behavior:
index=[2, 3, 6, 7],
)
result = pd.concat([df1, df4], axis=1)
+ result
.. ipython:: python
@@ -203,11 +130,12 @@ behavior:
p.plot([df1, df4], result, labels=["df1", "df4"], vertical=False);
plt.close("all");
-Here is the same thing with ``join='inner'``:
+``join='inner'`` takes the intersection of the axis values
.. ipython:: python
result = pd.concat([df1, df4], axis=1, join="inner")
+ result
.. ipython:: python
:suppress:
@@ -216,18 +144,13 @@ Here is the same thing with ``join='inner'``:
p.plot([df1, df4], result, labels=["df1", "df4"], vertical=False);
plt.close("all");
-Lastly, suppose we just wanted to reuse the *exact index* from the original
-DataFrame:
+To perform an effective "left" join using the *exact index* from the original
+:class:`DataFrame`, the result can be reindexed.
.. ipython:: python
result = pd.concat([df1, df4], axis=1).reindex(df1.index)
-
-Similarly, we could index before the concatenation:
-
-.. ipython:: python
-
- pd.concat([df1, df4.reindex(df1.index)], axis=1)
+ result
.. ipython:: python
:suppress:
@@ -240,13 +163,14 @@ Similarly, we could index before the concatenation:
Ignoring indexes on the concatenation axis
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-For ``DataFrame`` objects which don't have a meaningful index, you may wish
-to append them and ignore the fact that they may have overlapping indexes. To
-do this, use the ``ignore_index`` argument:
+
+For :class:`DataFrame` objects which don't have a meaningful index, the ``ignore_index``
+argument ignores overlapping indexes.
.. ipython:: python
result = pd.concat([df1, df4], ignore_index=True, sort=False)
+ result
.. ipython:: python
:suppress:
@@ -257,17 +181,18 @@ do this, use the ``ignore_index`` argument:
.. _merging.mixed_ndims:
-Concatenating with mixed ndims
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Concatenating :class:`Series` and :class:`DataFrame` together
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-You can concatenate a mix of ``Series`` and ``DataFrame`` objects. The
-``Series`` will be transformed to ``DataFrame`` with the column name as
-the name of the ``Series``.
+You can concatenate a mix of :class:`Series` and :class:`DataFrame` objects. The
+:class:`Series` will be transformed to :class:`DataFrame` with the column name as
+the name of the :class:`Series`.
.. ipython:: python
s1 = pd.Series(["X0", "X1", "X2", "X3"], name="X")
result = pd.concat([df1, s1], axis=1)
+ result
.. ipython:: python
:suppress:
@@ -276,19 +201,13 @@ the name of the ``Series``.
p.plot([df1, s1], result, labels=["df1", "s1"], vertical=False);
plt.close("all");
-.. note::
-
- Since we're concatenating a ``Series`` to a ``DataFrame``, we could have
- achieved the same result with :meth:`DataFrame.assign`. To concatenate an
- arbitrary number of pandas objects (``DataFrame`` or ``Series``), use
- ``concat``.
-
-If unnamed ``Series`` are passed they will be numbered consecutively.
+Unnamed :class:`Series` will be numbered consecutively.
.. ipython:: python
s2 = pd.Series(["_0", "_1", "_2", "_3"])
result = pd.concat([df1, s2, s2, s2], axis=1)
+ result
.. ipython:: python
:suppress:
@@ -297,11 +216,12 @@ If unnamed ``Series`` are passed they will be numbered consecutively.
p.plot([df1, s2], result, labels=["df1", "s2"], vertical=False);
plt.close("all");
-Passing ``ignore_index=True`` will drop all name references.
+``ignore_index=True`` will drop all name references.
.. ipython:: python
result = pd.concat([df1, s1], axis=1, ignore_index=True)
+ result
.. ipython:: python
:suppress:
@@ -310,48 +230,45 @@ Passing ``ignore_index=True`` will drop all name references.
p.plot([df1, s1], result, labels=["df1", "s1"], vertical=False);
plt.close("all");
-More concatenating with group keys
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Resulting ``keys``
+~~~~~~~~~~~~~~~~~~
-A fairly common use of the ``keys`` argument is to override the column names
-when creating a new ``DataFrame`` based on existing ``Series``.
-Notice how the default behaviour consists on letting the resulting ``DataFrame``
-inherit the parent ``Series``' name, when these existed.
+The ``keys`` argument adds another axis level to the resulting index or column (creating
+a :class:`MultiIndex`) to associate specific keys with each original :class:`DataFrame`.
.. ipython:: python
- s3 = pd.Series([0, 1, 2, 3], name="foo")
- s4 = pd.Series([0, 1, 2, 3])
- s5 = pd.Series([0, 1, 4, 5])
-
- pd.concat([s3, s4, s5], axis=1)
-
-Through the ``keys`` argument we can override the existing column names.
+ result = pd.concat(frames, keys=["x", "y", "z"])
+ result
+ result.loc["y"]
.. ipython:: python
+ :suppress:
- pd.concat([s3, s4, s5], axis=1, keys=["red", "blue", "yellow"])
+ @savefig merging_concat_keys.png
+ p.plot(frames, result, labels=["df1", "df2", "df3"], vertical=True)
+ plt.close("all");
-Let's consider a variation of the very first example presented:
+The ``keys`` argument can override the column names
+when creating a new :class:`DataFrame` based on existing :class:`Series`.
.. ipython:: python
- result = pd.concat(frames, keys=["x", "y", "z"])
-
-.. ipython:: python
- :suppress:
+ s3 = pd.Series([0, 1, 2, 3], name="foo")
+ s4 = pd.Series([0, 1, 2, 3])
+ s5 = pd.Series([0, 1, 4, 5])
- @savefig merging_concat_group_keys2.png
- p.plot(frames, result, labels=["df1", "df2", "df3"], vertical=True);
- plt.close("all");
+ pd.concat([s3, s4, s5], axis=1)
+ pd.concat([s3, s4, s5], axis=1, keys=["red", "blue", "yellow"])
-You can also pass a dict to ``concat`` in which case the dict keys will be used
-for the ``keys`` argument (unless other keys are specified):
+You can also pass a dict to :func:`concat` in which case the dict keys will be used
+for the ``keys`` argument unless another ``keys`` argument is specified:
.. ipython:: python
pieces = {"x": df1, "y": df2, "z": df3}
result = pd.concat(pieces)
+ result
.. ipython:: python
:suppress:
@@ -363,6 +280,7 @@ for the ``keys`` argument (unless other keys are specified):
.. ipython:: python
result = pd.concat(pieces, keys=["z", "y"])
+ result
.. ipython:: python
:suppress:
@@ -371,21 +289,21 @@ for the ``keys`` argument (unless other keys are specified):
p.plot([df1, df2, df3], result, labels=["df1", "df2", "df3"], vertical=True);
plt.close("all");
-The MultiIndex created has levels that are constructed from the passed keys and
-the index of the ``DataFrame`` pieces:
+The :class:`MultiIndex` created has levels that are constructed from the passed keys and
+the index of the :class:`DataFrame` pieces:
.. ipython:: python
result.index.levels
-If you wish to specify other levels (as will occasionally be the case), you can
-do so using the ``levels`` argument:
+The ``levels`` argument allows specifying resulting levels associated with the ``keys``
.. ipython:: python
result = pd.concat(
pieces, keys=["x", "y", "z"], levels=[["z", "y", "x", "w"]], names=["group_key"]
)
+ result
.. ipython:: python
:suppress:
@@ -398,21 +316,19 @@ do so using the ``levels`` argument:
result.index.levels
-This is fairly esoteric, but it is actually necessary for implementing things
-like GroupBy where the order of a categorical variable is meaningful.
-
.. _merging.append.row:
-Appending rows to a DataFrame
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Appending rows to a :class:`DataFrame`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-If you have a series that you want to append as a single row to a ``DataFrame``, you can convert the row into a
-``DataFrame`` and use ``concat``
+If you have a :class:`Series` that you want to append as a single row to a :class:`DataFrame`, you can convert the row into a
+:class:`DataFrame` and use :func:`concat`
.. ipython:: python
s2 = pd.Series(["X0", "X1", "X2", "X3"], index=["A", "B", "C", "D"])
result = pd.concat([df1, s2.to_frame().T], ignore_index=True)
+ result
.. ipython:: python
:suppress:
@@ -421,131 +337,35 @@ If you have a series that you want to append as a single row to a ``DataFrame``,
p.plot([df1, s2], result, labels=["df1", "s2"], vertical=True);
plt.close("all");
-You should use ``ignore_index`` with this method to instruct DataFrame to
-discard its index. If you wish to preserve the index, you should construct an
-appropriately-indexed DataFrame and append or concatenate those objects.
-
.. _merging.join:
-Database-style DataFrame or named Series joining/merging
---------------------------------------------------------
-
-pandas has full-featured, **high performance** in-memory join operations
-idiomatically very similar to relational databases like SQL. These methods
-perform significantly better (in some cases well over an order of magnitude
-better) than other open source implementations (like ``base::merge.data.frame``
-in R). The reason for this is careful algorithmic design and the internal layout
-of the data in ``DataFrame``.
-
-See the :ref:`cookbook` for some advanced strategies.
+:func:`~pandas.merge`
+---------------------
-Users who are familiar with SQL but new to pandas might be interested in a
+:func:`~pandas.merge` performs join operations similar to relational databases like SQL.
+Users who are familiar with SQL but new to pandas can reference a
:ref:`comparison with SQL`.
-pandas provides a single function, :func:`~pandas.merge`, as the entry point for
-all standard database join operations between ``DataFrame`` or named ``Series`` objects:
-
-::
-
- pd.merge(
- left,
- right,
- how="inner",
- on=None,
- left_on=None,
- right_on=None,
- left_index=False,
- right_index=False,
- sort=True,
- suffixes=("_x", "_y"),
- copy=True,
- indicator=False,
- validate=None,
- )
+Merge types
+~~~~~~~~~~~
+
+:func:`~pandas.merge` implements common SQL style joining operations.
-* ``left``: A DataFrame or named Series object.
-* ``right``: Another DataFrame or named Series object.
-* ``on``: Column or index level names to join on. Must be found in both the left
- and right DataFrame and/or Series objects. If not passed and ``left_index`` and
- ``right_index`` are ``False``, the intersection of the columns in the
- DataFrames and/or Series will be inferred to be the join keys.
-* ``left_on``: Columns or index levels from the left DataFrame or Series to use as
- keys. Can either be column names, index level names, or arrays with length
- equal to the length of the DataFrame or Series.
-* ``right_on``: Columns or index levels from the right DataFrame or Series to use as
- keys. Can either be column names, index level names, or arrays with length
- equal to the length of the DataFrame or Series.
-* ``left_index``: If ``True``, use the index (row labels) from the left
- DataFrame or Series as its join key(s). In the case of a DataFrame or Series with a MultiIndex
- (hierarchical), the number of levels must match the number of join keys
- from the right DataFrame or Series.
-* ``right_index``: Same usage as ``left_index`` for the right DataFrame or Series
-* ``how``: One of ``'left'``, ``'right'``, ``'outer'``, ``'inner'``, ``'cross'``. Defaults
- to ``inner``. See below for more detailed description of each method.
-* ``sort``: Sort the result DataFrame by the join keys in lexicographical
- order. Defaults to ``True``, setting to ``False`` will improve performance
- substantially in many cases.
-* ``suffixes``: A tuple of string suffixes to apply to overlapping
- columns. Defaults to ``('_x', '_y')``.
-* ``copy``: Always copy data (default ``True``) from the passed DataFrame or named Series
- objects, even when reindexing is not necessary. Cannot be avoided in many
- cases but may improve performance / memory usage. The cases where copying
- can be avoided are somewhat pathological but this option is provided
- nonetheless.
-* ``indicator``: Add a column to the output DataFrame called ``_merge``
- with information on the source of each row. ``_merge`` is Categorical-type
- and takes on a value of ``left_only`` for observations whose merge key
- only appears in ``'left'`` DataFrame or Series, ``right_only`` for observations whose
- merge key only appears in ``'right'`` DataFrame or Series, and ``both`` if the
- observation's merge key is found in both.
-
-* ``validate`` : string, default None.
- If specified, checks if merge is of specified type.
-
- * "one_to_one" or "1:1": checks if merge keys are unique in both
- left and right datasets.
- * "one_to_many" or "1:m": checks if merge keys are unique in left
- dataset.
- * "many_to_one" or "m:1": checks if merge keys are unique in right
- dataset.
- * "many_to_many" or "m:m": allowed, but does not result in checks.
-
-The return type will be the same as ``left``. If ``left`` is a ``DataFrame`` or named ``Series``
-and ``right`` is a subclass of ``DataFrame``, the return type will still be ``DataFrame``.
-
-``merge`` is a function in the pandas namespace, and it is also available as a
-``DataFrame`` instance method :meth:`~DataFrame.merge`, with the calling
-``DataFrame`` being implicitly considered the left object in the join.
-
-The related :meth:`~DataFrame.join` method, uses ``merge`` internally for the
-index-on-index (by default) and column(s)-on-index join. If you are joining on
-index only, you may wish to use ``DataFrame.join`` to save yourself some typing.
-
-Brief primer on merge methods (relational algebra)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Experienced users of relational databases like SQL will be familiar with the
-terminology used to describe join operations between two SQL-table like
-structures (``DataFrame`` objects). There are several cases to consider which
-are very important to understand:
-
-* **one-to-one** joins: for example when joining two ``DataFrame`` objects on
- their indexes (which must contain unique values).
-* **many-to-one** joins: for example when joining an index (unique) to one or
- more columns in a different ``DataFrame``.
-* **many-to-many** joins: joining columns on columns.
+* **one-to-one**: joining two :class:`DataFrame` objects on
+ their indexes which must contain unique values.
+* **many-to-one**: joining a unique index to one or
+ more columns in a different :class:`DataFrame`.
+* **many-to-many**: joining columns on columns.
.. note::
- When joining columns on columns (potentially a many-to-many join), any
- indexes on the passed ``DataFrame`` objects **will be discarded**.
+ When joining columns on columns, potentially a many-to-many join, any
+ indexes on the passed :class:`DataFrame` objects **will be discarded**.
-It is worth spending some time understanding the result of the **many-to-many**
-join case. In SQL / standard relational algebra, if a key combination appears
-more than once in both tables, the resulting table will have the **Cartesian
-product** of the associated data. Here is a very basic example with one unique
-key combination:
+For a **many-to-many** join, if a key combination appears
+more than once in both tables, the resulting :class:`DataFrame` will have the **Cartesian
+product** of the associated data.
.. ipython:: python
@@ -565,6 +385,7 @@ key combination:
}
)
result = pd.merge(left, right, on="key")
+ result
.. ipython:: python
:suppress:
@@ -573,41 +394,8 @@ key combination:
p.plot([left, right], result, labels=["left", "right"], vertical=False);
plt.close("all");
-Here is a more complicated example with multiple join keys. Only the keys
-appearing in ``left`` and ``right`` are present (the intersection), since
-``how='inner'`` by default.
-
-.. ipython:: python
-
- left = pd.DataFrame(
- {
- "key1": ["K0", "K0", "K1", "K2"],
- "key2": ["K0", "K1", "K0", "K1"],
- "A": ["A0", "A1", "A2", "A3"],
- "B": ["B0", "B1", "B2", "B3"],
- }
- )
-
- right = pd.DataFrame(
- {
- "key1": ["K0", "K1", "K1", "K2"],
- "key2": ["K0", "K0", "K0", "K0"],
- "C": ["C0", "C1", "C2", "C3"],
- "D": ["D0", "D1", "D2", "D3"],
- }
- )
-
- result = pd.merge(left, right, on=["key1", "key2"])
-
-.. ipython:: python
- :suppress:
-
- @savefig merging_merge_on_key_multiple.png
- p.plot([left, right], result, labels=["left", "right"], vertical=False);
- plt.close("all");
-
-The ``how`` argument to ``merge`` specifies how to determine which keys are to
-be included in the resulting table. If a key combination **does not appear** in
+The ``how`` argument to :func:`~pandas.merge` specifies which keys are
+included in the resulting table. If a key combination **does not appear** in
either the left or right tables, the values in the joined table will be
``NA``. Here is a summary of the ``how`` options and their SQL equivalent names:
@@ -623,7 +411,24 @@ either the left or right tables, the values in the joined table will be
.. ipython:: python
+ left = pd.DataFrame(
+ {
+ "key1": ["K0", "K0", "K1", "K2"],
+ "key2": ["K0", "K1", "K0", "K1"],
+ "A": ["A0", "A1", "A2", "A3"],
+ "B": ["B0", "B1", "B2", "B3"],
+ }
+ )
+ right = pd.DataFrame(
+ {
+ "key1": ["K0", "K1", "K1", "K2"],
+ "key2": ["K0", "K0", "K0", "K0"],
+ "C": ["C0", "C1", "C2", "C3"],
+ "D": ["D0", "D1", "D2", "D3"],
+ }
+ )
result = pd.merge(left, right, how="left", on=["key1", "key2"])
+ result
.. ipython:: python
:suppress:
@@ -635,6 +440,7 @@ either the left or right tables, the values in the joined table will be
.. ipython:: python
result = pd.merge(left, right, how="right", on=["key1", "key2"])
+ result
.. ipython:: python
:suppress:
@@ -645,6 +451,7 @@ either the left or right tables, the values in the joined table will be
.. ipython:: python
result = pd.merge(left, right, how="outer", on=["key1", "key2"])
+ result
.. ipython:: python
:suppress:
@@ -656,6 +463,7 @@ either the left or right tables, the values in the joined table will be
.. ipython:: python
result = pd.merge(left, right, how="inner", on=["key1", "key2"])
+ result
.. ipython:: python
:suppress:
@@ -667,6 +475,7 @@ either the left or right tables, the values in the joined table will be
.. ipython:: python
result = pd.merge(left, right, how="cross")
+ result
.. ipython:: python
:suppress:
@@ -675,10 +484,9 @@ either the left or right tables, the values in the joined table will be
p.plot([left, right], result, labels=["left", "right"], vertical=False);
plt.close("all");
-You can merge a mult-indexed Series and a DataFrame, if the names of
-the MultiIndex correspond to the columns from the DataFrame. Transform
-the Series to a DataFrame using :meth:`Series.reset_index` before merging,
-as shown in the following example.
+You can merge a :class:`Series` and a :class:`DataFrame` with a :class:`MultiIndex` if the names of
+the :class:`MultiIndex` correspond to the columns from the :class:`DataFrame`. Transform
+the :class:`Series` to a :class:`DataFrame` using :meth:`Series.reset_index` before merging.
.. ipython:: python
@@ -696,7 +504,7 @@ as shown in the following example.
pd.merge(df, ser.reset_index(), on=["Let", "Num"])
-Here is another example with duplicate join keys in DataFrames:
+Performing an outer join with duplicate join keys in :class:`DataFrame`
.. ipython:: python
@@ -705,6 +513,7 @@ Here is another example with duplicate join keys in DataFrames:
right = pd.DataFrame({"A": [4, 5, 6], "B": [2, 2, 2]})
result = pd.merge(left, right, on="B", how="outer")
+ result
.. ipython:: python
:suppress:
@@ -716,21 +525,17 @@ Here is another example with duplicate join keys in DataFrames:
.. warning::
- Joining / merging on duplicate keys can cause a returned frame that is the multiplication of the row dimensions, which may result in memory overflow. It is the user' s responsibility to manage duplicate values in keys before joining large DataFrames.
+ Merging on duplicate keys significantly increases the dimensions of the result
+ and can cause a memory overflow.
.. _merging.validation:
-Checking for duplicate keys
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Users can use the ``validate`` argument to automatically check whether there
-are unexpected duplicates in their merge keys. Key uniqueness is checked before
-merge operations and so should protect against memory overflows. Checking key
-uniqueness is also a good way to ensure user data structures are as expected.
+Merge key uniqueness
+~~~~~~~~~~~~~~~~~~~~
-In the following example, there are duplicate values of ``B`` in the right
-``DataFrame``. As this is not a one-to-one merge -- as specified in the
-``validate`` argument -- an exception will be raised.
+The ``validate`` argument checks the uniqueness of merge keys.
+Key uniqueness is checked before merge operations and can protect against memory overflows
+and unexpected key duplication.
.. ipython:: python
:okexcept:
@@ -739,8 +544,8 @@ In the following example, there are duplicate values of ``B`` in the right
right = pd.DataFrame({"A": [4, 5, 6], "B": [2, 2, 2]})
result = pd.merge(left, right, on="B", how="outer", validate="one_to_one")
-If the user is aware of the duplicates in the right ``DataFrame`` but wants to
-ensure there are no duplicates in the left DataFrame, one can use the
+If the user is aware of the duplicates in the right :class:`DataFrame` but wants to
+ensure there are no duplicates in the left :class:`DataFrame`, one can use the
``validate='one_to_many'`` argument instead, which will not raise an exception.
.. ipython:: python
@@ -750,8 +555,8 @@ ensure there are no duplicates in the left DataFrame, one can use the
.. _merging.indicator:
-The merge indicator
-~~~~~~~~~~~~~~~~~~~
+Merge result indicator
+~~~~~~~~~~~~~~~~~~~~~~
:func:`~pandas.merge` accepts the argument ``indicator``. If ``True``, a
Categorical-type column called ``_merge`` will be added to the output object
@@ -771,97 +576,53 @@ that takes on values:
df2 = pd.DataFrame({"col1": [1, 2, 2], "col_right": [2, 2, 2]})
pd.merge(df1, df2, on="col1", how="outer", indicator=True)
-The ``indicator`` argument will also accept string arguments, in which case the indicator function will use the value of the passed string as the name for the indicator column.
+A string argument to ``indicator`` will use the value as the name for the indicator column.
.. ipython:: python
pd.merge(df1, df2, on="col1", how="outer", indicator="indicator_column")
-.. _merging.dtypes:
-
-Merge dtypes
-~~~~~~~~~~~~
-
-Merging will preserve the dtype of the join keys.
-
-.. ipython:: python
-
- left = pd.DataFrame({"key": [1], "v1": [10]})
- left
- right = pd.DataFrame({"key": [1, 2], "v1": [20, 30]})
- right
-
-We are able to preserve the join keys:
-
-.. ipython:: python
-
- pd.merge(left, right, how="outer")
- pd.merge(left, right, how="outer").dtypes
-
-Of course if you have missing values that are introduced, then the
-resulting dtype will be upcast.
-
-.. ipython:: python
-
- pd.merge(left, right, how="outer", on="key")
- pd.merge(left, right, how="outer", on="key").dtypes
-
-Merging will preserve ``category`` dtypes of the mergands. See also the section on :ref:`categoricals `.
+Overlapping value columns
+~~~~~~~~~~~~~~~~~~~~~~~~~
-The left frame.
+The merge ``suffixes`` argument takes a tuple or list of strings to append to
+overlapping column names in the input :class:`DataFrame` to disambiguate the result
+columns:
.. ipython:: python
- from pandas.api.types import CategoricalDtype
-
- X = pd.Series(np.random.choice(["foo", "bar"], size=(10,)))
- X = X.astype(CategoricalDtype(categories=["foo", "bar"]))
-
- left = pd.DataFrame(
- {"X": X, "Y": np.random.choice(["one", "two", "three"], size=(10,))}
- )
- left
- left.dtypes
+ left = pd.DataFrame({"k": ["K0", "K1", "K2"], "v": [1, 2, 3]})
+ right = pd.DataFrame({"k": ["K0", "K0", "K3"], "v": [4, 5, 6]})
-The right frame.
+ result = pd.merge(left, right, on="k")
+ result
.. ipython:: python
+ :suppress:
- right = pd.DataFrame(
- {
- "X": pd.Series(["foo", "bar"], dtype=CategoricalDtype(["foo", "bar"])),
- "Z": [1, 2],
- }
- )
- right
- right.dtypes
-
-The merged result:
+ @savefig merging_merge_overlapped.png
+ p.plot([left, right], result, labels=["left", "right"], vertical=False);
+ plt.close("all");
.. ipython:: python
- result = pd.merge(left, right, how="outer")
+ result = pd.merge(left, right, on="k", suffixes=("_l", "_r"))
result
- result.dtypes
-
-.. note::
- The category dtypes must be *exactly* the same, meaning the same categories and the ordered attribute.
- Otherwise the result will coerce to the categories' dtype.
-
-.. note::
-
- Merging on ``category`` dtypes that are the same can be quite performant compared to ``object`` dtype merging.
+.. ipython:: python
+ :suppress:
-.. _merging.join.index:
+ @savefig merging_merge_overlapped_suffix.png
+ p.plot([left, right], result, labels=["left", "right"], vertical=False);
+ plt.close("all");
-Joining on index
-~~~~~~~~~~~~~~~~
+:meth:`DataFrame.join`
+----------------------
-:meth:`DataFrame.join` is a convenient method for combining the columns of two
-potentially differently-indexed ``DataFrames`` into a single result
-``DataFrame``. Here is a very basic example:
+:meth:`DataFrame.join` combines the columns of multiple,
+potentially differently-indexed :class:`DataFrame` into a single result
+:class:`DataFrame`.
.. ipython:: python
@@ -874,6 +635,7 @@ potentially differently-indexed ``DataFrames`` into a single result
)
result = left.join(right)
+ result
.. ipython:: python
:suppress:
@@ -885,6 +647,7 @@ potentially differently-indexed ``DataFrames`` into a single result
.. ipython:: python
result = left.join(right, how="outer")
+ result
.. ipython:: python
:suppress:
@@ -893,11 +656,10 @@ potentially differently-indexed ``DataFrames`` into a single result
p.plot([left, right], result, labels=["left", "right"], vertical=False);
plt.close("all");
-The same as above, but with ``how='inner'``.
-
.. ipython:: python
result = left.join(right, how="inner")
+ result
.. ipython:: python
:suppress:
@@ -906,50 +668,9 @@ The same as above, but with ``how='inner'``.
p.plot([left, right], result, labels=["left", "right"], vertical=False);
plt.close("all");
-The data alignment here is on the indexes (row labels). This same behavior can
-be achieved using ``merge`` plus additional arguments instructing it to use the
-indexes:
-
-.. ipython:: python
-
- result = pd.merge(left, right, left_index=True, right_index=True, how="outer")
-
-.. ipython:: python
- :suppress:
-
- @savefig merging_merge_index_outer.png
- p.plot([left, right], result, labels=["left", "right"], vertical=False);
- plt.close("all");
-
-.. ipython:: python
-
- result = pd.merge(left, right, left_index=True, right_index=True, how="inner")
-
-.. ipython:: python
- :suppress:
-
- @savefig merging_merge_index_inner.png
- p.plot([left, right], result, labels=["left", "right"], vertical=False);
- plt.close("all");
-
-Joining key columns on an index
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-:meth:`~DataFrame.join` takes an optional ``on`` argument which may be a column
-or multiple column names, which specifies that the passed ``DataFrame`` is to be
-aligned on that column in the ``DataFrame``. These two function calls are
-completely equivalent:
-
-::
-
- left.join(right, on=key_or_keys)
- pd.merge(
- left, right, left_on=key_or_keys, right_index=True, how="left", sort=False
- )
-
-Obviously you can choose whichever form you find more convenient. For
-many-to-one joins (where one of the ``DataFrame``'s is already indexed by the
-join key), using ``join`` may be more convenient. Here is a simple example:
+:meth:`DataFrame.join` takes an optional ``on`` argument which may be a column
+or multiple column names on which the passed :class:`DataFrame` is to be
+aligned.
.. ipython:: python
@@ -964,6 +685,7 @@ join key), using ``join`` may be more convenient. Here is a simple example:
right = pd.DataFrame({"C": ["C0", "C1"], "D": ["D0", "D1"]}, index=["K0", "K1"])
result = left.join(right, on="key")
+ result
.. ipython:: python
:suppress:
@@ -977,6 +699,7 @@ join key), using ``join`` may be more convenient. Here is a simple example:
result = pd.merge(
left, right, left_on="key", right_index=True, how="left", sort=False
)
+ result
.. ipython:: python
:suppress:
@@ -987,7 +710,7 @@ join key), using ``join`` may be more convenient. Here is a simple example:
.. _merging.multikey_join:
-To join on multiple keys, the passed DataFrame must have a ``MultiIndex``:
+To join on multiple keys, the passed :class:`DataFrame` must have a :class:`MultiIndex`:
.. ipython:: python
@@ -1006,12 +729,8 @@ To join on multiple keys, the passed DataFrame must have a ``MultiIndex``:
right = pd.DataFrame(
{"C": ["C0", "C1", "C2", "C3"], "D": ["D0", "D1", "D2", "D3"]}, index=index
)
-
-Now this can be joined by passing the two key column names:
-
-.. ipython:: python
-
result = left.join(right, on=["key1", "key2"])
+ result
.. ipython:: python
:suppress:
@@ -1022,14 +741,14 @@ Now this can be joined by passing the two key column names:
.. _merging.df_inner_join:
-The default for ``DataFrame.join`` is to perform a left join (essentially a
-"VLOOKUP" operation, for Excel users), which uses only the keys found in the
-calling DataFrame. Other join types, for example inner join, can be just as
-easily performed:
+The default for :meth:`DataFrame.join` is to perform a left join
+which uses only the keys found in the
+calling :class:`DataFrame`. Other join types can be specified with ``how``.
.. ipython:: python
result = left.join(right, on=["key1", "key2"], how="inner")
+ result
.. ipython:: python
:suppress:
@@ -1038,16 +757,13 @@ easily performed:
p.plot([left, right], result, labels=["left", "right"], vertical=False);
plt.close("all");
-As you can see, this drops any rows where there was no match.
-
.. _merging.join_on_mi:
Joining a single Index to a MultiIndex
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-You can join a singly-indexed ``DataFrame`` with a level of a MultiIndexed ``DataFrame``.
-The level will match on the name of the index of the singly-indexed frame against
-a level name of the MultiIndexed frame.
+You can join a :class:`DataFrame` with an :class:`Index` to a :class:`DataFrame` with a :class:`MultiIndex` on a level.
+The ``name`` of the :class:`Index` will match the level name of the :class:`MultiIndex`.
.. ipython:: python
@@ -1066,6 +782,7 @@ a level name of the MultiIndexed frame.
)
result = left.join(right, how="inner")
+ result
.. ipython:: python
@@ -1075,29 +792,13 @@ a level name of the MultiIndexed frame.
p.plot([left, right], result, labels=["left", "right"], vertical=False);
plt.close("all");
-This is equivalent but less verbose and more memory efficient / faster than this.
-
-.. ipython:: python
-
- result = pd.merge(
- left.reset_index(), right.reset_index(), on=["key"], how="inner"
- ).set_index(["key","Y"])
-
-.. ipython:: python
- :suppress:
-
- @savefig merging_merge_multiindex_alternative.png
- p.plot([left, right], result, labels=["left", "right"], vertical=False);
- plt.close("all");
-
.. _merging.join_with_two_multi_indexes:
-Joining with two MultiIndexes
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Joining with two :class:`MultiIndex`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-This is supported in a limited way, provided that the index for the right
-argument is completely used in the join, and is a subset of the indices in
-the left argument, as in this example:
+The :class:`MultiIndex` of the right argument must be completely used
+in the join and must be a subset of the indices in the left argument.
.. ipython:: python
@@ -1115,9 +816,6 @@ the left argument, as in this example:
left.join(right, on=["abc", "xy"], how="inner")
-If that condition is not satisfied, a join with two multi-indexes can be
-done using the following code.
-
.. ipython:: python
leftindex = pd.MultiIndex.from_tuples(
@@ -1137,6 +835,7 @@ done using the following code.
result = pd.merge(
left.reset_index(), right.reset_index(), on=["key"], how="inner"
).set_index(["key", "X", "Y"])
+ result
.. ipython:: python
:suppress:
@@ -1152,7 +851,7 @@ Merging on a combination of columns and index levels
Strings passed as the ``on``, ``left_on``, and ``right_on`` parameters
may refer to either column names or index level names. This enables merging
-``DataFrame`` instances on a combination of index levels and columns without
+:class:`DataFrame` instances on a combination of index levels and columns without
resetting indexes.
.. ipython:: python
@@ -1180,6 +879,7 @@ resetting indexes.
)
result = left.merge(right, on=["key1", "key2"])
+ result
.. ipython:: python
:suppress:
@@ -1190,76 +890,23 @@ resetting indexes.
.. note::
- When DataFrames are merged on a string that matches an index level in both
- frames, the index level is preserved as an index level in the resulting
- DataFrame.
-
-.. note::
- When DataFrames are merged using only some of the levels of a ``MultiIndex``,
- the extra levels will be dropped from the resulting merge. In order to
- preserve those levels, use ``reset_index`` on those level names to move
- those levels to columns prior to doing the merge.
+ When :class:`DataFrame` are joined on a string that matches an index level in both
+ arguments, the index level is preserved as an index level in the resulting
+ :class:`DataFrame`.
.. note::
- If a string matches both a column name and an index level name, then a
- warning is issued and the column takes precedence. This will result in an
- ambiguity error in a future version.
-
-Overlapping value columns
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The merge ``suffixes`` argument takes a tuple of list of strings to append to
-overlapping column names in the input ``DataFrame``\ s to disambiguate the result
-columns:
-
-.. ipython:: python
-
- left = pd.DataFrame({"k": ["K0", "K1", "K2"], "v": [1, 2, 3]})
- right = pd.DataFrame({"k": ["K0", "K0", "K3"], "v": [4, 5, 6]})
-
- result = pd.merge(left, right, on="k")
-
-.. ipython:: python
- :suppress:
-
- @savefig merging_merge_overlapped.png
- p.plot([left, right], result, labels=["left", "right"], vertical=False);
- plt.close("all");
-
-.. ipython:: python
-
- result = pd.merge(left, right, on="k", suffixes=("_l", "_r"))
-
-.. ipython:: python
- :suppress:
-
- @savefig merging_merge_overlapped_suffix.png
- p.plot([left, right], result, labels=["left", "right"], vertical=False);
- plt.close("all");
-
-:meth:`DataFrame.join` has ``lsuffix`` and ``rsuffix`` arguments which behave
-similarly.
-
-.. ipython:: python
-
- left = left.set_index("k")
- right = right.set_index("k")
- result = left.join(right, lsuffix="_l", rsuffix="_r")
-
-.. ipython:: python
- :suppress:
-
- @savefig merging_merge_overlapped_multi_suffix.png
- p.plot([left, right], result, labels=["left", "right"], vertical=False);
- plt.close("all");
+ When :class:`DataFrame` are joined using only some of the levels of a :class:`MultiIndex`,
+ the extra levels will be dropped from the resulting join. To
+ preserve those levels, use :meth:`DataFrame.reset_index` on those level
+ names to move those levels to columns prior to the join.
.. _merging.multiple_join:
-Joining multiple DataFrames
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Joining multiple :class:`DataFrame`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-A list or tuple of ``DataFrames`` can also be passed to :meth:`~DataFrame.join`
+A list or tuple of :class:`DataFrame` objects can also be passed to :meth:`~DataFrame.join`
to join them together on their indexes.
.. ipython:: python
@@ -1281,12 +928,12 @@ to join them together on their indexes.
.. _merging.combine_first.update:
-Merging together values within Series or DataFrame columns
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+:meth:`DataFrame.combine_first`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Another fairly common situation is to have two like-indexed (or similarly
-indexed) ``Series`` or ``DataFrame`` objects and wanting to "patch" values in
-one object from values for matching indices in the other. Here is an example:
+:meth:`DataFrame.combine_first` updates missing values from one :class:`DataFrame`
+with the non-missing values in another :class:`DataFrame` in the corresponding
+location.
.. ipython:: python
@@ -1294,12 +941,8 @@ one object from values for matching indices in the other. Here is an example:
[[np.nan, 3.0, 5.0], [-4.6, np.nan, np.nan], [np.nan, 7.0, np.nan]]
)
df2 = pd.DataFrame([[-42.6, np.nan, -8.2], [-5.0, 1.6, 4]], index=[1, 2])
-
-For this, use the :meth:`~DataFrame.combine_first` method:
-
-.. ipython:: python
-
result = df1.combine_first(df2)
+ result
.. ipython:: python
:suppress:
@@ -1308,39 +951,13 @@ For this, use the :meth:`~DataFrame.combine_first` method:
p.plot([df1, df2], result, labels=["df1", "df2"], vertical=False);
plt.close("all");
-Note that this method only takes values from the right ``DataFrame`` if they are
-missing in the left ``DataFrame``. A related method, :meth:`~DataFrame.update`,
-alters non-NA values in place:
-
-.. ipython:: python
- :suppress:
-
- df1_copy = df1.copy()
-
-.. ipython:: python
-
- df1.update(df2)
-
-.. ipython:: python
- :suppress:
-
- @savefig merging_update.png
- p.plot([df1_copy, df2], df1, labels=["df1", "df2"], vertical=False);
- plt.close("all");
-
-.. _merging.time_series:
-
-Timeseries friendly merging
----------------------------
-
.. _merging.merge_ordered:
-Merging ordered data
-~~~~~~~~~~~~~~~~~~~~
+:func:`merge_ordered`
+---------------------
-A :func:`merge_ordered` function allows combining time series and other
-ordered data. In particular it has an optional ``fill_method`` keyword to
-fill/interpolate missing data:
+:func:`merge_ordered` combines ordered data such as numeric or time series data
+with optional filling of missing data with ``fill_method``.
.. ipython:: python
@@ -1354,19 +971,16 @@ fill/interpolate missing data:
.. _merging.merge_asof:
-Merging asof
-~~~~~~~~~~~~
-
-A :func:`merge_asof` is similar to an ordered left-join except that we match on
-nearest key rather than equal keys. For each row in the ``left`` ``DataFrame``,
-we select the last row in the ``right`` ``DataFrame`` whose ``on`` key is less
-than the left's key. Both DataFrames must be sorted by the key.
+:func:`merge_asof`
+---------------------
-Optionally an asof merge can perform a group-wise merge. This matches the
-``by`` key equally, in addition to the nearest match on the ``on`` key.
+:func:`merge_asof` is similar to an ordered left-join except that matches are on the
+nearest key rather than equal keys. For each row in the ``left`` :class:`DataFrame`,
+the last row in the ``right`` :class:`DataFrame` is selected where the ``on`` key is less
+than the left's key. Both :class:`DataFrame` must be sorted by the key.
-For example; we might have ``trades`` and ``quotes`` and we want to ``asof``
-merge them.
+Optionally, :func:`merge_asof` can perform a group-wise merge by matching the
+``by`` key in addition to the nearest match on the ``on`` key.
.. ipython:: python
@@ -1408,25 +1022,17 @@ merge them.
},
columns=["time", "ticker", "bid", "ask"],
)
-
-.. ipython:: python
-
trades
quotes
-
-By default we are taking the asof of the quotes.
-
-.. ipython:: python
-
pd.merge_asof(trades, quotes, on="time", by="ticker")
-We only asof within ``2ms`` between the quote time and the trade time.
+:func:`merge_asof` only merges within ``2ms`` between the quote time and the trade time.
.. ipython:: python
pd.merge_asof(trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("2ms"))
-We only asof within ``10ms`` between the quote time and the trade time and we
+:func:`merge_asof` only merges within ``10ms`` between the quote time and the trade time and we
exclude exact matches on time. Note that though we exclude the exact matches
(of the quotes), prior quotes **do** propagate to that point in time.
@@ -1443,14 +1049,11 @@ exclude exact matches on time. Note that though we exclude the exact matches
.. _merging.compare:
-Comparing objects
------------------
-
-The :meth:`~Series.compare` and :meth:`~DataFrame.compare` methods allow you to
-compare two DataFrame or Series, respectively, and summarize their differences.
+:meth:`~Series.compare`
+-----------------------
-For example, you might want to compare two ``DataFrame`` and stack their differences
-side by side.
+The :meth:`Series.compare` and :meth:`DataFrame.compare` methods allow you to
+compare two :class:`DataFrame` or :class:`Series`, respectively, and summarize their differences.
.. ipython:: python
@@ -1463,36 +1066,29 @@ side by side.
columns=["col1", "col2", "col3"],
)
df
-
-.. ipython:: python
-
df2 = df.copy()
df2.loc[0, "col1"] = "c"
df2.loc[2, "col3"] = 4.0
df2
-
-.. ipython:: python
-
df.compare(df2)
By default, if two corresponding values are equal, they will be shown as ``NaN``.
Furthermore, if all values in an entire row / column, the row / column will be
omitted from the result. The remaining differences will be aligned on columns.
-If you wish, you may choose to stack the differences on rows.
+Stack the differences on rows.
.. ipython:: python
df.compare(df2, align_axis=0)
-If you wish to keep all original rows and columns, set ``keep_shape`` argument
-to ``True``.
+Keep all original rows and columns with ``keep_shape=True``.
.. ipython:: python
df.compare(df2, keep_shape=True)
-You may also keep all the original values even if they are equal.
+Keep all the original values even if they are equal.
.. ipython:: python
diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst
index 9d645c185d3b5..4a2aa565dd15c 100644
--- a/doc/source/user_guide/missing_data.rst
+++ b/doc/source/user_guide/missing_data.rst
@@ -6,350 +6,462 @@
Working with missing data
*************************
-In this section, we will discuss missing (also referred to as NA) values in
-pandas.
+Values considered "missing"
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. note::
+pandas uses different sentinel values to represent a missing value (also referred to as NA)
+depending on the data type.
- The choice of using ``NaN`` internally to denote missing data was largely
- for simplicity and performance reasons.
- Starting from pandas 1.0, some optional data types start experimenting
- with a native ``NA`` scalar using a mask-based approach. See
- :ref:`here ` for more.
+``numpy.nan`` for NumPy data types. The disadvantage of using NumPy data types
+is that the original data type will be coerced to ``np.float64`` or ``object``.
-See the :ref:`cookbook` for some advanced strategies.
+.. ipython:: python
-Values considered "missing"
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ pd.Series([1, 2], dtype=np.int64).reindex([0, 1, 2])
+ pd.Series([True, False], dtype=np.bool_).reindex([0, 1, 2])
+
+:class:`NaT` for NumPy ``np.datetime64``, ``np.timedelta64``, and :class:`PeriodDtype`. For typing applications,
+use :class:`api.types.NaTType`.
-As data comes in many shapes and forms, pandas aims to be flexible with regard
-to handling missing data. While ``NaN`` is the default missing value marker for
-reasons of computational speed and convenience, we need to be able to easily
-detect this value with data of different types: floating point, integer,
-boolean, and general object. In many cases, however, the Python ``None`` will
-arise and we wish to also consider that "missing" or "not available" or "NA".
+.. ipython:: python
+
+ pd.Series([1, 2], dtype=np.dtype("timedelta64[ns]")).reindex([0, 1, 2])
+ pd.Series([1, 2], dtype=np.dtype("datetime64[ns]")).reindex([0, 1, 2])
+ pd.Series(["2020", "2020"], dtype=pd.PeriodDtype("D")).reindex([0, 1, 2])
-.. _missing.isna:
+:class:`NA` for :class:`StringDtype`, :class:`Int64Dtype` (and other bit widths),
+:class:`Float64Dtype` (and other bit widths), :class:`BooleanDtype` and :class:`ArrowDtype`.
+These types will maintain the original data type of the data.
+For typing applications, use :class:`api.types.NAType`.
.. ipython:: python
- df = pd.DataFrame(
- np.random.randn(5, 3),
- index=["a", "c", "e", "f", "h"],
- columns=["one", "two", "three"],
- )
- df["four"] = "bar"
- df["five"] = df["one"] > 0
- df
- df2 = df.reindex(["a", "b", "c", "d", "e", "f", "g", "h"])
- df2
+ pd.Series([1, 2], dtype="Int64").reindex([0, 1, 2])
+ pd.Series([True, False], dtype="boolean[pyarrow]").reindex([0, 1, 2])
-To make detecting missing values easier (and across different array dtypes),
-pandas provides the :func:`isna` and
-:func:`notna` functions, which are also methods on
-Series and DataFrame objects:
+To detect these missing values, use the :func:`isna` or :func:`notna` methods.
.. ipython:: python
- df2["one"]
- pd.isna(df2["one"])
- df2["four"].notna()
- df2.isna()
+ ser = pd.Series([pd.Timestamp("2020-01-01"), pd.NaT])
+ ser
+ pd.isna(ser)
+
+
+.. note::
+
+ :func:`isna` or :func:`notna` will also consider ``None`` a missing value.
+
+ .. ipython:: python
+
+ ser = pd.Series([1, None], dtype=object)
+ ser
+ pd.isna(ser)
.. warning::
- One has to be mindful that in Python (and NumPy), the ``nan's`` don't compare equal, but ``None's`` **do**.
- Note that pandas/NumPy uses the fact that ``np.nan != np.nan``, and treats ``None`` like ``np.nan``.
+ Equality comparisons between ``np.nan``, :class:`NaT`, and :class:`NA`
+ do not behave like ``None``, which compares equal to itself:
.. ipython:: python
None == None # noqa: E711
np.nan == np.nan
+ pd.NaT == pd.NaT
+ pd.NA == pd.NA
- So as compared to above, a scalar equality comparison versus a ``None/np.nan`` doesn't provide useful information.
+ Therefore, an equality comparison between a :class:`DataFrame` or :class:`Series`
+ and one of these missing values does not provide the same information as
+ :func:`isna` or :func:`notna`.
.. ipython:: python
- df2["one"] == np.nan
+ ser = pd.Series([True, None], dtype="boolean[pyarrow]")
+ ser == pd.NA
+ pd.isna(ser)
+
-Integer dtypes and missing data
--------------------------------
+.. _missing_data.NA:
+
+:class:`NA` semantics
+~~~~~~~~~~~~~~~~~~~~~
+
+.. warning::
-Because ``NaN`` is a float, a column of integers with even one missing values
-is cast to floating-point dtype (see :ref:`gotchas.intna` for more). pandas
-provides a nullable integer array, which can be used by explicitly requesting
-the dtype:
+ Experimental: the behaviour of :class:`NA` can still change without warning.
+
+Starting from pandas 1.0, an experimental :class:`NA` value (singleton) is
+available to represent scalar missing values. The goal of :class:`NA` is to provide a
+"missing" indicator that can be used consistently across data types
+(instead of ``np.nan``, ``None`` or ``pd.NaT`` depending on the data type).
+
+For example, a :class:`Series` with the nullable integer dtype will use :class:`NA`
+for its missing values:
.. ipython:: python
- pd.Series([1, 2, np.nan, 4], dtype=pd.Int64Dtype())
+ s = pd.Series([1, 2, None], dtype="Int64")
+ s
+ s[2]
+ s[2] is pd.NA
-Alternatively, the string alias ``dtype='Int64'`` (note the capital ``"I"``) can be
-used.
+Currently, pandas does not yet use these :class:`NA`-based data types by default
+in a :class:`DataFrame` or :class:`Series`, so you need to specify
+the dtype explicitly. An easy way to convert to those dtypes is explained in the
+:ref:`conversion section `.
-See :ref:`integer_na` for more.
+Propagation in arithmetic and comparison operations
+---------------------------------------------------
-Datetimes
----------
-.. note::
- If you are adding type checking to your application, you may need access to ``NaTType`` and ``NAType``.
+In general, missing values *propagate* in operations involving :class:`NA`. When
+one of the operands is unknown, the outcome of the operation is also unknown.
-For datetime64[ns] types, ``NaT`` represents missing values. This is a pseudo-native
-sentinel value that can be represented by NumPy in a singular dtype (datetime64[ns]).
-pandas objects provide compatibility between ``NaT`` and ``NaN``.
+For example, :class:`NA` propagates in arithmetic operations, similarly to
+``np.nan``:
.. ipython:: python
- df2 = df.copy()
- df2["timestamp"] = pd.Timestamp("20120101")
- df2
- df2.loc[["a", "c", "h"], ["one", "timestamp"]] = np.nan
- df2
- df2.dtypes.value_counts()
+ pd.NA + 1
+ "a" * pd.NA
-.. _missing.inserting:
+There are a few special cases when the result is known, even when one of the
+operands is ``NA``.
-Inserting missing data
-~~~~~~~~~~~~~~~~~~~~~~
+.. ipython:: python
-You can insert missing values by simply assigning to containers. The
-actual missing value used will be chosen based on the dtype.
+ pd.NA ** 0
+ 1 ** pd.NA
-For example, numeric containers will always use ``NaN`` regardless of
-the missing value type chosen:
+In equality and comparison operations, :class:`NA` also propagates. This deviates
+from the behaviour of ``np.nan``, where comparisons with ``np.nan`` always
+return ``False``.
.. ipython:: python
- s = pd.Series([1., 2., 3.])
- s.loc[0] = None
- s
-
-Likewise, datetime containers will always use ``NaT``.
+ pd.NA == 1
+ pd.NA == pd.NA
+ pd.NA < 2.5
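+
+For contrast, a brief sketch of the ``np.nan`` behaviour described above, where
+comparisons always evaluate to ``False``:
+
+.. ipython:: python
+
+ np.nan == 1
+ np.nan == np.nan
+ np.nan < 2.5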
-For object containers, pandas will use the value given:
+To check if a value is equal to :class:`NA`, use :func:`isna`
.. ipython:: python
- s = pd.Series(["a", "b", "c"])
- s.loc[0] = None
- s.loc[1] = np.nan
- s
+ pd.isna(pd.NA)
-.. _missing_data.calculations:
-Calculations with missing data
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. note::
+
+ An exception to this basic propagation rule is *reductions* (such as the
+ mean or the minimum), where pandas defaults to skipping missing values. See the
+ :ref:`calculation section ` for more.
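+
+ As a one-line illustrative sketch of that default, the missing value is simply
+ skipped when summing:
+
+ .. ipython:: python
+
+  # the None entry becomes <NA> and is ignored by the reduction
+  pd.Series([1, 2, None], dtype="Int64").sum()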
+
+Logical operations
+------------------
+
+For logical operations, :class:`NA` follows the rules of the
+`three-valued logic `__ (or
+*Kleene logic*, similarly to R, SQL and Julia). This logic means that missing
+values only propagate when logically required.
-Missing values propagate naturally through arithmetic operations between pandas
-objects.
+For example, for the logical "or" operation (``|``), if one of the operands
+is ``True``, we already know the result will be ``True``, regardless of the
+other value (so regardless of whether the missing value is ``True`` or ``False``).
+In this case, :class:`NA` does not propagate:
.. ipython:: python
- df = df2.loc[:, ["one", "two", "three"]]
- a = df2.loc[df2.index[:5], ["one", "two"]].ffill()
- b = df2.loc[df2.index[:5], ["one", "two", "three"]]
- a
- b
- a + b
+ True | False
+ True | pd.NA
+ pd.NA | True
-The descriptive statistics and computational methods discussed in the
-:ref:`data structure overview ` (and listed :ref:`here
-` and :ref:`here `) are all written to
-account for missing data. For example:
+On the other hand, if one of the operands is ``False``, the result depends
+on the value of the other operand. Therefore, in this case :class:`NA`
+propagates:
-* When summing data, NA (missing) values will be treated as zero.
-* If the data are all NA, the result will be 0.
-* Cumulative methods like :meth:`~DataFrame.cumsum` and :meth:`~DataFrame.cumprod` ignore NA values by default, but preserve them in the resulting arrays. To override this behaviour and include NA values, use ``skipna=False``.
+.. ipython:: python
+
+ False | True
+ False | False
+ False | pd.NA
+
+The behaviour of the logical "and" operation (``&``) can be derived using
+similar logic (where now :class:`NA` will not propagate if one of the operands
+is already ``False``):
.. ipython:: python
- df
- df["one"].sum()
- df.mean(1)
- df.cumsum()
- df.cumsum(skipna=False)
+ False & True
+ False & False
+ False & pd.NA
+.. ipython:: python
-.. _missing_data.numeric_sum:
+ True & True
+ True & False
+ True & pd.NA
-Sum/prod of empties/nans
-~~~~~~~~~~~~~~~~~~~~~~~~
-The sum of an empty or all-NA Series or column of a DataFrame is 0.
+``NA`` in a boolean context
+---------------------------
+
+Since the actual value of an NA is unknown, it is ambiguous to convert NA
+to a boolean value.
.. ipython:: python
+ :okexcept:
- pd.Series([np.nan]).sum()
+ bool(pd.NA)
- pd.Series([], dtype="float64").sum()
+This also means that :class:`NA` cannot be used in a context where it is
+evaluated to a boolean, such as ``if condition: ...`` where ``condition`` can
+potentially be :class:`NA`. In such cases, :func:`isna` can be used to check
+for :class:`NA`, or the possibility of ``condition`` being :class:`NA` can be
+avoided, for example by filling missing values beforehand.
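+
+As a minimal sketch of these two options (``condition`` is just an illustrative
+name for a possibly-missing scalar), check for :class:`NA` explicitly or fill it first:
+
+.. ipython:: python
+
+ condition = pd.NA
+ # option 1: test for missingness before branching
+ if pd.isna(condition):
+     print("condition is NA; handle it explicitly")
+ # option 2: fill the missing value so the branch sees a real boolean
+ ser = pd.Series([True, pd.NA], dtype="boolean")
+ if ser.fillna(False).any():
+     print("at least one True after filling NA with False")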
+
+A similar situation occurs when using :class:`Series` or :class:`DataFrame` objects in ``if``
+statements, see :ref:`gotchas.truth`.
+
+NumPy ufuncs
+------------
-The product of an empty or all-NA Series or column of a DataFrame is 1.
+:attr:`pandas.NA` implements NumPy's ``__array_ufunc__`` protocol. Most ufuncs
+work with ``NA``, and generally return ``NA``:
.. ipython:: python
- pd.Series([np.nan]).prod()
+ np.log(pd.NA)
+ np.add(pd.NA, 1)
- pd.Series([], dtype="float64").prod()
+.. warning::
+ Currently, ufuncs involving an ndarray and ``NA`` will return an
+ object-dtype filled with NA values.
-NA values in GroupBy
-~~~~~~~~~~~~~~~~~~~~
+ .. ipython:: python
+
+ a = np.array([1, 2, 3])
+ np.greater(a, pd.NA)
+
+ The return type here may change to return a different array type
+ in the future.
+
+See :ref:`dsintro.numpy_interop` for more on ufuncs.
-NA groups in GroupBy are automatically excluded. This behavior is consistent
-with R, for example:
+.. _missing_data.NA.conversion:
+
+Conversion
+^^^^^^^^^^
+
+If you have a :class:`DataFrame` or :class:`Series` using ``np.nan``,
+:meth:`Series.convert_dtypes` and :meth:`DataFrame.convert_dtypes`
+can convert data to use the data types that use :class:`NA`,
+such as :class:`Int64Dtype` or :class:`ArrowDtype`. This is especially helpful after reading
+in data sets from IO methods where data types were inferred.
+
+In this example, the data types of both columns are converted to dtypes that
+support :class:`NA`.
.. ipython:: python
- df
- df.groupby("one").mean()
+ import io
+ data = io.StringIO("a,b\n,True\n2,")
+ df = pd.read_csv(data)
+ df.dtypes
+ df_conv = df.convert_dtypes()
+ df_conv
+ df_conv.dtypes
-See the groupby section :ref:`here ` for more information.
+.. _missing.inserting:
-Cleaning / filling missing data
---------------------------------
+Inserting missing data
+~~~~~~~~~~~~~~~~~~~~~~
-pandas objects are equipped with various data manipulation methods for dealing
-with missing data.
+You can insert missing values by simply assigning to a :class:`Series` or :class:`DataFrame`.
+The missing value sentinel used will be chosen based on the dtype.
-.. _missing_data.fillna:
+.. ipython:: python
+
+ ser = pd.Series([1., 2., 3.])
+ ser.loc[0] = None
+ ser
+
+ ser = pd.Series([pd.Timestamp("2021"), pd.Timestamp("2021")])
+ ser.iloc[0] = np.nan
+ ser
+
+ ser = pd.Series([True, False], dtype="boolean[pyarrow]")
+ ser.iloc[0] = None
+ ser
+
+For ``object`` types, pandas will use the value given:
+
+.. ipython:: python
-Filling missing values: fillna
+ s = pd.Series(["a", "b", "c"], dtype=object)
+ s.loc[0] = None
+ s.loc[1] = np.nan
+ s
+
+.. _missing_data.calculations:
+
+Calculations with missing data
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-:meth:`~DataFrame.fillna` can "fill in" NA values with non-NA data in a couple
-of ways, which we illustrate:
+Missing values propagate through arithmetic operations between pandas objects.
+
+.. ipython:: python
+
+ ser1 = pd.Series([np.nan, np.nan, 2, 3])
+ ser2 = pd.Series([np.nan, 1, np.nan, 4])
+ ser1
+ ser2
+ ser1 + ser2
+
+The descriptive statistics and computational methods discussed in the
+:ref:`data structure overview ` (and listed :ref:`here
+` and :ref:`here `) are all written to
+account for missing data.
+
+When summing data, NA values or empty data will be treated as zero.
-**Replace NA with a scalar value**
+.. ipython:: python
+
+ pd.Series([np.nan]).sum()
+ pd.Series([], dtype="float64").sum()
+
+When taking the product, NA values or empty data will be treated as 1.
.. ipython:: python
- df2
- df2.fillna(0)
- df2["one"].fillna("missing")
+ pd.Series([np.nan]).prod()
+ pd.Series([], dtype="float64").prod()
+
+Cumulative methods like :meth:`~DataFrame.cumsum` and :meth:`~DataFrame.cumprod`
+ignore NA values by default, but preserve them in the result. To override this
+behaviour and include NA values, use ``skipna=False``.
-**Fill gaps forward or backward**
-Using the same filling arguments as :ref:`reindexing `, we
-can propagate non-NA values forward or backward:
.. ipython:: python
- df
- df.ffill()
+ ser = pd.Series([1, np.nan, 3, np.nan])
+ ser
+ ser.cumsum()
+ ser.cumsum(skipna=False)
-.. _missing_data.fillna.limit:
+.. _missing_data.dropna:
-**Limit the amount of filling**
+Dropping missing data
+~~~~~~~~~~~~~~~~~~~~~
-If we only want consecutive gaps filled up to a certain number of data points,
-we can use the ``limit`` keyword:
+:meth:`~DataFrame.dropna` drops rows or columns with missing data.
.. ipython:: python
- df.iloc[2:4, :] = np.nan
+ df = pd.DataFrame([[np.nan, 1, 2], [1, 2, np.nan], [1, 2, 3]])
df
- df.ffill(limit=1)
+ df.dropna()
+ df.dropna(axis=1)
-To remind you, these are the available filling methods:
+ ser = pd.Series([1, pd.NA], dtype="int64[pyarrow]")
+ ser.dropna()
-.. csv-table::
- :header: "Method", "Action"
- :widths: 30, 50
+Filling missing data
+~~~~~~~~~~~~~~~~~~~~
- pad / ffill, Fill values forward
- bfill / backfill, Fill values backward
+.. _missing_data.fillna:
-With time series data, using pad/ffill is extremely common so that the "last
-known value" is available at every time point.
+Filling by value
+----------------
-:meth:`~DataFrame.ffill` is equivalent to ``fillna(method='ffill')``
-and :meth:`~DataFrame.bfill` is equivalent to ``fillna(method='bfill')``
+:meth:`~DataFrame.fillna` replaces NA values with non-NA data.
-.. _missing_data.PandasObject:
+Replace NA with a scalar value
-Filling with a PandasObject
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. ipython:: python
-You can also fillna using a dict or Series that is alignable. The labels of the dict or index of the Series
-must match the columns of the frame you wish to fill. The
-use case of this is to fill a DataFrame with the mean of that column.
+ data = {"np": [1.0, np.nan, np.nan, 2], "arrow": pd.array([1.0, pd.NA, pd.NA, 2], dtype="float64[pyarrow]")}
+ df = pd.DataFrame(data)
+ df
+ df.fillna(0)
+
+Fill gaps forward or backward
.. ipython:: python
- dff = pd.DataFrame(np.random.randn(10, 3), columns=list("ABC"))
- dff.iloc[3:5, 0] = np.nan
- dff.iloc[4:6, 1] = np.nan
- dff.iloc[5:8, 2] = np.nan
- dff
+ df.ffill()
+ df.bfill()
- dff.fillna(dff.mean())
- dff.fillna(dff.mean()["B":"C"])
+.. _missing_data.fillna.limit:
-Same result as above, but is aligning the 'fill' value which is
-a Series in this case.
+Limit the number of NA values filled
.. ipython:: python
- dff.where(pd.notna(dff), dff.mean(), axis="columns")
+ df.ffill(limit=1)
+NA values can be replaced with the corresponding value from a :class:`Series` or :class:`DataFrame`
+where the index and column align between the original object and the filled object.
-.. _missing_data.dropna:
+.. ipython:: python
-Dropping axis labels with missing data: dropna
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ dff = pd.DataFrame(np.arange(30, dtype=np.float64).reshape(10, 3), columns=list("ABC"))
+ dff.iloc[3:5, 0] = np.nan
+ dff.iloc[4:6, 1] = np.nan
+ dff.iloc[5:8, 2] = np.nan
+ dff
+ dff.fillna(dff.mean())
-You may wish to simply exclude labels from a data set which refer to missing
-data. To do this, use :meth:`~DataFrame.dropna`:
+.. note::
-.. ipython:: python
+ :meth:`DataFrame.where` can also be used to fill NA values. The following gives the same result as above.
- df["two"] = df["two"].fillna(0)
- df["three"] = df["three"].fillna(0)
- df
- df.dropna(axis=0)
- df.dropna(axis=1)
- df["one"].dropna()
+ .. ipython:: python
+
+ dff.where(pd.notna(dff), dff.mean(), axis="columns")
-An equivalent :meth:`~Series.dropna` is available for Series.
-DataFrame.dropna has considerably more options than Series.dropna, which can be
-examined :ref:`in the API `.
.. _missing_data.interpolate:
Interpolation
-~~~~~~~~~~~~~
+-------------
-Both Series and DataFrame objects have :meth:`~DataFrame.interpolate`
-that, by default, performs linear interpolation at missing data points.
+:meth:`DataFrame.interpolate` and :meth:`Series.interpolate` fill NA values
+using various interpolation methods.
.. ipython:: python
- np.random.seed(123456)
- idx = pd.date_range("1/1/2000", periods=100, freq="BM")
- ts = pd.Series(np.random.randn(100), index=idx)
- ts[1:5] = np.nan
- ts[20:30] = np.nan
- ts[60:80] = np.nan
- ts = ts.cumsum()
+ df = pd.DataFrame(
+ {
+ "A": [1, 2.1, np.nan, 4.7, 5.6, 6.8],
+ "B": [0.25, np.nan, np.nan, 4, 12.2, 14.4],
+ }
+ )
+ df
+ df.interpolate()
+
+ idx = pd.date_range("2020-01-01", periods=10, freq="D")
+ data = np.random.default_rng(2).integers(0, 10, 10).astype(np.float64)
+ ts = pd.Series(data, index=idx)
+ ts.iloc[[1, 2, 5, 6, 9]] = np.nan
ts
- ts.count()
@savefig series_before_interpolate.png
ts.plot()
.. ipython:: python
ts.interpolate()
- ts.interpolate().count()
-
@savefig series_interpolate.png
ts.interpolate().plot()
-Index aware interpolation is available via the ``method`` keyword:
+Interpolation relative to a :class:`Timestamp` in the :class:`DatetimeIndex`
+is available by setting ``method="time"``:
.. ipython:: python
- ts2 = ts.iloc[[0, 1, 30, 60, 99]]
+ ts2 = ts.iloc[[0, 1, 3, 7, 9]]
ts2
ts2.interpolate()
ts2.interpolate(method="time")
@@ -360,46 +472,36 @@ For a floating-point index, use ``method='values'``:
idx = [0.0, 1.0, 10.0]
ser = pd.Series([0.0, np.nan, 10.0], idx)
-
ser
ser.interpolate()
ser.interpolate(method="values")
-You can also interpolate with a DataFrame:
-
-.. ipython:: python
-
- df = pd.DataFrame(
- {
- "A": [1, 2.1, np.nan, 4.7, 5.6, 6.8],
- "B": [0.25, np.nan, np.nan, 4, 12.2, 14.4],
- }
- )
- df
- df.interpolate()
-
-The ``method`` argument gives access to fancier interpolation methods.
If you have scipy_ installed, you can pass the name of a 1-d interpolation routine to ``method``.
-You'll want to consult the full scipy interpolation documentation_ and reference guide_ for details.
-The appropriate interpolation method will depend on the type of data you are working with.
+as specified in the scipy interpolation documentation_ and reference guide_.
+The appropriate interpolation method will depend on the data type.
-* If you are dealing with a time series that is growing at an increasing rate,
- ``method='quadratic'`` may be appropriate.
-* If you have values approximating a cumulative distribution function,
- then ``method='pchip'`` should work well.
-* To fill missing values with goal of smooth plotting, consider ``method='akima'``.
+.. tip::
-.. warning::
+ If you are dealing with a time series that is growing at an increasing rate,
+ use ``method='barycentric'``.
- These methods require ``scipy``.
+ If you have values approximating a cumulative distribution function,
+ use ``method='pchip'``.
-.. ipython:: python
+ To fill missing values with the goal of smooth plotting, use ``method='akima'``.
- df.interpolate(method="barycentric")
-
- df.interpolate(method="pchip")
+ .. ipython:: python
- df.interpolate(method="akima")
+ df = pd.DataFrame(
+ {
+ "A": [1, 2.1, np.nan, 4.7, 5.6, 6.8],
+ "B": [0.25, np.nan, np.nan, 4, 12.2, 14.4],
+ }
+ )
+ df
+ df.interpolate(method="barycentric")
+ df.interpolate(method="pchip")
+ df.interpolate(method="akima")
When interpolating via a polynomial or spline approximation, you must also specify
the degree or order of the approximation:
@@ -407,10 +509,9 @@ the degree or order of the approximation:
.. ipython:: python
df.interpolate(method="spline", order=2)
-
df.interpolate(method="polynomial", order=2)
-Compare several methods:
+Comparing several methods.
.. ipython:: python
@@ -425,11 +526,7 @@ Compare several methods:
@savefig compare_interpolations.png
df.plot()
-Another use case is interpolation at *new* values.
-Suppose you have 100 observations from some distribution. And let's suppose
-that you're particularly interested in what's happening around the middle.
-You can mix pandas' ``reindex`` and ``interpolate`` methods to interpolate
-at the new values.
+Interpolating new observations from expanding data with :meth:`Series.reindex`.
.. ipython:: python
@@ -447,21 +544,17 @@ at the new values.
.. _missing_data.interp_limits:
Interpolation limits
---------------------
+^^^^^^^^^^^^^^^^^^^^
-Like other pandas fill methods, :meth:`~DataFrame.interpolate` accepts a ``limit`` keyword
-argument. Use this argument to limit the number of consecutive ``NaN`` values
-filled since the last valid observation:
+:meth:`~DataFrame.interpolate` accepts a ``limit`` keyword
+argument to limit the number of consecutive ``NaN`` values
+filled since the last valid observation:
.. ipython:: python
ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, np.nan, 13, np.nan, np.nan])
ser
-
- # fill all consecutive values in a forward direction
ser.interpolate()
-
- # fill one consecutive value in a forward direction
ser.interpolate(limit=1)
By default, ``NaN`` values are filled in a ``forward`` direction. Use
@@ -469,17 +562,12 @@ By default, ``NaN`` values are filled in a ``forward`` direction. Use
.. ipython:: python
- # fill one consecutive value backwards
ser.interpolate(limit=1, limit_direction="backward")
-
- # fill one consecutive value in both directions
ser.interpolate(limit=1, limit_direction="both")
-
- # fill all consecutive values in both directions
ser.interpolate(limit_direction="both")
-By default, ``NaN`` values are filled whether they are inside (surrounded by)
-existing valid values, or outside existing valid values. The ``limit_area``
+By default, ``NaN`` values are filled whether they are surrounded by
+existing valid values or outside existing valid values. The ``limit_area``
parameter restricts filling to either inside or outside values.
.. ipython:: python
@@ -495,58 +583,46 @@ parameter restricts filling to either inside or outside values.
.. _missing_data.replace:
-Replacing generic values
-~~~~~~~~~~~~~~~~~~~~~~~~
-Often times we want to replace arbitrary values with other values.
-
-:meth:`~Series.replace` in Series and :meth:`~DataFrame.replace` in DataFrame provides an efficient yet
-flexible way to perform such replacements.
-
-For a Series, you can replace a single value or a list of values by another
-value:
-
-.. ipython:: python
-
- ser = pd.Series([0.0, 1.0, 2.0, 3.0, 4.0])
+Replacing values
+----------------
- ser.replace(0, 5)
-
-You can replace a list of values by a list of other values:
+:meth:`Series.replace` and :meth:`DataFrame.replace` can be used similarly to
+:meth:`Series.fillna` and :meth:`DataFrame.fillna` to replace or insert missing values.
.. ipython:: python
- ser.replace([0, 1, 2, 3, 4], [4, 3, 2, 1, 0])
+ df = pd.DataFrame(np.eye(3))
+ df
+ df_missing = df.replace(0, np.nan)
+ df_missing
+ df_filled = df_missing.replace(np.nan, 2)
+ df_filled
-You can also specify a mapping dict:
+Replacing more than one value is possible by passing a list.
.. ipython:: python
- ser.replace({0: 10, 1: 100})
+ df_filled.replace([1, 44], [2, 28])
-For a DataFrame, you can specify individual values by column:
+Replacing using a mapping dict.
.. ipython:: python
- df = pd.DataFrame({"a": [0, 1, 2, 3, 4], "b": [5, 6, 7, 8, 9]})
-
- df.replace({"a": 0, "b": 5}, 100)
+ df_filled.replace({1: 44, 2: 28})
.. _missing_data.replace_expression:
-String/regular expression replacement
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Regular expression replacement
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. note::
Python strings prefixed with the ``r`` character such as ``r'hello world'``
- are so-called "raw" strings. They have different semantics regarding
- backslashes than strings without this prefix. Backslashes in raw strings
- will be interpreted as an escaped backslash, e.g., ``r'\' == '\\'``. You
- should `read about them
- `__
- if this is unclear.
+ are `"raw" strings `_.
+ They have different semantics regarding backslashes than strings without this prefix.
+ Backslashes in raw strings are kept as literal backslashes, e.g., ``r'\n' == '\\n'``.
-Replace the '.' with ``NaN`` (str -> str):
+Replace the '.' with ``NaN``
.. ipython:: python
@@ -554,349 +630,47 @@ Replace the '.' with ``NaN`` (str -> str):
df = pd.DataFrame(d)
df.replace(".", np.nan)
-Now do it with a regular expression that removes surrounding whitespace
-(regex -> regex):
+Replace the '.' with ``NaN`` using a regular expression that removes surrounding whitespace
.. ipython:: python
df.replace(r"\s*\.\s*", np.nan, regex=True)
-Replace a few different values (list -> list):
-
-.. ipython:: python
-
- df.replace(["a", "."], ["b", np.nan])
-
-list of regex -> list of regex:
+Replace with a list of regexes.
.. ipython:: python
df.replace([r"\.", r"(a)"], ["dot", r"\1stuff"], regex=True)
-Only search in column ``'b'`` (dict -> dict):
-
-.. ipython:: python
-
- df.replace({"b": "."}, {"b": np.nan})
-
-Same as the previous example, but use a regular expression for
-searching instead (dict of regex -> dict):
+Replace with a regex in a mapping dict.
.. ipython:: python
df.replace({"b": r"\s*\.\s*"}, {"b": np.nan}, regex=True)
-You can pass nested dictionaries of regular expressions that use ``regex=True``:
+Pass nested dictionaries of regular expressions that use the ``regex`` keyword.
.. ipython:: python
df.replace({"b": {"b": r""}}, regex=True)
-
-Alternatively, you can pass the nested dictionary like so:
-
-.. ipython:: python
-
df.replace(regex={"b": {r"\s*\.\s*": np.nan}})
-
-You can also use the group of a regular expression match when replacing (dict
-of regex -> dict of regex), this works for lists as well.
-
-.. ipython:: python
-
df.replace({"b": r"\s*(\.)\s*"}, {"b": r"\1ty"}, regex=True)
-You can pass a list of regular expressions, of which those that match
-will be replaced with a scalar (list of regex -> regex).
+Pass a list of regular expressions that will replace matches with a scalar.
.. ipython:: python
- df.replace([r"\s*\.\s*", r"a|b"], np.nan, regex=True)
+ df.replace([r"\s*\.\s*", r"a|b"], "placeholder", regex=True)
All of the regular expression examples can also be passed with the
``to_replace`` argument as the ``regex`` argument. In this case the ``value``
argument must be passed explicitly by name or ``regex`` must be a nested
-dictionary. The previous example, in this case, would then be:
+dictionary.
.. ipython:: python
- df.replace(regex=[r"\s*\.\s*", r"a|b"], value=np.nan)
-
-This can be convenient if you do not want to pass ``regex=True`` every time you
-want to use a regular expression.
+ df.replace(regex=[r"\s*\.\s*", r"a|b"], value="placeholder")
.. note::
- Anywhere in the above ``replace`` examples that you see a regular expression
- a compiled regular expression is valid as well.
-
-Numeric replacement
-~~~~~~~~~~~~~~~~~~~
-
-:meth:`~DataFrame.replace` is similar to :meth:`~DataFrame.fillna`.
-
-.. ipython:: python
-
- df = pd.DataFrame(np.random.randn(10, 2))
- df[np.random.rand(df.shape[0]) > 0.5] = 1.5
- df.replace(1.5, np.nan)
-
-Replacing more than one value is possible by passing a list.
-
-.. ipython:: python
-
- df00 = df.iloc[0, 0]
- df.replace([1.5, df00], [np.nan, "a"])
- df[1].dtype
-
-Missing data casting rules and indexing
----------------------------------------
-
-While pandas supports storing arrays of integer and boolean type, these types
-are not capable of storing missing data. Until we can switch to using a native
-NA type in NumPy, we've established some "casting rules". When a reindexing
-operation introduces missing data, the Series will be cast according to the
-rules introduced in the table below.
-
-.. csv-table::
- :header: "data type", "Cast to"
- :widths: 40, 40
-
- integer, float
- boolean, object
- float, no cast
- object, no cast
-
-For example:
-
-.. ipython:: python
-
- s = pd.Series(np.random.randn(5), index=[0, 2, 4, 6, 7])
- s > 0
- (s > 0).dtype
- crit = (s > 0).reindex(list(range(8)))
- crit
- crit.dtype
-
-Ordinarily NumPy will complain if you try to use an object array (even if it
-contains boolean values) instead of a boolean array to get or set values from
-an ndarray (e.g. selecting values based on some criteria). If a boolean vector
-contains NAs, an exception will be generated:
-
-.. ipython:: python
- :okexcept:
-
- reindexed = s.reindex(list(range(8))).fillna(0)
- reindexed[crit]
-
-However, these can be filled in using :meth:`~DataFrame.fillna` and it will work fine:
-
-.. ipython:: python
-
- reindexed[crit.fillna(False)]
- reindexed[crit.fillna(True)]
-
-pandas provides a nullable integer dtype, but you must explicitly request it
-when creating the series or column. Notice that we use a capital "I" in
-the ``dtype="Int64"``.
-
-.. ipython:: python
-
- s = pd.Series([0, 1, np.nan, 3, 4], dtype="Int64")
- s
-
-See :ref:`integer_na` for more.
-
-
-.. _missing_data.NA:
-
-Experimental ``NA`` scalar to denote missing values
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. warning::
-
- Experimental: the behaviour of ``pd.NA`` can still change without warning.
-
-Starting from pandas 1.0, an experimental ``pd.NA`` value (singleton) is
-available to represent scalar missing values. At this moment, it is used in
-the nullable :doc:`integer `, boolean and
-:ref:`dedicated string ` data types as the missing value indicator.
-
-The goal of ``pd.NA`` is provide a "missing" indicator that can be used
-consistently across data types (instead of ``np.nan``, ``None`` or ``pd.NaT``
-depending on the data type).
-
-For example, when having missing values in a Series with the nullable integer
-dtype, it will use ``pd.NA``:
-
-.. ipython:: python
-
- s = pd.Series([1, 2, None], dtype="Int64")
- s
- s[2]
- s[2] is pd.NA
-
-Currently, pandas does not yet use those data types by default (when creating
-a DataFrame or Series, or when reading in data), so you need to specify
-the dtype explicitly. An easy way to convert to those dtypes is explained
-:ref:`here `.
-
-Propagation in arithmetic and comparison operations
----------------------------------------------------
-
-In general, missing values *propagate* in operations involving ``pd.NA``. When
-one of the operands is unknown, the outcome of the operation is also unknown.
-
-For example, ``pd.NA`` propagates in arithmetic operations, similarly to
-``np.nan``:
-
-.. ipython:: python
-
- pd.NA + 1
- "a" * pd.NA
-
-There are a few special cases when the result is known, even when one of the
-operands is ``NA``.
-
-.. ipython:: python
-
- pd.NA ** 0
- 1 ** pd.NA
-
-In equality and comparison operations, ``pd.NA`` also propagates. This deviates
-from the behaviour of ``np.nan``, where comparisons with ``np.nan`` always
-return ``False``.
-
-.. ipython:: python
-
- pd.NA == 1
- pd.NA == pd.NA
- pd.NA < 2.5
-
-To check if a value is equal to ``pd.NA``, the :func:`isna` function can be
-used:
-
-.. ipython:: python
-
- pd.isna(pd.NA)
-
-An exception on this basic propagation rule are *reductions* (such as the
-mean or the minimum), where pandas defaults to skipping missing values. See
-:ref:`above ` for more.
-
-Logical operations
-------------------
-
-For logical operations, ``pd.NA`` follows the rules of the
-`three-valued logic `__ (or
-*Kleene logic*, similarly to R, SQL and Julia). This logic means to only
-propagate missing values when it is logically required.
-
-For example, for the logical "or" operation (``|``), if one of the operands
-is ``True``, we already know the result will be ``True``, regardless of the
-other value (so regardless the missing value would be ``True`` or ``False``).
-In this case, ``pd.NA`` does not propagate:
-
-.. ipython:: python
-
- True | False
- True | pd.NA
- pd.NA | True
-
-On the other hand, if one of the operands is ``False``, the result depends
-on the value of the other operand. Therefore, in this case ``pd.NA``
-propagates:
-
-.. ipython:: python
-
- False | True
- False | False
- False | pd.NA
-
-The behaviour of the logical "and" operation (``&``) can be derived using
-similar logic (where now ``pd.NA`` will not propagate if one of the operands
-is already ``False``):
-
-.. ipython:: python
-
- False & True
- False & False
- False & pd.NA
-
-.. ipython:: python
-
- True & True
- True & False
- True & pd.NA
-
-
-``NA`` in a boolean context
----------------------------
-
-Since the actual value of an NA is unknown, it is ambiguous to convert NA
-to a boolean value. The following raises an error:
-
-.. ipython:: python
- :okexcept:
-
- bool(pd.NA)
-
-This also means that ``pd.NA`` cannot be used in a context where it is
-evaluated to a boolean, such as ``if condition: ...`` where ``condition`` can
-potentially be ``pd.NA``. In such cases, :func:`isna` can be used to check
-for ``pd.NA`` or ``condition`` being ``pd.NA`` can be avoided, for example by
-filling missing values beforehand.
-
-A similar situation occurs when using Series or DataFrame objects in ``if``
-statements, see :ref:`gotchas.truth`.
-
-NumPy ufuncs
-------------
-
-:attr:`pandas.NA` implements NumPy's ``__array_ufunc__`` protocol. Most ufuncs
-work with ``NA``, and generally return ``NA``:
-
-.. ipython:: python
-
- np.log(pd.NA)
- np.add(pd.NA, 1)
-
-.. warning::
-
- Currently, ufuncs involving an ndarray and ``NA`` will return an
- object-dtype filled with NA values.
-
- .. ipython:: python
-
- a = np.array([1, 2, 3])
- np.greater(a, pd.NA)
-
- The return type here may change to return a different array type
- in the future.
-
-See :ref:`dsintro.numpy_interop` for more on ufuncs.
-
-.. _missing_data.NA.conversion:
-
-Conversion
-----------
-
-If you have a DataFrame or Series using traditional types that have missing data
-represented using ``np.nan``, there are convenience methods
-:meth:`~Series.convert_dtypes` in Series and :meth:`~DataFrame.convert_dtypes`
-in DataFrame that can convert data to use the newer dtypes for integers, strings and
-booleans listed :ref:`here `. This is especially helpful after reading
-in data sets when letting the readers such as :meth:`read_csv` and :meth:`read_excel`
-infer default dtypes.
-
-In this example, while the dtypes of all columns are changed, we show the results for
-the first 10 columns.
-
-.. ipython:: python
-
- bb = pd.read_csv("data/baseball.csv", index_col="id")
- bb[bb.columns[:10]].dtypes
-
-.. ipython:: python
-
- bbn = bb.convert_dtypes()
- bbn[bbn.columns[:10]].dtypes
+ A regular expression object from ``re.compile`` is a valid input as well.
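+
+ For example, an illustrative sketch reusing the ``df`` from the examples above:
+
+ .. ipython:: python
+
+  import re
+  # a compiled pattern can be passed wherever a regex string is accepted
+  df.replace(regex=re.compile(r"\s*\.\s*"), value="placeholder")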
diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst
index fabe498cb3bdb..a35cfb396f1f6 100644
--- a/doc/source/user_guide/reshaping.rst
+++ b/doc/source/user_guide/reshaping.rst
@@ -136,7 +136,7 @@ Also, you can use :class:`Grouper` for ``index`` and ``columns`` keywords. For d
.. ipython:: python
- pd.pivot_table(df, values="D", index=pd.Grouper(freq="M", key="F"), columns="C")
+ pd.pivot_table(df, values="D", index=pd.Grouper(freq="ME", key="F"), columns="C")
.. _reshaping.pivot.margins:
@@ -480,7 +480,7 @@ The values can be cast to a different type using the ``dtype`` argument.
.. versionadded:: 1.5.0
-:func:`~pandas.from_dummies` coverts the output of :func:`~pandas.get_dummies` back into
+:func:`~pandas.from_dummies` converts the output of :func:`~pandas.get_dummies` back into
a :class:`Series` of categorical values from indicator values.
.. ipython:: python
diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst
index bc49c7f958cb7..b262de5d71439 100644
--- a/doc/source/user_guide/scale.rst
+++ b/doc/source/user_guide/scale.rst
@@ -40,7 +40,7 @@ Suppose our raw dataset on disk has many columns.
return df
timeseries = [
- make_timeseries(freq="1T", seed=i).rename(columns=lambda x: f"{x}_{i}")
+ make_timeseries(freq="1min", seed=i).rename(columns=lambda x: f"{x}_{i}")
for i in range(10)
]
ts_wide = pd.concat(timeseries, axis=1)
@@ -87,7 +87,7 @@ can store larger datasets in memory.
.. ipython:: python
:okwarning:
- ts = make_timeseries(freq="30S", seed=0)
+ ts = make_timeseries(freq="30s", seed=0)
ts.to_parquet("timeseries.parquet")
ts = pd.read_parquet("timeseries.parquet")
ts
@@ -173,7 +173,7 @@ files. Each file in the directory represents a different year of the entire data
pathlib.Path("data/timeseries").mkdir(exist_ok=True)
for i, (start, end) in enumerate(zip(starts, ends)):
- ts = make_timeseries(start=start, end=end, freq="1T", seed=i)
+ ts = make_timeseries(start=start, end=end, freq="1min", seed=i)
ts.to_parquet(f"data/timeseries/ts-{i:0>2d}.parquet")
diff --git a/doc/source/user_guide/timedeltas.rst b/doc/source/user_guide/timedeltas.rst
index a6eb96f91a4bf..5daf204f39bcf 100644
--- a/doc/source/user_guide/timedeltas.rst
+++ b/doc/source/user_guide/timedeltas.rst
@@ -390,9 +390,9 @@ The ``freq`` parameter can passed a variety of :ref:`frequency aliases `:
.. ipython:: python
- pd.date_range(start, periods=1000, freq="M")
+ pd.date_range(start, periods=1000, freq="ME")
pd.bdate_range(start, periods=250, freq="BQS")
@@ -461,7 +455,7 @@ of those specified will not be generated:
.. ipython:: python
- pd.date_range(start, end, freq="BM")
+ pd.date_range(start, end, freq="BME")
pd.date_range(start, end, freq="W")
@@ -557,7 +551,7 @@ intelligent functionality like selection, slicing, etc.
.. ipython:: python
- rng = pd.date_range(start, end, freq="BM")
+ rng = pd.date_range(start, end, freq="BME")
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts.index
ts[:5].index
@@ -603,7 +597,7 @@ would include matching times on an included date:
dft = pd.DataFrame(
np.random.randn(100000, 1),
columns=["A"],
- index=pd.date_range("20130101", periods=100000, freq="T"),
+ index=pd.date_range("20130101", periods=100000, freq="min"),
)
dft
dft.loc["2013"]
@@ -641,7 +635,7 @@ We are stopping on the included end-point as it is part of the index:
np.random.randn(20, 1),
columns=["A"],
index=pd.MultiIndex.from_product(
- [pd.date_range("20130101", periods=10, freq="12H"), ["a", "b"]]
+ [pd.date_range("20130101", periods=10, freq="12h"), ["a", "b"]]
),
)
dft2
@@ -882,34 +876,34 @@ into ``freq`` keyword arguments. The available date offsets and associated frequ
:class:`~pandas.tseries.offsets.Week`, ``'W'``, "one week, optionally anchored on a day of the week"
:class:`~pandas.tseries.offsets.WeekOfMonth`, ``'WOM'``, "the x-th day of the y-th week of each month"
:class:`~pandas.tseries.offsets.LastWeekOfMonth`, ``'LWOM'``, "the x-th day of the last week of each month"
- :class:`~pandas.tseries.offsets.MonthEnd`, ``'M'``, "calendar month end"
+ :class:`~pandas.tseries.offsets.MonthEnd`, ``'ME'``, "calendar month end"
:class:`~pandas.tseries.offsets.MonthBegin`, ``'MS'``, "calendar month begin"
- :class:`~pandas.tseries.offsets.BMonthEnd` or :class:`~pandas.tseries.offsets.BusinessMonthEnd`, ``'BM'``, "business month end"
+ :class:`~pandas.tseries.offsets.BMonthEnd` or :class:`~pandas.tseries.offsets.BusinessMonthEnd`, ``'BME'``, "business month end"
:class:`~pandas.tseries.offsets.BMonthBegin` or :class:`~pandas.tseries.offsets.BusinessMonthBegin`, ``'BMS'``, "business month begin"
- :class:`~pandas.tseries.offsets.CBMonthEnd` or :class:`~pandas.tseries.offsets.CustomBusinessMonthEnd`, ``'CBM'``, "custom business month end"
+ :class:`~pandas.tseries.offsets.CBMonthEnd` or :class:`~pandas.tseries.offsets.CustomBusinessMonthEnd`, ``'CBME'``, "custom business month end"
:class:`~pandas.tseries.offsets.CBMonthBegin` or :class:`~pandas.tseries.offsets.CustomBusinessMonthBegin`, ``'CBMS'``, "custom business month begin"
- :class:`~pandas.tseries.offsets.SemiMonthEnd`, ``'SM'``, "15th (or other day_of_month) and calendar month end"
+ :class:`~pandas.tseries.offsets.SemiMonthEnd`, ``'SME'``, "15th (or other day_of_month) and calendar month end"
:class:`~pandas.tseries.offsets.SemiMonthBegin`, ``'SMS'``, "15th (or other day_of_month) and calendar month begin"
- :class:`~pandas.tseries.offsets.QuarterEnd`, ``'Q'``, "calendar quarter end"
+ :class:`~pandas.tseries.offsets.QuarterEnd`, ``'QE'``, "calendar quarter end"
:class:`~pandas.tseries.offsets.QuarterBegin`, ``'QS'``, "calendar quarter begin"
- :class:`~pandas.tseries.offsets.BQuarterEnd`, ``'BQ``, "business quarter end"
+ :class:`~pandas.tseries.offsets.BQuarterEnd`, ``'BQE'``, "business quarter end"
:class:`~pandas.tseries.offsets.BQuarterBegin`, ``'BQS'``, "business quarter begin"
:class:`~pandas.tseries.offsets.FY5253Quarter`, ``'REQ'``, "retail (aka 52-53 week) quarter"
- :class:`~pandas.tseries.offsets.YearEnd`, ``'A'``, "calendar year end"
- :class:`~pandas.tseries.offsets.YearBegin`, ``'AS'`` or ``'BYS'``,"calendar year begin"
- :class:`~pandas.tseries.offsets.BYearEnd`, ``'BA'``, "business year end"
- :class:`~pandas.tseries.offsets.BYearBegin`, ``'BAS'``, "business year begin"
+ :class:`~pandas.tseries.offsets.YearEnd`, ``'YE'``, "calendar year end"
+ :class:`~pandas.tseries.offsets.YearBegin`, ``'YS'``, "calendar year begin"
+ :class:`~pandas.tseries.offsets.BYearEnd`, ``'BYE'``, "business year end"
+ :class:`~pandas.tseries.offsets.BYearBegin`, ``'BYS'``, "business year begin"
:class:`~pandas.tseries.offsets.FY5253`, ``'RE'``, "retail (aka 52-53 week) year"
:class:`~pandas.tseries.offsets.Easter`, None, "Easter holiday"
- :class:`~pandas.tseries.offsets.BusinessHour`, ``'BH'``, "business hour"
- :class:`~pandas.tseries.offsets.CustomBusinessHour`, ``'CBH'``, "custom business hour"
+ :class:`~pandas.tseries.offsets.BusinessHour`, ``'bh'``, "business hour"
+ :class:`~pandas.tseries.offsets.CustomBusinessHour`, ``'cbh'``, "custom business hour"
:class:`~pandas.tseries.offsets.Day`, ``'D'``, "one absolute day"
- :class:`~pandas.tseries.offsets.Hour`, ``'H'``, "one hour"
- :class:`~pandas.tseries.offsets.Minute`, ``'T'`` or ``'min'``,"one minute"
- :class:`~pandas.tseries.offsets.Second`, ``'S'``, "one second"
- :class:`~pandas.tseries.offsets.Milli`, ``'L'`` or ``'ms'``, "one millisecond"
- :class:`~pandas.tseries.offsets.Micro`, ``'U'`` or ``'us'``, "one microsecond"
- :class:`~pandas.tseries.offsets.Nano`, ``'N'``, "one nanosecond"
+ :class:`~pandas.tseries.offsets.Hour`, ``'h'``, "one hour"
+ :class:`~pandas.tseries.offsets.Minute`, ``'min'``,"one minute"
+ :class:`~pandas.tseries.offsets.Second`, ``'s'``, "one second"
+ :class:`~pandas.tseries.offsets.Milli`, ``'ms'``, "one millisecond"
+ :class:`~pandas.tseries.offsets.Micro`, ``'us'``, "one microsecond"
+ :class:`~pandas.tseries.offsets.Nano`, ``'ns'``, "one nanosecond"
``DateOffsets`` additionally have :meth:`rollforward` and :meth:`rollback`
methods for moving a date forward or backward respectively to a valid offset
@@ -1246,29 +1240,36 @@ frequencies. We will refer to these aliases as *offset aliases*.
"C", "custom business day frequency"
"D", "calendar day frequency"
"W", "weekly frequency"
- "M", "month end frequency"
- "SM", "semi-month end frequency (15th and end of month)"
- "BM", "business month end frequency"
- "CBM", "custom business month end frequency"
+ "ME", "month end frequency"
+ "SME", "semi-month end frequency (15th and end of month)"
+ "BME", "business month end frequency"
+ "CBME", "custom business month end frequency"
"MS", "month start frequency"
"SMS", "semi-month start frequency (1st and 15th)"
"BMS", "business month start frequency"
"CBMS", "custom business month start frequency"
- "Q", "quarter end frequency"
- "BQ", "business quarter end frequency"
+ "QE", "quarter end frequency"
+ "BQE", "business quarter end frequency"
"QS", "quarter start frequency"
"BQS", "business quarter start frequency"
- "A, Y", "year end frequency"
- "BA, BY", "business year end frequency"
- "AS, YS", "year start frequency"
- "BAS, BYS", "business year start frequency"
- "BH", "business hour frequency"
- "H", "hourly frequency"
- "T, min", "minutely frequency"
- "S", "secondly frequency"
- "L, ms", "milliseconds"
- "U, us", "microseconds"
- "N", "nanoseconds"
+ "YE", "year end frequency"
+ "BYE", "business year end frequency"
+ "YS", "year start frequency"
+ "BYS", "business year start frequency"
+ "h", "hourly frequency"
+ "bh", "business hour frequency"
+ "cbh", "custom business hour frequency"
+ "min", "minutely frequency"
+ "s", "secondly frequency"
+ "ms", "milliseconds"
+ "us", "microseconds"
+ "ns", "nanoseconds"
+
+.. deprecated:: 2.2.0
+
+ Aliases ``H``, ``BH``, ``CBH``, ``T``, ``S``, ``L``, ``U``, and ``N``
+ are deprecated in favour of the aliases ``h``, ``bh``, ``cbh``,
+ ``min``, ``s``, ``ms``, ``us``, and ``ns``.
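+
+For example, a brief sketch with the lowercase aliases, which work anywhere a
+frequency string is accepted:
+
+.. ipython:: python
+
+ pd.date_range("2024-01-01", periods=3, freq="h")
+ pd.date_range("2024-01-01", periods=4, freq="15min")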
.. note::
@@ -1316,13 +1317,18 @@ frequencies. We will refer to these aliases as *period aliases*.
"W", "weekly frequency"
"M", "monthly frequency"
"Q", "quarterly frequency"
- "A, Y", "yearly frequency"
- "H", "hourly frequency"
- "T, min", "minutely frequency"
- "S", "secondly frequency"
- "L, ms", "milliseconds"
- "U, us", "microseconds"
- "N", "nanoseconds"
+ "Y", "yearly frequency"
+ "h", "hourly frequency"
+ "min", "minutely frequency"
+ "s", "secondly frequency"
+ "ms", "milliseconds"
+ "us", "microseconds"
+ "ns", "nanoseconds"
+
+.. deprecated:: 2.2.0
+
+ Aliases ``A``, ``H``, ``T``, ``S``, ``L``, ``U``, and ``N`` are deprecated in favour of the aliases
+ ``Y``, ``h``, ``min``, ``s``, ``ms``, ``us``, and ``ns``.
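+
+A short sketch with the updated period aliases:
+
+.. ipython:: python
+
+ pd.Period("2024", freq="Y")
+ pd.period_range("2024-01-01 09:00", periods=3, freq="h")
+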
Combining aliases
@@ -1343,7 +1349,7 @@ You can combine together day and intraday offsets:
pd.date_range(start, periods=10, freq="2h20min")
- pd.date_range(start, periods=10, freq="1D10U")
+ pd.date_range(start, periods=10, freq="1D10us")
Anchored offsets
~~~~~~~~~~~~~~~~
@@ -1361,30 +1367,30 @@ For some frequencies you can specify an anchoring suffix:
"W\-THU", "weekly frequency (Thursdays)"
"W\-FRI", "weekly frequency (Fridays)"
"W\-SAT", "weekly frequency (Saturdays)"
- "(B)Q(S)\-DEC", "quarterly frequency, year ends in December. Same as 'Q'"
- "(B)Q(S)\-JAN", "quarterly frequency, year ends in January"
- "(B)Q(S)\-FEB", "quarterly frequency, year ends in February"
- "(B)Q(S)\-MAR", "quarterly frequency, year ends in March"
- "(B)Q(S)\-APR", "quarterly frequency, year ends in April"
- "(B)Q(S)\-MAY", "quarterly frequency, year ends in May"
- "(B)Q(S)\-JUN", "quarterly frequency, year ends in June"
- "(B)Q(S)\-JUL", "quarterly frequency, year ends in July"
- "(B)Q(S)\-AUG", "quarterly frequency, year ends in August"
- "(B)Q(S)\-SEP", "quarterly frequency, year ends in September"
- "(B)Q(S)\-OCT", "quarterly frequency, year ends in October"
- "(B)Q(S)\-NOV", "quarterly frequency, year ends in November"
- "(B)A(S)\-DEC", "annual frequency, anchored end of December. Same as 'A'"
- "(B)A(S)\-JAN", "annual frequency, anchored end of January"
- "(B)A(S)\-FEB", "annual frequency, anchored end of February"
- "(B)A(S)\-MAR", "annual frequency, anchored end of March"
- "(B)A(S)\-APR", "annual frequency, anchored end of April"
- "(B)A(S)\-MAY", "annual frequency, anchored end of May"
- "(B)A(S)\-JUN", "annual frequency, anchored end of June"
- "(B)A(S)\-JUL", "annual frequency, anchored end of July"
- "(B)A(S)\-AUG", "annual frequency, anchored end of August"
- "(B)A(S)\-SEP", "annual frequency, anchored end of September"
- "(B)A(S)\-OCT", "annual frequency, anchored end of October"
- "(B)A(S)\-NOV", "annual frequency, anchored end of November"
+ "(B)Q(E)(S)\-DEC", "quarterly frequency, year ends in December. Same as 'QE'"
+ "(B)Q(E)(S)\-JAN", "quarterly frequency, year ends in January"
+ "(B)Q(E)(S)\-FEB", "quarterly frequency, year ends in February"
+ "(B)Q(E)(S)\-MAR", "quarterly frequency, year ends in March"
+ "(B)Q(E)(S)\-APR", "quarterly frequency, year ends in April"
+ "(B)Q(E)(S)\-MAY", "quarterly frequency, year ends in May"
+ "(B)Q(E)(S)\-JUN", "quarterly frequency, year ends in June"
+ "(B)Q(E)(S)\-JUL", "quarterly frequency, year ends in July"
+ "(B)Q(E)(S)\-AUG", "quarterly frequency, year ends in August"
+ "(B)Q(E)(S)\-SEP", "quarterly frequency, year ends in September"
+ "(B)Q(E)(S)\-OCT", "quarterly frequency, year ends in October"
+ "(B)Q(E)(S)\-NOV", "quarterly frequency, year ends in November"
+ "(B)Y(E)(S)\-DEC", "annual frequency, anchored end of December. Same as 'YE'"
+ "(B)Y(E)(S)\-JAN", "annual frequency, anchored end of January"
+ "(B)Y(E)(S)\-FEB", "annual frequency, anchored end of February"
+ "(B)Y(E)(S)\-MAR", "annual frequency, anchored end of March"
+ "(B)Y(E)(S)\-APR", "annual frequency, anchored end of April"
+ "(B)Y(E)(S)\-MAY", "annual frequency, anchored end of May"
+ "(B)Y(E)(S)\-JUN", "annual frequency, anchored end of June"
+ "(B)Y(E)(S)\-JUL", "annual frequency, anchored end of July"
+ "(B)Y(E)(S)\-AUG", "annual frequency, anchored end of August"
+ "(B)Y(E)(S)\-SEP", "annual frequency, anchored end of September"
+ "(B)Y(E)(S)\-OCT", "annual frequency, anchored end of October"
+ "(B)Y(E)(S)\-NOV", "annual frequency, anchored end of November"
These can be used as arguments to ``date_range``, ``bdate_range``, constructors
for ``DatetimeIndex``, as well as various other timeseries-related functions
@@ -1574,7 +1580,7 @@ rather than changing the alignment of the data and the index:
ts.shift(5, freq="D")
ts.shift(5, freq=pd.offsets.BDay())
- ts.shift(5, freq="BM")
+ ts.shift(5, freq="BME")
Note that when ``freq`` is specified, the leading entry is no longer NaN
because the data is not being realigned.
@@ -1635,7 +1641,7 @@ Basics
.. ipython:: python
- rng = pd.date_range("1/1/2012", periods=100, freq="S")
+ rng = pd.date_range("1/1/2012", periods=100, freq="s")
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
@@ -1680,7 +1686,7 @@ the end of the interval.
.. warning::
The default values for ``label`` and ``closed`` is '**left**' for all
- frequency offsets except for 'M', 'A', 'Q', 'BM', 'BA', 'BQ', and 'W'
+ frequency offsets except for 'ME', 'YE', 'QE', 'BME', 'BYE', 'BQE', and 'W'
which all have a default of 'right'.
This might unintendedly lead to looking ahead, where the value for a later
@@ -1725,11 +1731,11 @@ For upsampling, you can specify a way to upsample and the ``limit`` parameter to
# from secondly to every 250 milliseconds
- ts[:2].resample("250L").asfreq()
+ ts[:2].resample("250ms").asfreq()
- ts[:2].resample("250L").ffill()
+ ts[:2].resample("250ms").ffill()
- ts[:2].resample("250L").ffill(limit=2)
+ ts[:2].resample("250ms").ffill(limit=2)
Sparse resampling
~~~~~~~~~~~~~~~~~
@@ -1752,7 +1758,7 @@ If we want to resample to the full range of the series:
.. ipython:: python
- ts.resample("3T").sum()
+ ts.resample("3min").sum()
We can instead only resample those groups where we have points as follows:
@@ -1764,9 +1770,10 @@ We can instead only resample those groups where we have points as follows:
def round(t, freq):
# round a Timestamp to a specified freq
freq = to_offset(freq)
- return pd.Timestamp((t.value // freq.delta.value) * freq.delta.value)
+ td = pd.Timedelta(freq)
+ return pd.Timestamp((t.value // td.value) * td.value)
- ts.groupby(partial(round, freq="3T")).sum()
+ ts.groupby(partial(round, freq="3min")).sum()
.. _timeseries.aggregate:
@@ -1783,10 +1790,10 @@ Resampling a ``DataFrame``, the default will be to act on all columns with the s
df = pd.DataFrame(
np.random.randn(1000, 3),
- index=pd.date_range("1/1/2012", freq="S", periods=1000),
+ index=pd.date_range("1/1/2012", freq="s", periods=1000),
columns=["A", "B", "C"],
)
- r = df.resample("3T")
+ r = df.resample("3min")
r.mean()
We can select a specific column or columns using standard getitem.
@@ -1846,7 +1853,7 @@ to resample based on datetimelike column in the frame, it can passed to the
),
)
df
- df.resample("M", on="date")[["a"]].sum()
+ df.resample("ME", on="date")[["a"]].sum()
Similarly, if you instead want to resample by a datetimelike
level of ``MultiIndex``, its name or location can be passed to the
@@ -1854,7 +1861,7 @@ level of ``MultiIndex``, its name or location can be passed to the
.. ipython:: python
- df.resample("M", level="d")[["a"]].sum()
+ df.resample("ME", level="d")[["a"]].sum()
.. _timeseries.iterating-label:
@@ -1879,7 +1886,7 @@ natural and functions similarly to :py:func:`itertools.groupby`:
]
),
)
- resampled = small.resample("H")
+ resampled = small.resample("h")
for name, group in resampled:
print("Group: ", name)
@@ -1985,20 +1992,20 @@ Because ``freq`` represents a span of ``Period``, it cannot be negative like "-3
.. ipython:: python
- pd.Period("2012", freq="A-DEC")
+ pd.Period("2012", freq="Y-DEC")
pd.Period("2012-1-1", freq="D")
- pd.Period("2012-1-1 19:00", freq="H")
+ pd.Period("2012-1-1 19:00", freq="h")
- pd.Period("2012-1-1 19:00", freq="5H")
+ pd.Period("2012-1-1 19:00", freq="5h")
Adding and subtracting integers from periods shifts the period by its own
frequency. Arithmetic is not allowed between ``Period`` with different ``freq`` (span).
.. ipython:: python
- p = pd.Period("2012", freq="A-DEC")
+ p = pd.Period("2012", freq="Y-DEC")
p + 1
p - 3
p = pd.Period("2012-01", freq="2M")
@@ -2007,11 +2014,11 @@ frequency. Arithmetic is not allowed between ``Period`` with different ``freq``
p == pd.Period("2012-01", freq="3M")
-If ``Period`` freq is daily or higher (``D``, ``H``, ``T``, ``S``, ``L``, ``U``, ``N``), ``offsets`` and ``timedelta``-like can be added if the result can have the same freq. Otherwise, ``ValueError`` will be raised.
+If ``Period`` freq is daily or higher (``D``, ``h``, ``min``, ``s``, ``ms``, ``us``, and ``ns``), ``offsets`` and ``timedelta``-like can be added if the result can have the same freq. Otherwise, ``ValueError`` will be raised.
.. ipython:: python
- p = pd.Period("2014-07-01 09:00", freq="H")
+ p = pd.Period("2014-07-01 09:00", freq="h")
p + pd.offsets.Hour(2)
p + datetime.timedelta(minutes=120)
p + np.timedelta64(7200, "s")
@@ -2040,7 +2047,7 @@ return the number of frequency units between them:
.. ipython:: python
- pd.Period("2012", freq="A-DEC") - pd.Period("2002", freq="A-DEC")
+ pd.Period("2012", freq="Y-DEC") - pd.Period("2002", freq="Y-DEC")
PeriodIndex and period_range
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -2087,7 +2094,7 @@ objects:
.. ipython:: python
- idx = pd.period_range("2014-07-01 09:00", periods=5, freq="H")
+ idx = pd.period_range("2014-07-01 09:00", periods=5, freq="h")
idx
idx + pd.offsets.Hour(2)
@@ -2127,7 +2134,7 @@ The ``period`` dtype can be used in ``.astype(...)``. It allows one to change th
pi.astype("datetime64[ns]")
# convert to PeriodIndex
- dti = pd.date_range("2011-01-01", freq="M", periods=3)
+ dti = pd.date_range("2011-01-01", freq="ME", periods=3)
dti
dti.astype("period[M]")
@@ -2155,16 +2162,16 @@ Passing a string representing a lower frequency than ``PeriodIndex`` returns par
dfp = pd.DataFrame(
np.random.randn(600, 1),
columns=["A"],
- index=pd.period_range("2013-01-01 9:00", periods=600, freq="T"),
+ index=pd.period_range("2013-01-01 9:00", periods=600, freq="min"),
)
dfp
- dfp.loc["2013-01-01 10H"]
+ dfp.loc["2013-01-01 10h"]
As with ``DatetimeIndex``, the endpoints will be included in the result. The example below slices data starting from 10:00 to 11:59.
.. ipython:: python
- dfp["2013-01-01 10H":"2013-01-01 11H"]
+ dfp["2013-01-01 10h":"2013-01-01 11h"]
Frequency conversion and resampling with PeriodIndex
@@ -2174,7 +2181,7 @@ method. Let's start with the fiscal year 2011, ending in December:
.. ipython:: python
- p = pd.Period("2011", freq="A-DEC")
+ p = pd.Period("2011", freq="Y-DEC")
p
We can convert it to a monthly frequency. Using the ``how`` parameter, we can
@@ -2201,10 +2208,10 @@ input period:
p = pd.Period("2011-12", freq="M")
- p.asfreq("A-NOV")
+ p.asfreq("Y-NOV")
Note that since we converted to an annual frequency that ends the year in
-November, the monthly period of December 2011 is actually in the 2012 A-NOV
+November, the monthly period of December 2011 is actually in the 2012 Y-NOV
period.
.. _timeseries.quarterly:
@@ -2246,7 +2253,7 @@ and vice-versa using ``to_timestamp``:
.. ipython:: python
- rng = pd.date_range("1/1/2012", periods=5, freq="M")
+ rng = pd.date_range("1/1/2012", periods=5, freq="ME")
ts = pd.Series(np.random.randn(len(rng)), index=rng)
@@ -2276,7 +2283,7 @@ the quarter end:
ts = pd.Series(np.random.randn(len(prng)), prng)
- ts.index = (prng.asfreq("M", "e") + 1).asfreq("H", "s") + 9
+ ts.index = (prng.asfreq("M", "e") + 1).asfreq("h", "s") + 9
ts.head()
@@ -2495,7 +2502,7 @@ To remove time zone information, use ``tz_localize(None)`` or ``tz_convert(None)
.. ipython:: python
- didx = pd.date_range(start="2014-08-01 09:00", freq="H", periods=3, tz="US/Eastern")
+ didx = pd.date_range(start="2014-08-01 09:00", freq="h", periods=3, tz="US/Eastern")
didx
didx.tz_localize(None)
didx.tz_convert(None)
@@ -2592,7 +2599,7 @@ can be controlled by the ``nonexistent`` argument. The following options are ava
.. ipython:: python
- dti = pd.date_range(start="2015-03-29 02:30:00", periods=3, freq="H")
+ dti = pd.date_range(start="2015-03-29 02:30:00", periods=3, freq="h")
# 2:30 is a nonexistent time
Localization of nonexistent times will raise an error by default.
@@ -2609,7 +2616,7 @@ Transform nonexistent times to ``NaT`` or shift the times.
dti
dti.tz_localize("Europe/Warsaw", nonexistent="shift_forward")
dti.tz_localize("Europe/Warsaw", nonexistent="shift_backward")
- dti.tz_localize("Europe/Warsaw", nonexistent=pd.Timedelta(1, unit="H"))
+ dti.tz_localize("Europe/Warsaw", nonexistent=pd.Timedelta(1, unit="h"))
dti.tz_localize("Europe/Warsaw", nonexistent="NaT")
diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst
index fc225d0f44497..ec024f36d78b1 100644
--- a/doc/source/whatsnew/index.rst
+++ b/doc/source/whatsnew/index.rst
@@ -10,12 +10,24 @@ This is the list of changes to pandas between each release. For full details,
see the `commit logs `_. For install and
upgrade instructions, see :ref:`install`.
+Version 2.2
+-----------
+
+.. toctree::
+ :maxdepth: 2
+
+ v2.2.0
+
Version 2.1
-----------
.. toctree::
:maxdepth: 2
+ v2.1.4
+ v2.1.3
+ v2.1.2
+ v2.1.1
v2.1.0
Version 2.0
diff --git a/doc/source/whatsnew/v0.10.0.rst b/doc/source/whatsnew/v0.10.0.rst
index 3425986a37743..be50c34d7d14c 100644
--- a/doc/source/whatsnew/v0.10.0.rst
+++ b/doc/source/whatsnew/v0.10.0.rst
@@ -180,19 +180,36 @@ labeled the aggregated group with the end of the interval: the next day).
DataFrame constructor with no columns specified. The v0.9.0 behavior (names
``X0``, ``X1``, ...) can be reproduced by specifying ``prefix='X'``:
-.. ipython:: python
- :okexcept:
+.. code-block:: ipython
+
+ In [6]: import io
+
+ In [7]: data = """
+ ...: a,b,c
+ ...: 1,Yes,2
+ ...: 3,No,4
+ ...: """
+ ...:
+
+ In [8]: print(data)
+
+ a,b,c
+ 1,Yes,2
+ 3,No,4
- import io
+ In [9]: pd.read_csv(io.StringIO(data), header=None)
+ Out[9]:
+ 0 1 2
+ 0 a b c
+ 1 1 Yes 2
+ 2 3 No 4
- data = """
- a,b,c
- 1,Yes,2
- 3,No,4
- """
- print(data)
- pd.read_csv(io.StringIO(data), header=None)
- pd.read_csv(io.StringIO(data), header=None, prefix="X")
+ In [10]: pd.read_csv(io.StringIO(data), header=None, prefix="X")
+ Out[10]:
+ X0 X1 X2
+ 0 a b c
+ 1 1 Yes 2
+ 2 3 No 4
- Values like ``'Yes'`` and ``'No'`` are not interpreted as boolean by default,
though this can be controlled by new ``true_values`` and ``false_values``
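As a brief illustration of those keywords (a hedged sketch, not taken from the file; a column whose values are all covered is parsed as boolean):

.. code-block:: python

   import io
   import pandas as pd

   data = "a,b,c\n1,Yes,2\n3,No,4\n"
   # map the strings to booleans while parsing
   pd.read_csv(io.StringIO(data), true_values=["Yes"], false_values=["No"])
   #    a      b  c
   # 0  1   True  2
   # 1  3  False  4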
@@ -244,26 +261,15 @@ Convenience methods ``ffill`` and ``bfill`` have been added:
function, that is itself a series, and possibly upcast the result to a
DataFrame
- .. code-block:: python
-
- >>> def f(x):
- ... return pd.Series([x, x ** 2], index=["x", "x^2"])
- >>>
- >>> s = pd.Series(np.random.rand(5))
- >>> s
- 0 0.340445
- 1 0.984729
- 2 0.919540
- 3 0.037772
- 4 0.861549
- dtype: float64
- >>> s.apply(f)
- x x^2
- 0 0.340445 0.115903
- 1 0.984729 0.969691
- 2 0.919540 0.845555
- 3 0.037772 0.001427
- 4 0.861549 0.742267
+ .. ipython:: python
+
+ def f(x):
+ return pd.Series([x, x ** 2], index=["x", "x^2"])
+
+
+ s = pd.Series(np.random.rand(5))
+ s
+ s.apply(f)
- New API functions for working with pandas options (:issue:`2097`):
diff --git a/doc/source/whatsnew/v0.11.0.rst b/doc/source/whatsnew/v0.11.0.rst
index 33f83c272b23d..f05cbc7f07d7d 100644
--- a/doc/source/whatsnew/v0.11.0.rst
+++ b/doc/source/whatsnew/v0.11.0.rst
@@ -347,7 +347,7 @@ Enhancements
.. ipython:: python
df = pd.DataFrame({'A': range(5), 'B': range(5)})
- df.to_hdf('store.h5', 'table', append=True)
+ df.to_hdf('store.h5', key='table', append=True)
pd.read_hdf('store.h5', 'table', where=['index > 2'])
.. ipython:: python
@@ -367,15 +367,27 @@ Enhancements
- You can now select with a string from a DataFrame with a datelike index, in a similar way to a Series (:issue:`3070`)
- .. ipython:: python
- :okexcept:
+ .. code-block:: ipython
- idx = pd.date_range("2001-10-1", periods=5, freq='M')
- ts = pd.Series(np.random.rand(len(idx)), index=idx)
- ts['2001']
+ In [30]: idx = pd.date_range("2001-10-1", periods=5, freq='M')
- df = pd.DataFrame({'A': ts})
- df['2001']
+ In [31]: ts = pd.Series(np.random.rand(len(idx)), index=idx)
+
+ In [32]: ts['2001']
+ Out[32]:
+ 2001-10-31 0.117967
+ 2001-11-30 0.702184
+ 2001-12-31 0.414034
+ Freq: M, dtype: float64
+
+ In [33]: df = pd.DataFrame({'A': ts})
+
+ In [34]: df['2001']
+ Out[34]:
+ A
+ 2001-10-31 0.117967
+ 2001-11-30 0.702184
+ 2001-12-31 0.414034
- ``Squeeze`` to possibly remove length 1 dimensions from an object.
diff --git a/doc/source/whatsnew/v0.12.0.rst b/doc/source/whatsnew/v0.12.0.rst
index 091efe1b421c7..59d104cb3e96c 100644
--- a/doc/source/whatsnew/v0.12.0.rst
+++ b/doc/source/whatsnew/v0.12.0.rst
@@ -250,9 +250,9 @@ IO enhancements
.. ipython:: python
- from pandas._testing import makeCustomDataframe as mkdf
-
- df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
+ mi_idx = pd.MultiIndex.from_arrays([[1, 2, 3, 4], list("abcd")], names=list("ab"))
+ mi_col = pd.MultiIndex.from_arrays([[1, 2], list("ab")], names=list("cd"))
+ df = pd.DataFrame(np.ones((4, 2)), index=mi_idx, columns=mi_col)
df.to_csv("mi.csv")
print(open("mi.csv").read())
pd.read_csv("mi.csv", header=[0, 1, 2, 3], index_col=[0, 1])
diff --git a/doc/source/whatsnew/v0.13.0.rst b/doc/source/whatsnew/v0.13.0.rst
index bfabfb1a27e73..f2e29121760ab 100644
--- a/doc/source/whatsnew/v0.13.0.rst
+++ b/doc/source/whatsnew/v0.13.0.rst
@@ -385,7 +385,7 @@ HDFStore API changes
dfq = pd.DataFrame(np.random.randn(10, 4),
columns=list('ABCD'),
index=pd.date_range('20130101', periods=10))
- dfq.to_hdf(path, 'dfq', format='table', data_columns=True)
+ dfq.to_hdf(path, key='dfq', format='table', data_columns=True)
Use boolean expressions, with in-line function evaluation.
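The query example itself falls outside this hunk; a hedged sketch of the kind of ``where`` expression the table-format store written above accepts (the exact query in the file may differ):

.. code-block:: python

   # select rows by an expression evaluated against the index,
   # with pd.Timestamp(...) evaluated in-line
   pd.read_hdf(path, "dfq", where="index > pd.Timestamp('20130104')")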
@@ -415,9 +415,9 @@ HDFStore API changes
path = 'test.h5'
df = pd.DataFrame(np.random.randn(10, 2))
- df.to_hdf(path, 'df_table', format='table')
- df.to_hdf(path, 'df_table2', append=True)
- df.to_hdf(path, 'df_fixed')
+ df.to_hdf(path, key='df_table', format='table')
+ df.to_hdf(path, key='df_table2', append=True)
+ df.to_hdf(path, key='df_fixed')
with pd.HDFStore(path) as store:
print(store)
@@ -537,7 +537,6 @@ Enhancements
is frequency conversion. See :ref:`the docs` for the docs.
.. ipython:: python
- :okexcept:
import datetime
td = pd.Series(pd.date_range('20130101', periods=4)) - pd.Series(
@@ -546,13 +545,41 @@ Enhancements
td[3] = np.nan
td
+ .. code-block:: ipython
+
# to days
- td / np.timedelta64(1, 'D')
- td.astype('timedelta64[D]')
+ In [63]: td / np.timedelta64(1, 'D')
+ Out[63]:
+ 0 31.000000
+ 1 31.000000
+ 2 31.003507
+ 3 NaN
+ dtype: float64
+
+ In [64]: td.astype('timedelta64[D]')
+ Out[64]:
+ 0 31.0
+ 1 31.0
+ 2 31.0
+ 3 NaN
+ dtype: float64
# to seconds
- td / np.timedelta64(1, 's')
- td.astype('timedelta64[s]')
+ In [65]: td / np.timedelta64(1, 's')
+ Out[65]:
+ 0 2678400.0
+ 1 2678400.0
+ 2 2678703.0
+ 3 NaN
+ dtype: float64
+
+ In [66]: td.astype('timedelta64[s]')
+ Out[66]:
+ 0 2678400.0
+ 1 2678400.0
+ 2 2678703.0
+ 3 NaN
+ dtype: float64
Dividing or multiplying a ``timedelta64[ns]`` Series by an integer or integer Series
@@ -642,9 +669,16 @@ Enhancements
Period conversions in the range of seconds and below were reworked and extended
up to nanoseconds. Periods in the nanosecond range are now available.
- .. ipython:: python
+ .. code-block:: ipython
- pd.date_range('2013-01-01', periods=5, freq='5N')
+ In [79]: pd.date_range('2013-01-01', periods=5, freq='5N')
+ Out[79]:
+ DatetimeIndex([ '2013-01-01 00:00:00',
+ '2013-01-01 00:00:00.000000005',
+ '2013-01-01 00:00:00.000000010',
+ '2013-01-01 00:00:00.000000015',
+ '2013-01-01 00:00:00.000000020'],
+ dtype='datetime64[ns]', freq='5N')
or with frequency as offset
diff --git a/doc/source/whatsnew/v0.13.1.rst b/doc/source/whatsnew/v0.13.1.rst
index 6a9ade92a8b1d..8c85868e1aedb 100644
--- a/doc/source/whatsnew/v0.13.1.rst
+++ b/doc/source/whatsnew/v0.13.1.rst
@@ -29,11 +29,10 @@ Highlights include:
This would previously segfault:
- .. ipython:: python
+ .. code-block:: python
df = pd.DataFrame({"A": np.array(["foo", "bar", "bah", "foo", "bar"])})
df["A"].iloc[0] = np.nan
- df
The recommended way to do this type of assignment is:
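The follow-up snippet sits outside the hunk; the recommended spelling is a single label-based assignment with ``.loc``, roughly:

.. code-block:: python

   import numpy as np
   import pandas as pd

   df = pd.DataFrame({"A": np.array(["foo", "bar", "bah", "foo", "bar"])})
   # one .loc call instead of chained indexing, so the write hits the frame itself
   df.loc[0, "A"] = np.nan
   df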
diff --git a/doc/source/whatsnew/v0.14.0.rst b/doc/source/whatsnew/v0.14.0.rst
index 92c37243b7e81..05831197936eb 100644
--- a/doc/source/whatsnew/v0.14.0.rst
+++ b/doc/source/whatsnew/v0.14.0.rst
@@ -328,19 +328,36 @@ More consistent behavior for some groupby methods:
- groupby ``head`` and ``tail`` now act more like ``filter`` rather than an aggregation:
- .. ipython:: python
+ .. code-block:: ipython
- df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
- g = df.groupby('A')
- g.head(1) # filters DataFrame
+ In [1]: df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
- g.apply(lambda x: x.head(1)) # used to simply fall-through
+ In [2]: g = df.groupby('A')
+
+ In [3]: g.head(1) # filters DataFrame
+ Out[3]:
+ A B
+ 0 1 2
+ 2 5 6
+
+ In [4]: g.apply(lambda x: x.head(1)) # used to simply fall-through
+ Out[4]:
+ A B
+ A
+ 1 0 1 2
+ 5 2 5 6
- groupby head and tail respect column selection:
- .. ipython:: python
+ .. code-block:: ipython
+
+ In [19]: g[['B']].head(1)
+ Out[19]:
+ B
+ 0 2
+ 2 6
- g[['B']].head(1)
+ [2 rows x 1 columns]
- groupby ``nth`` now reduces by default; filtering can be achieved by passing ``as_index=False``. With an optional ``dropna`` argument to ignore
NaN. See :ref:`the docs `.
@@ -843,22 +860,61 @@ Enhancements
datetime.datetime(2013, 9, 5, 10, 0)]})
df
- df.pivot_table(values='Quantity',
- index=pd.Grouper(freq='M', key='Date'),
- columns=pd.Grouper(freq='M', key='PayDay'),
- aggfunc="sum")
+ .. code-block:: ipython
+
+ In [75]: df.pivot_table(values='Quantity',
+ ....: index=pd.Grouper(freq='M', key='Date'),
+ ....: columns=pd.Grouper(freq='M', key='PayDay'),
+ ....: aggfunc="sum")
+ Out[75]:
+ PayDay 2013-09-30 2013-10-31 2013-11-30
+ Date
+ 2013-09-30 NaN 3.0 NaN
+ 2013-10-31 6.0 NaN 1.0
+ 2013-11-30 NaN 9.0 NaN
+
+ [3 rows x 3 columns]
- Arrays of strings can be wrapped to a specified width (``str.wrap``) (:issue:`6999`)
- Add :meth:`~Series.nsmallest` and :meth:`Series.nlargest` methods to Series, See :ref:`the docs ` (:issue:`3960`)
- ``PeriodIndex`` fully supports partial string indexing like ``DatetimeIndex`` (:issue:`7043`)
- .. ipython:: python
+ .. code-block:: ipython
- prng = pd.period_range('2013-01-01 09:00', periods=100, freq='H')
- ps = pd.Series(np.random.randn(len(prng)), index=prng)
- ps
- ps['2013-01-02']
+ In [76]: prng = pd.period_range('2013-01-01 09:00', periods=100, freq='H')
+
+ In [77]: ps = pd.Series(np.random.randn(len(prng)), index=prng)
+
+ In [78]: ps
+ Out[78]:
+ 2013-01-01 09:00 0.015696
+ 2013-01-01 10:00 -2.242685
+ 2013-01-01 11:00 1.150036
+ 2013-01-01 12:00 0.991946
+ 2013-01-01 13:00 0.953324
+ ...
+ 2013-01-05 08:00 0.285296
+ 2013-01-05 09:00 0.484288
+ 2013-01-05 10:00 1.363482
+ 2013-01-05 11:00 -0.781105
+ 2013-01-05 12:00 -0.468018
+ Freq: H, Length: 100, dtype: float64
+
+ In [79]: ps['2013-01-02']
+ Out[79]:
+ 2013-01-02 00:00 0.553439
+ 2013-01-02 01:00 1.318152
+ 2013-01-02 02:00 -0.469305
+ 2013-01-02 03:00 0.675554
+ 2013-01-02 04:00 -1.817027
+ ...
+ 2013-01-02 19:00 0.036142
+ 2013-01-02 20:00 -2.074978
+ 2013-01-02 21:00 0.247792
+ 2013-01-02 22:00 -0.897157
+ 2013-01-02 23:00 -0.136795
+ Freq: H, Length: 24, dtype: float64
- ``read_excel`` can now read milliseconds in Excel dates and times with xlrd >= 0.9.3. (:issue:`5945`)
- ``pd.stats.moments.rolling_var`` now uses Welford's method for increased numerical stability (:issue:`6817`)
diff --git a/doc/source/whatsnew/v0.15.0.rst b/doc/source/whatsnew/v0.15.0.rst
index 6b962cbb49c74..8dafed1efee97 100644
--- a/doc/source/whatsnew/v0.15.0.rst
+++ b/doc/source/whatsnew/v0.15.0.rst
@@ -185,7 +185,29 @@ Constructing a ``TimedeltaIndex`` with a regular range
.. ipython:: python
pd.timedelta_range('1 days', periods=5, freq='D')
- pd.timedelta_range(start='1 days', end='2 days', freq='30T')
+
+.. code-block:: ipython
+
+ In [20]: pd.timedelta_range(start='1 days', end='2 days', freq='30T')
+ Out[20]:
+ TimedeltaIndex(['1 days 00:00:00', '1 days 00:30:00', '1 days 01:00:00',
+ '1 days 01:30:00', '1 days 02:00:00', '1 days 02:30:00',
+ '1 days 03:00:00', '1 days 03:30:00', '1 days 04:00:00',
+ '1 days 04:30:00', '1 days 05:00:00', '1 days 05:30:00',
+ '1 days 06:00:00', '1 days 06:30:00', '1 days 07:00:00',
+ '1 days 07:30:00', '1 days 08:00:00', '1 days 08:30:00',
+ '1 days 09:00:00', '1 days 09:30:00', '1 days 10:00:00',
+ '1 days 10:30:00', '1 days 11:00:00', '1 days 11:30:00',
+ '1 days 12:00:00', '1 days 12:30:00', '1 days 13:00:00',
+ '1 days 13:30:00', '1 days 14:00:00', '1 days 14:30:00',
+ '1 days 15:00:00', '1 days 15:30:00', '1 days 16:00:00',
+ '1 days 16:30:00', '1 days 17:00:00', '1 days 17:30:00',
+ '1 days 18:00:00', '1 days 18:30:00', '1 days 19:00:00',
+ '1 days 19:30:00', '1 days 20:00:00', '1 days 20:30:00',
+ '1 days 21:00:00', '1 days 21:30:00', '1 days 22:00:00',
+ '1 days 22:30:00', '1 days 23:00:00', '1 days 23:30:00',
+ '2 days 00:00:00'],
+ dtype='timedelta64[ns]', freq='30T')
You can now use a ``TimedeltaIndex`` as the index of a pandas object
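For context, a minimal sketch (illustrative only, not taken from the file) of a ``TimedeltaIndex`` used as the index of a Series:

.. code-block:: python

   import numpy as np
   import pandas as pd

   s = pd.Series(np.arange(5), index=pd.timedelta_range("1 days", periods=5, freq="D"))
   s
   s["2 days":"4 days"]  # label-based slicing on the timedelta labels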
@@ -310,16 +332,37 @@ Timezone handling improvements
- ``tz_localize(None)`` for tz-aware ``Timestamp`` and ``DatetimeIndex`` now removes timezone holding local time,
previously this resulted in ``Exception`` or ``TypeError`` (:issue:`7812`)
- .. ipython:: python
+ .. code-block:: ipython
+
+ In [58]: ts = pd.Timestamp('2014-08-01 09:00', tz='US/Eastern')
- ts = pd.Timestamp('2014-08-01 09:00', tz='US/Eastern')
- ts
- ts.tz_localize(None)
+ In [59]: ts
+ Out[59]: Timestamp('2014-08-01 09:00:00-0400', tz='US/Eastern')
- didx = pd.date_range(start='2014-08-01 09:00', freq='H',
- periods=10, tz='US/Eastern')
- didx
- didx.tz_localize(None)
+ In [60]: ts.tz_localize(None)
+ Out[60]: Timestamp('2014-08-01 09:00:00')
+
+ In [61]: didx = pd.date_range(start='2014-08-01 09:00', freq='H',
+ ....: periods=10, tz='US/Eastern')
+ ....:
+
+ In [62]: didx
+ Out[62]:
+ DatetimeIndex(['2014-08-01 09:00:00-04:00', '2014-08-01 10:00:00-04:00',
+ '2014-08-01 11:00:00-04:00', '2014-08-01 12:00:00-04:00',
+ '2014-08-01 13:00:00-04:00', '2014-08-01 14:00:00-04:00',
+ '2014-08-01 15:00:00-04:00', '2014-08-01 16:00:00-04:00',
+ '2014-08-01 17:00:00-04:00', '2014-08-01 18:00:00-04:00'],
+ dtype='datetime64[ns, US/Eastern]', freq='H')
+
+ In [63]: didx.tz_localize(None)
+ Out[63]:
+ DatetimeIndex(['2014-08-01 09:00:00', '2014-08-01 10:00:00',
+ '2014-08-01 11:00:00', '2014-08-01 12:00:00',
+ '2014-08-01 13:00:00', '2014-08-01 14:00:00',
+ '2014-08-01 15:00:00', '2014-08-01 16:00:00',
+ '2014-08-01 17:00:00', '2014-08-01 18:00:00'],
+ dtype='datetime64[ns]', freq=None)
- ``tz_localize`` now accepts the ``ambiguous`` keyword which allows for passing an array of bools
indicating whether the date belongs in DST or not, 'NaT' for setting transition times to NaT,
@@ -1028,16 +1071,35 @@ Other:
If ``Period`` freq is ``D``, ``H``, ``T``, ``S``, ``L``, ``U``, ``N``, ``Timedelta``-like can be added if the result can have same freq. Otherwise, only the same ``offsets`` can be added.
- .. ipython:: python
+ .. code-block:: ipython
- idx = pd.period_range('2014-07-01 09:00', periods=5, freq='H')
- idx
- idx + pd.offsets.Hour(2)
- idx + pd.Timedelta('120m')
+ In [104]: idx = pd.period_range('2014-07-01 09:00', periods=5, freq='H')
- idx = pd.period_range('2014-07', periods=5, freq='M')
- idx
- idx + pd.offsets.MonthEnd(3)
+ In [105]: idx
+ Out[105]:
+ PeriodIndex(['2014-07-01 09:00', '2014-07-01 10:00', '2014-07-01 11:00',
+ '2014-07-01 12:00', '2014-07-01 13:00'],
+ dtype='period[H]')
+
+ In [106]: idx + pd.offsets.Hour(2)
+ Out[106]:
+ PeriodIndex(['2014-07-01 11:00', '2014-07-01 12:00', '2014-07-01 13:00',
+ '2014-07-01 14:00', '2014-07-01 15:00'],
+ dtype='period[H]')
+
+ In [107]: idx + pd.Timedelta('120m')
+ Out[107]:
+ PeriodIndex(['2014-07-01 11:00', '2014-07-01 12:00', '2014-07-01 13:00',
+ '2014-07-01 14:00', '2014-07-01 15:00'],
+ dtype='period[H]')
+
+ In [108]: idx = pd.period_range('2014-07', periods=5, freq='M')
+
+ In [109]: idx
+ Out[109]: PeriodIndex(['2014-07', '2014-08', '2014-09', '2014-10', '2014-11'], dtype='period[M]')
+
+ In [110]: idx + pd.offsets.MonthEnd(3)
+ Out[110]: PeriodIndex(['2014-10', '2014-11', '2014-12', '2015-01', '2015-02'], dtype='period[M]')
- Added experimental compatibility with ``openpyxl`` for versions >= 2.0. The ``DataFrame.to_excel``
method ``engine`` keyword now recognizes ``openpyxl1`` and ``openpyxl2``
diff --git a/doc/source/whatsnew/v0.15.2.rst b/doc/source/whatsnew/v0.15.2.rst
index bb7beef449d93..acc5409b86d09 100644
--- a/doc/source/whatsnew/v0.15.2.rst
+++ b/doc/source/whatsnew/v0.15.2.rst
@@ -24,25 +24,61 @@ API changes
- Indexing in ``MultiIndex`` beyond lex-sort depth is now supported, though
a lexically sorted index will have a better performance. (:issue:`2646`)
- .. ipython:: python
- :okexcept:
- :okwarning:
+ .. code-block:: ipython
+
+ In [1]: df = pd.DataFrame({'jim':[0, 0, 1, 1],
+ ...: 'joe':['x', 'x', 'z', 'y'],
+ ...: 'jolie':np.random.rand(4)}).set_index(['jim', 'joe'])
+ ...:
- df = pd.DataFrame({'jim':[0, 0, 1, 1],
- 'joe':['x', 'x', 'z', 'y'],
- 'jolie':np.random.rand(4)}).set_index(['jim', 'joe'])
- df
- df.index.lexsort_depth
+ In [2]: df
+ Out[2]:
+ jolie
+ jim joe
+ 0 x 0.126970
+ x 0.966718
+ 1 z 0.260476
+ y 0.897237
+
+ [4 rows x 1 columns]
+
+ In [3]: df.index.lexsort_depth
+ Out[3]: 1
# in prior versions this would raise a KeyError
# will now show a PerformanceWarning
- df.loc[(1, 'z')]
+ In [4]: df.loc[(1, 'z')]
+ Out[4]:
+ jolie
+ jim joe
+ 1 z 0.260476
+
+ [1 rows x 1 columns]
# lexically sorting
- df2 = df.sort_index()
- df2
- df2.index.lexsort_depth
- df2.loc[(1,'z')]
+ In [5]: df2 = df.sort_index()
+
+ In [6]: df2
+ Out[6]:
+ jolie
+ jim joe
+ 0 x 0.126970
+ x 0.966718
+ 1 y 0.897237
+ z 0.260476
+
+ [4 rows x 1 columns]
+
+ In [7]: df2.index.lexsort_depth
+ Out[7]: 2
+
+ In [8]: df2.loc[(1,'z')]
+ Out[8]:
+ jolie
+ jim joe
+ 1 z 0.260476
+
+ [1 rows x 1 columns]
- Bug in unique of Series with ``category`` dtype, which returned all categories regardless
whether they were "used" or not (see :issue:`8559` for the discussion).
diff --git a/doc/source/whatsnew/v0.17.0.rst b/doc/source/whatsnew/v0.17.0.rst
index abbda2ffc9be2..fb71ec60a22f0 100644
--- a/doc/source/whatsnew/v0.17.0.rst
+++ b/doc/source/whatsnew/v0.17.0.rst
@@ -632,9 +632,10 @@ Of course you can coerce this as well.
To keep the previous behavior, you can use ``errors='ignore'``:
-.. ipython:: python
+.. code-block:: ipython
- pd.to_datetime(["2009-07-31", "asd"], errors="ignore")
+ In [4]: pd.to_datetime(["2009-07-31", "asd"], errors="ignore")
+ Out[4]: Index(['2009-07-31', 'asd'], dtype='object')
Furthermore, ``pd.to_timedelta`` has gained a similar API, of ``errors='raise'|'ignore'|'coerce'``, and the ``coerce`` keyword
has been deprecated in favor of ``errors='coerce'``.
@@ -726,10 +727,10 @@ be broadcast:
or it can return False if broadcasting can not be done:
-.. ipython:: python
- :okwarning:
+.. code-block:: ipython
- np.array([1, 2, 3]) == np.array([1, 2])
+ In [11]: np.array([1, 2, 3]) == np.array([1, 2])
+ Out[11]: False
Changes to boolean comparisons vs. None
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -793,7 +794,7 @@ Previous behavior:
In [27]:
df_with_missing.to_hdf('file.h5',
- 'df_with_missing',
+ key='df_with_missing',
format='table',
mode='w')
@@ -809,7 +810,7 @@ New behavior:
.. ipython:: python
- df_with_missing.to_hdf("file.h5", "df_with_missing", format="table", mode="w")
+ df_with_missing.to_hdf("file.h5", key="df_with_missing", format="table", mode="w")
pd.read_hdf("file.h5", "df_with_missing")
diff --git a/doc/source/whatsnew/v0.17.1.rst b/doc/source/whatsnew/v0.17.1.rst
index 774d17e6ff6b0..93de83875746b 100644
--- a/doc/source/whatsnew/v0.17.1.rst
+++ b/doc/source/whatsnew/v0.17.1.rst
@@ -43,7 +43,7 @@ We've added *experimental* support for conditional HTML formatting:
the visual styling of a DataFrame based on the data.
The styling is accomplished with HTML and CSS.
Access the styler class with the :attr:`pandas.DataFrame.style` attribute,
-an instance of :class:`~pandas.core.style.Styler` with your data attached.
+an instance of :class:`.Styler` with your data attached.
Here's a quick example:
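The example itself falls outside this hunk; a hedged sketch of the kind of call the ``.style`` attribute enables (``highlight_max`` is one built-in method, not necessarily the one used in the file):

.. code-block:: python

   import numpy as np
   import pandas as pd

   np.random.seed(24)
   df = pd.DataFrame(np.random.randn(5, 3), columns=list("ABC"))
   # returns a Styler; rendering it in a notebook highlights each column's maximum
   df.style.highlight_max()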
@@ -58,7 +58,7 @@ We can render the HTML to get the following table.
.. raw:: html
:file: whatsnew_0171_html_table.html
-:class:`~pandas.core.style.Styler` interacts nicely with the Jupyter Notebook.
+:class:`.Styler` interacts nicely with the Jupyter Notebook.
See the :ref:`documentation ` for more.
.. _whatsnew_0171.enhancements:
diff --git a/doc/source/whatsnew/v0.18.0.rst b/doc/source/whatsnew/v0.18.0.rst
index a05b9bb1a88ef..569197fe9daf5 100644
--- a/doc/source/whatsnew/v0.18.0.rst
+++ b/doc/source/whatsnew/v0.18.0.rst
@@ -808,11 +808,19 @@ Upsampling operations take you from a lower frequency to a higher frequency. The
performed with the ``Resampler`` objects with :meth:`~Resampler.backfill`,
:meth:`~Resampler.ffill`, :meth:`~Resampler.fillna` and :meth:`~Resampler.asfreq` methods.
-.. ipython:: python
+.. code-block:: ipython
- s = pd.Series(np.arange(5, dtype='int64'),
+ In [89]: s = pd.Series(np.arange(5, dtype='int64'),
index=pd.date_range('2010-01-01', periods=5, freq='Q'))
- s
+
+ In [90]: s
+ Out[90]:
+ 2010-03-31 0
+ 2010-06-30 1
+ 2010-09-30 2
+ 2010-12-31 3
+ 2011-03-31 4
+ Freq: Q-DEC, Length: 5, dtype: int64
Previously
@@ -837,9 +845,24 @@ Previously
New API
-.. ipython:: python
+.. code-block:: ipython
- s.resample('M').ffill()
+ In [91]: s.resample('M').ffill()
+ Out[91]:
+ 2010-03-31 0
+ 2010-04-30 0
+ 2010-05-31 0
+ 2010-06-30 1
+ 2010-07-31 1
+ 2010-08-31 1
+ 2010-09-30 2
+ 2010-10-31 2
+ 2010-11-30 2
+ 2010-12-31 3
+ 2011-01-31 3
+ 2011-02-28 3
+ 2011-03-31 4
+ Freq: M, Length: 13, dtype: int64
.. note::
@@ -985,10 +1008,16 @@ Other API changes
^^^^^^^^^^^^^^^^^
- ``DataFrame.between_time`` and ``Series.between_time`` now only parse a fixed set of time strings. Parsing of date strings is no longer supported and raises a ``ValueError``. (:issue:`11818`)
- .. ipython:: python
+ .. code-block:: ipython
+
+ In [107]: s = pd.Series(range(10), pd.date_range('2015-01-01', freq='H', periods=10))
- s = pd.Series(range(10), pd.date_range('2015-01-01', freq='H', periods=10))
- s.between_time("7:00am", "9:00am")
+ In [108]: s.between_time("7:00am", "9:00am")
+ Out[108]:
+ 2015-01-01 07:00:00 7
+ 2015-01-01 08:00:00 8
+ 2015-01-01 09:00:00 9
+ Freq: H, Length: 3, dtype: int64
This will now raise.
diff --git a/doc/source/whatsnew/v0.18.1.rst b/doc/source/whatsnew/v0.18.1.rst
index 7d9008fdbdecd..85e0e63016729 100644
--- a/doc/source/whatsnew/v0.18.1.rst
+++ b/doc/source/whatsnew/v0.18.1.rst
@@ -77,9 +77,52 @@ Previously you would have to do this to get a rolling window mean per-group:
df = pd.DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)})
df
-.. ipython:: python
+.. code-block:: ipython
- df.groupby("A").apply(lambda x: x.rolling(4).B.mean())
+ In [1]: df.groupby("A").apply(lambda x: x.rolling(4).B.mean())
+ Out[1]:
+ A
+ 1 0 NaN
+ 1 NaN
+ 2 NaN
+ 3 1.5
+ 4 2.5
+ 5 3.5
+ 6 4.5
+ 7 5.5
+ 8 6.5
+ 9 7.5
+ 10 8.5
+ 11 9.5
+ 12 10.5
+ 13 11.5
+ 14 12.5
+ 15 13.5
+ 16 14.5
+ 17 15.5
+ 18 16.5
+ 19 17.5
+ 2 20 NaN
+ 21 NaN
+ 22 NaN
+ 23 21.5
+ 24 22.5
+ 25 23.5
+ 26 24.5
+ 27 25.5
+ 28 26.5
+ 29 27.5
+ 30 28.5
+ 31 29.5
+ 3 32 NaN
+ 33 NaN
+ 34 NaN
+ 35 33.5
+ 36 34.5
+ 37 35.5
+ 38 36.5
+ 39 37.5
+ Name: B, dtype: float64
Now you can do:
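The replacement snippet falls outside the hunk; the new spelling is essentially the direct groupby-rolling chain:

.. code-block:: python

   # rolling window mean per group, without the .apply indirection shown above
   df.groupby("A").rolling(4).B.mean()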
@@ -101,15 +144,53 @@ For ``.resample(..)`` type of operations, previously you would have to:
df
-.. ipython:: python
+.. code-block:: ipython
- df.groupby("group").apply(lambda x: x.resample("1D").ffill())
+ In [1]: df.groupby("group").apply(lambda x: x.resample("1D").ffill())
+ Out[1]:
+ group val
+ group date
+ 1 2016-01-03 1 5
+ 2016-01-04 1 5
+ 2016-01-05 1 5
+ 2016-01-06 1 5
+ 2016-01-07 1 5
+ 2016-01-08 1 5
+ 2016-01-09 1 5
+ 2016-01-10 1 6
+ 2 2016-01-17 2 7
+ 2016-01-18 2 7
+ 2016-01-19 2 7
+ 2016-01-20 2 7
+ 2016-01-21 2 7
+ 2016-01-22 2 7
+ 2016-01-23 2 7
+ 2016-01-24 2 8
Now you can do:
-.. ipython:: python
+.. code-block:: ipython
- df.groupby("group").resample("1D").ffill()
+ In [1]: df.groupby("group").resample("1D").ffill()
+ Out[1]:
+ group val
+ group date
+ 1 2016-01-03 1 5
+ 2016-01-04 1 5
+ 2016-01-05 1 5
+ 2016-01-06 1 5
+ 2016-01-07 1 5
+ 2016-01-08 1 5
+ 2016-01-09 1 5
+ 2016-01-10 1 6
+ 2 2016-01-17 2 7
+ 2016-01-18 2 7
+ 2016-01-19 2 7
+ 2016-01-20 2 7
+ 2016-01-21 2 7
+ 2016-01-22 2 7
+ 2016-01-23 2 7
+ 2016-01-24 2 8
.. _whatsnew_0181.enhancements.method_chain:
@@ -175,26 +256,78 @@ Partial string indexing on ``DatetimeIndex`` when part of a ``MultiIndex``
Partial string indexing now matches on ``DateTimeIndex`` when part of a ``MultiIndex`` (:issue:`10331`)
-.. ipython:: python
+.. code-block:: ipython
- dft2 = pd.DataFrame(
- np.random.randn(20, 1),
- columns=["A"],
- index=pd.MultiIndex.from_product(
- [pd.date_range("20130101", periods=10, freq="12H"), ["a", "b"]]
- ),
- )
- dft2
- dft2.loc["2013-01-05"]
+ In [20]: dft2 = pd.DataFrame(
+ ....: np.random.randn(20, 1),
+ ....: columns=["A"],
+ ....: index=pd.MultiIndex.from_product(
+ ....: [pd.date_range("20130101", periods=10, freq="12H"), ["a", "b"]]
+ ....: ),
+ ....: )
+ ....:
+
+ In [21]: dft2
+ Out[21]:
+ A
+ 2013-01-01 00:00:00 a 0.469112
+ b -0.282863
+ 2013-01-01 12:00:00 a -1.509059
+ b -1.135632
+ 2013-01-02 00:00:00 a 1.212112
+ ... ...
+ 2013-01-04 12:00:00 b 0.271860
+ 2013-01-05 00:00:00 a -0.424972
+ b 0.567020
+ 2013-01-05 12:00:00 a 0.276232
+ b -1.087401
+
+ [20 rows x 1 columns]
+
+ In [22]: dft2.loc["2013-01-05"]
+ Out[22]:
+ A
+ 2013-01-05 00:00:00 a -0.424972
+ b 0.567020
+ 2013-01-05 12:00:00 a 0.276232
+ b -1.087401
+
+ [4 rows x 1 columns]
On other levels
-.. ipython:: python
+.. code-block:: ipython
- idx = pd.IndexSlice
- dft2 = dft2.swaplevel(0, 1).sort_index()
- dft2
- dft2.loc[idx[:, "2013-01-05"], :]
+ In [26]: idx = pd.IndexSlice
+
+ In [27]: dft2 = dft2.swaplevel(0, 1).sort_index()
+
+ In [28]: dft2
+ Out[28]:
+ A
+ a 2013-01-01 00:00:00 0.469112
+ 2013-01-01 12:00:00 -1.509059
+ 2013-01-02 00:00:00 1.212112
+ 2013-01-02 12:00:00 0.119209
+ 2013-01-03 00:00:00 -0.861849
+ ... ...
+ b 2013-01-03 12:00:00 1.071804
+ 2013-01-04 00:00:00 -0.706771
+ 2013-01-04 12:00:00 0.271860
+ 2013-01-05 00:00:00 0.567020
+ 2013-01-05 12:00:00 -1.087401
+
+ [20 rows x 1 columns]
+
+ In [29]: dft2.loc[idx[:, "2013-01-05"], :]
+ Out[29]:
+ A
+ a 2013-01-05 00:00:00 -0.424972
+ 2013-01-05 12:00:00 0.276232
+ b 2013-01-05 00:00:00 0.567020
+ 2013-01-05 12:00:00 -1.087401
+
+ [4 rows x 1 columns]
.. _whatsnew_0181.enhancements.assembling:
diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst
index d4b879f137698..1ae711113773f 100644
--- a/doc/source/whatsnew/v0.19.0.rst
+++ b/doc/source/whatsnew/v0.19.0.rst
@@ -329,11 +329,13 @@ These provide date offsets anchored (by default) to the 15th and end of month, a
**SemiMonthEnd**:
-.. ipython:: python
+.. code-block:: ipython
- pd.Timestamp("2016-01-01") + SemiMonthEnd()
+ In [46]: pd.Timestamp("2016-01-01") + SemiMonthEnd()
+ Out[46]: Timestamp('2016-01-15 00:00:00')
- pd.date_range("2015-01-01", freq="SM", periods=4)
+ In [47]: pd.date_range("2015-01-01", freq="SM", periods=4)
+ Out[47]: DatetimeIndex(['2015-01-15', '2015-01-31', '2015-02-15', '2015-02-28'], dtype='datetime64[ns]', freq='SM-15')
**SemiMonthBegin**:
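The ``SemiMonthBegin`` examples sit outside this hunk; a sketch of the analogous behavior (anchored by default to the 1st and 15th of the month):

.. code-block:: python

   import pandas as pd
   from pandas.tseries.offsets import SemiMonthBegin

   pd.Timestamp("2016-01-01") + SemiMonthBegin()
   # Timestamp('2016-01-15 00:00:00')
   pd.date_range("2015-01-01", freq="SMS", periods=4)
   # DatetimeIndex(['2015-01-01', '2015-01-15', '2015-02-01', '2015-02-15'],
   #               dtype='datetime64[ns]', freq='SMS-15')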
@@ -345,11 +347,13 @@ These provide date offsets anchored (by default) to the 15th and end of month, a
Using the anchoring suffix, you can also specify the day of month to use instead of the 15th.
-.. ipython:: python
+.. code-block:: ipython
- pd.date_range("2015-01-01", freq="SMS-16", periods=4)
+ In [50]: pd.date_range("2015-01-01", freq="SMS-16", periods=4)
+ Out[50]: DatetimeIndex(['2015-01-01', '2015-01-16', '2015-02-01', '2015-02-16'], dtype='datetime64[ns]', freq='SMS-16')
- pd.date_range("2015-01-01", freq="SM-14", periods=4)
+ In [51]: pd.date_range("2015-01-01", freq="SM-14", periods=4)
+ Out[51]: DatetimeIndex(['2015-01-14', '2015-01-31', '2015-02-14', '2015-02-28'], dtype='datetime64[ns]', freq='SM-14')
.. _whatsnew_0190.enhancements.index:
@@ -498,8 +502,26 @@ Other enhancements
),
)
df
- df.resample("M", on="date")[["a"]].sum()
- df.resample("M", level="d")[["a"]].sum()
+
+ .. code-block:: ipython
+
+ In [74]: df.resample("M", on="date")[["a"]].sum()
+ Out[74]:
+ a
+ date
+ 2015-01-31 6
+ 2015-02-28 4
+
+ [2 rows x 1 columns]
+
+ In [75]: df.resample("M", level="d")[["a"]].sum()
+ Out[75]:
+ a
+ d
+ 2015-01-31 6
+ 2015-02-28 4
+
+ [2 rows x 1 columns]
- The ``.get_credentials()`` method of ``GbqConnector`` can now first try to fetch `the application default credentials `__. See the docs for more details (:issue:`13577`).
- The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behavior remains raising a ``NonExistentTimeError`` (:issue:`13057`)
diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst
index b4224785988e6..09bf5428d0432 100644
--- a/doc/source/whatsnew/v0.20.0.rst
+++ b/doc/source/whatsnew/v0.20.0.rst
@@ -614,11 +614,18 @@ New behavior:
``map`` on a ``Series`` with ``datetime64`` values may return ``int64`` dtypes rather than ``int32``
-.. ipython:: python
+.. code-block:: ipython
- s = pd.Series(pd.date_range('2011-01-02T00:00', '2011-01-02T02:00', freq='H')
- .tz_localize('Asia/Tokyo'))
- s
+ In [64]: s = pd.Series(pd.date_range('2011-01-02T00:00', '2011-01-02T02:00', freq='H')
+ ....: .tz_localize('Asia/Tokyo'))
+ ....:
+
+ In [65]: s
+ Out[65]:
+ 0 2011-01-02 00:00:00+09:00
+ 1 2011-01-02 01:00:00+09:00
+ 2 2011-01-02 02:00:00+09:00
+ Length: 3, dtype: datetime64[ns, Asia/Tokyo]
Previous behavior:
@@ -633,9 +640,14 @@ Previous behavior:
New behavior:
-.. ipython:: python
+.. code-block:: ipython
- s.map(lambda x: x.hour)
+ In [66]: s.map(lambda x: x.hour)
+ Out[66]:
+ 0 0
+ 1 1
+ 2 2
+ Length: 3, dtype: int64
.. _whatsnew_0200.api_breaking.index_dt_field:
@@ -659,10 +671,12 @@ Previous behaviour:
New behavior:
-.. ipython:: python
+.. code-block:: ipython
- idx = pd.date_range("2015-01-01", periods=5, freq='10H')
- idx.hour
+ In [67]: idx = pd.date_range("2015-01-01", periods=5, freq='10H')
+
+ In [68]: idx.hour
+ Out[68]: Index([0, 10, 20, 6, 16], dtype='int32')
This has the advantage that specific ``Index`` methods are still available on the
result. On the other hand, this might have backward incompatibilities: e.g.
@@ -872,11 +886,23 @@ This would happen with a ``lexsorted``, but non-monotonic levels. (:issue:`15622
This is *unchanged* from prior versions, but shown for illustration purposes:
-.. ipython:: python
+.. code-block:: ipython
- df = pd.DataFrame(np.arange(6), columns=['value'],
- index=pd.MultiIndex.from_product([list('BA'), range(3)]))
- df
+ In [81]: df = pd.DataFrame(np.arange(6), columns=['value'],
+ ....: index=pd.MultiIndex.from_product([list('BA'), range(3)]))
+ ....:
+ In [82]: df
+
+ Out[82]:
+ value
+ B 0 0
+ 1 1
+ 2 2
+ A 0 3
+ 1 4
+ 2 5
+
+ [6 rows x 1 columns]
.. code-block:: python
@@ -1059,7 +1085,7 @@ usually resulting in an invalid comparison, returning an empty result frame. The
.. ipython:: python
df = pd.DataFrame({'unparsed_date': ['2014-01-01', '2014-01-01']})
- df.to_hdf('store.h5', 'key', format='table', data_columns=True)
+ df.to_hdf('store.h5', key='key', format='table', data_columns=True)
df.dtypes
Previous behavior:
diff --git a/doc/source/whatsnew/v0.20.2.rst b/doc/source/whatsnew/v0.20.2.rst
index 430a39d2d2e97..6945b360a97b0 100644
--- a/doc/source/whatsnew/v0.20.2.rst
+++ b/doc/source/whatsnew/v0.20.2.rst
@@ -28,8 +28,8 @@ Enhancements
- Unblocked access to additional compression types supported in pytables: 'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', 'blosc:zlib', 'blosc:zstd' (:issue:`14478`)
- ``Series`` provides a ``to_latex`` method (:issue:`16180`)
-- A new groupby method :meth:`~pandas.core.groupby.GroupBy.ngroup`,
- parallel to the existing :meth:`~pandas.core.groupby.GroupBy.cumcount`,
+- A new groupby method :meth:`.GroupBy.ngroup`,
+ parallel to the existing :meth:`.GroupBy.cumcount`,
has been added to return the group order (:issue:`11642`); see
:ref:`here `.
diff --git a/doc/source/whatsnew/v0.21.0.rst b/doc/source/whatsnew/v0.21.0.rst
index f4976cc243f77..dad69b99ee6a4 100644
--- a/doc/source/whatsnew/v0.21.0.rst
+++ b/doc/source/whatsnew/v0.21.0.rst
@@ -306,7 +306,7 @@ Other enhancements
New functions or methods
""""""""""""""""""""""""
-- :meth:`~pandas.core.resample.Resampler.nearest` is added to support nearest-neighbor upsampling (:issue:`17496`).
+- :meth:`.Resampler.nearest` is added to support nearest-neighbor upsampling (:issue:`17496`).
- :class:`~pandas.Index` has added support for a ``to_frame`` method (:issue:`15230`).
New keywords
@@ -392,7 +392,7 @@ Sum/prod of all-NaN or empty Series/DataFrames is now consistently NaN
The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames no longer depends on
whether `bottleneck `__ is installed, and return value of ``sum`` and ``prod`` on an empty Series has changed (:issue:`9422`, :issue:`15507`).
-Calling ``sum`` or ``prod`` on an empty or all-``NaN`` ``Series``, or columns of a ``DataFrame``, will result in ``NaN``. See the :ref:`docs `.
+Calling ``sum`` or ``prod`` on an empty or all-``NaN`` ``Series``, or columns of a ``DataFrame``, will result in ``NaN``. See the :ref:`docs `.
.. ipython:: python
@@ -635,17 +635,22 @@ Previous behavior:
New behavior:
-.. ipython:: python
+.. code-block:: ipython
- pi = pd.period_range('2017-01', periods=12, freq='M')
+ In [1]: pi = pd.period_range('2017-01', periods=12, freq='M')
- s = pd.Series(np.arange(12), index=pi)
+ In [2]: s = pd.Series(np.arange(12), index=pi)
- resampled = s.resample('2Q').mean()
+ In [3]: resampled = s.resample('2Q').mean()
- resampled
+ In [4]: resampled
+ Out[4]:
+ 2017Q1 2.5
+ 2017Q3 8.5
+ Freq: 2Q-DEC, dtype: float64
- resampled.index
+ In [5]: resampled.index
+ Out[5]: PeriodIndex(['2017Q1', '2017Q3'], dtype='period[2Q-DEC]')
Upsampling and calling ``.ohlc()`` previously returned a ``Series``, basically identical to calling ``.asfreq()``. OHLC upsampling now returns a DataFrame with columns ``open``, ``high``, ``low`` and ``close`` (:issue:`13083`). This is consistent with downsampling and ``DatetimeIndex`` behavior.
@@ -671,15 +676,35 @@ Previous behavior:
New behavior:
-.. ipython:: python
+.. code-block:: ipython
+
+ In [56]: pi = pd.period_range(start='2000-01-01', freq='D', periods=10)
- pi = pd.period_range(start='2000-01-01', freq='D', periods=10)
+ In [57]: s = pd.Series(np.arange(10), index=pi)
- s = pd.Series(np.arange(10), index=pi)
+ In [58]: s.resample('H').ohlc()
+ Out[58]:
+ open high low close
+ 2000-01-01 00:00 0.0 0.0 0.0 0.0
+ 2000-01-01 01:00 NaN NaN NaN NaN
+ 2000-01-01 02:00 NaN NaN NaN NaN
+ 2000-01-01 03:00 NaN NaN NaN NaN
+ 2000-01-01 04:00 NaN NaN NaN NaN
+ ... ... ... ... ...
+ 2000-01-10 19:00 NaN NaN NaN NaN
+ 2000-01-10 20:00 NaN NaN NaN NaN
+ 2000-01-10 21:00 NaN NaN NaN NaN
+ 2000-01-10 22:00 NaN NaN NaN NaN
+ 2000-01-10 23:00 NaN NaN NaN NaN
- s.resample('H').ohlc()
+ [240 rows x 4 columns]
+
+ In [59]: s.resample('M').ohlc()
+ Out[59]:
+ open high low close
+ 2000-01 0 9 0 9
- s.resample('M').ohlc()
+ [1 rows x 4 columns]
.. _whatsnew_0210.api_breaking.pandas_eval:
diff --git a/doc/source/whatsnew/v0.22.0.rst b/doc/source/whatsnew/v0.22.0.rst
index c494b4f286662..a33a8f7addeef 100644
--- a/doc/source/whatsnew/v0.22.0.rst
+++ b/doc/source/whatsnew/v0.22.0.rst
@@ -187,16 +187,27 @@ entirely valid.
*pandas 0.22.0*
-.. ipython:: python
+.. code-block:: ipython
- idx = pd.DatetimeIndex(["2017-01-01", "2017-01-02"])
- pd.Series([1, 2], index=idx).resample("12H").sum()
+ In [14]: idx = pd.DatetimeIndex(["2017-01-01", "2017-01-02"])
+ In [15]: pd.Series([1, 2], index=idx).resample("12H").sum()
+ Out[15]:
+ 2017-01-01 00:00:00 1
+ 2017-01-01 12:00:00 0
+ 2017-01-02 00:00:00 2
+ Freq: 12H, Length: 3, dtype: int64
Once again, the ``min_count`` keyword is available to restore the 0.21 behavior.
-.. ipython:: python
+.. code-block:: ipython
+
+ In [16]: pd.Series([1, 2], index=idx).resample("12H").sum(min_count=1)
+ Out[16]:
+ 2017-01-01 00:00:00 1.0
+ 2017-01-01 12:00:00 NaN
+ 2017-01-02 00:00:00 2.0
+ Freq: 12H, Length: 3, dtype: float64
- pd.Series([1, 2], index=idx).resample("12H").sum(min_count=1)
Rolling and expanding
^^^^^^^^^^^^^^^^^^^^^
diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst
index 5aba5ec061e8b..808741ccf4475 100644
--- a/doc/source/whatsnew/v0.23.0.rst
+++ b/doc/source/whatsnew/v0.23.0.rst
@@ -286,12 +286,33 @@ For pivoting operations, this behavior is *already* controlled by the ``dropna``
df = pd.DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
df
-.. ipython:: python
- pd.pivot_table(df, values='values', index=['A', 'B'],
- dropna=True)
- pd.pivot_table(df, values='values', index=['A', 'B'],
- dropna=False)
+.. code-block:: ipython
+
+ In [1]: pd.pivot_table(df, values='values', index=['A', 'B'], dropna=True)
+
+ Out[1]:
+ values
+ A B
+ a c 1.0
+ d 2.0
+ b c 3.0
+ d 4.0
+
+ In [2]: pd.pivot_table(df, values='values', index=['A', 'B'], dropna=False)
+
+ Out[2]:
+ values
+ A B
+ a c 1.0
+ d 2.0
+ y NaN
+ b c 3.0
+ d 4.0
+ y NaN
+ z c NaN
+ d NaN
+ y NaN
.. _whatsnew_0230.enhancements.window_raw:
@@ -299,8 +320,8 @@ For pivoting operations, this behavior is *already* controlled by the ``dropna``
Rolling/Expanding.apply() accepts ``raw=False`` to pass a ``Series`` to the function
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-:func:`Series.rolling().apply() `, :func:`DataFrame.rolling().apply() `,
-:func:`Series.expanding().apply() `, and :func:`DataFrame.expanding().apply() ` have gained a ``raw=None`` parameter.
+:func:`Series.rolling().apply() <.Rolling.apply>`, :func:`DataFrame.rolling().apply() <.Rolling.apply>`,
+:func:`Series.expanding().apply() <.Expanding.apply>`, and :func:`DataFrame.expanding().apply() <.Expanding.apply>` have gained a ``raw=None`` parameter.
This is similar to :func:`DataFrame.apply`. This parameter, if ``True``, allows one to send a ``np.ndarray`` to the applied function. If ``False``, a ``Series`` will be passed. The
default is ``None``, which preserves backward compatibility, so this will default to ``True``, sending an ``np.ndarray``.
In a future version the default will be changed to ``False``, sending a ``Series``. (:issue:`5071`, :issue:`20584`)
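A minimal sketch of the difference (``raw`` passed explicitly, since the default was in flux at the time):

.. code-block:: python

   import numpy as np
   import pandas as pd

   s = pd.Series(range(5), dtype="float64")
   # raw=True hands each window to the function as a NumPy ndarray ...
   s.rolling(2).apply(lambda x: isinstance(x, np.ndarray), raw=True)
   # ... raw=False hands it a Series (index preserved inside the window)
   s.rolling(2).apply(lambda x: isinstance(x, pd.Series), raw=False)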
@@ -524,7 +545,7 @@ Other enhancements
- ``Categorical.rename_categories``, ``CategoricalIndex.rename_categories`` and :attr:`Series.cat.rename_categories`
can now take a callable as their argument (:issue:`18862`)
- :class:`Interval` and :class:`IntervalIndex` have gained a ``length`` attribute (:issue:`18789`)
-- ``Resampler`` objects now have a functioning :attr:`~pandas.core.resample.Resampler.pipe` method.
+- ``Resampler`` objects now have a functioning :attr:`.Resampler.pipe` method.
Previously, calls to ``pipe`` were diverted to the ``mean`` method (:issue:`17905`).
- :func:`~pandas.api.types.is_scalar` now returns ``True`` for ``DateOffset`` objects (:issue:`18943`).
- :func:`DataFrame.pivot` now accepts a list for the ``values=`` kwarg (:issue:`17160`).
@@ -536,7 +557,7 @@ Other enhancements
- ``IntervalIndex.astype`` now supports conversions between subtypes when passed an ``IntervalDtype`` (:issue:`19197`)
- :class:`IntervalIndex` and its associated constructor methods (``from_arrays``, ``from_breaks``, ``from_tuples``) have gained a ``dtype`` parameter (:issue:`19262`)
-- Added :func:`pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing` and :func:`pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing` (:issue:`17015`)
+- Added :func:`.SeriesGroupBy.is_monotonic_increasing` and :func:`.SeriesGroupBy.is_monotonic_decreasing` (:issue:`17015`)
- For subclassed ``DataFrames``, :func:`DataFrame.apply` will now preserve the ``Series`` subclass (if defined) when passing the data to the applied function (:issue:`19822`)
- :func:`DataFrame.from_dict` now accepts a ``columns`` argument that can be used to specify the column names when ``orient='index'`` is used (:issue:`18529`)
- Added option ``display.html.use_mathjax`` so `MathJax `_ can be disabled when rendering tables in ``Jupyter`` notebooks (:issue:`19856`, :issue:`19824`)
@@ -547,7 +568,7 @@ Other enhancements
``SQLAlchemy`` dialects supporting multi-value inserts include: ``mysql``, ``postgresql``, ``sqlite`` and any dialect with ``supports_multivalues_insert``. (:issue:`14315`, :issue:`8953`)
- :func:`read_html` now accepts a ``displayed_only`` keyword argument to control whether or not hidden elements are parsed (``True`` by default) (:issue:`20027`)
- :func:`read_html` now reads all ``<tbody>`` elements in a ``<table>``, not just the first. (:issue:`20690`)
-- :meth:`~pandas.core.window.Rolling.quantile` and :meth:`~pandas.core.window.Expanding.quantile` now accept the ``interpolation`` keyword, ``linear`` by default (:issue:`20497`)
+- :meth:`.Rolling.quantile` and :meth:`.Expanding.quantile` now accept the ``interpolation`` keyword, ``linear`` by default (:issue:`20497`)
- zip compression is supported via ``compression=zip`` in :func:`DataFrame.to_pickle`, :func:`Series.to_pickle`, :func:`DataFrame.to_csv`, :func:`Series.to_csv`, :func:`DataFrame.to_json`, :func:`Series.to_json`. (:issue:`17778`)
- :class:`~pandas.tseries.offsets.WeekOfMonth` constructor now supports ``n=0`` (:issue:`20517`).
- :class:`DataFrame` and :class:`Series` now support matrix multiplication (``@``) operator (:issue:`10259`) for Python>=3.5
@@ -1052,7 +1073,7 @@ Other API changes
- :func:`DatetimeIndex.strftime` and :func:`PeriodIndex.strftime` now return an ``Index`` instead of a numpy array to be consistent with similar accessors (:issue:`20127`)
- Constructing a Series from a list of length 1 no longer broadcasts this list when a longer index is specified (:issue:`19714`, :issue:`20391`).
- :func:`DataFrame.to_dict` with ``orient='index'`` no longer casts int columns to float for a DataFrame with only int and float columns (:issue:`18580`)
-- A user-defined-function that is passed to :func:`Series.rolling().aggregate() `, :func:`DataFrame.rolling().aggregate() `, or its expanding cousins, will now *always* be passed a ``Series``, rather than a ``np.array``; ``.apply()`` only has the ``raw`` keyword, see :ref:`here `. This is consistent with the signatures of ``.aggregate()`` across pandas (:issue:`20584`)
+- A user-defined-function that is passed to :func:`Series.rolling().aggregate() <.Rolling.aggregate>`, :func:`DataFrame.rolling().aggregate() <.Rolling.aggregate>`, or its expanding cousins, will now *always* be passed a ``Series``, rather than a ``np.array``; ``.apply()`` only has the ``raw`` keyword, see :ref:`here `. This is consistent with the signatures of ``.aggregate()`` across pandas (:issue:`20584`)
- Rolling and Expanding types raise ``NotImplementedError`` upon iteration (:issue:`11704`).
.. _whatsnew_0230.deprecations:
@@ -1084,8 +1105,7 @@ Deprecations
- ``Index.summary()`` is deprecated and will be removed in a future version (:issue:`18217`)
- ``NDFrame.get_ftype_counts()`` is deprecated and will be removed in a future version (:issue:`18243`)
- The ``convert_datetime64`` parameter in :func:`DataFrame.to_records` has been deprecated and will be removed in a future version. The NumPy bug motivating this parameter has been resolved. The default value for this parameter has also changed from ``True`` to ``None`` (:issue:`18160`).
-- :func:`Series.rolling().apply() `, :func:`DataFrame.rolling().apply() `,
- :func:`Series.expanding().apply() `, and :func:`DataFrame.expanding().apply() ` have deprecated passing an ``np.array`` by default. One will need to pass the new ``raw`` parameter to be explicit about what is passed (:issue:`20584`)
+- :func:`Series.rolling().apply() <.Rolling.apply>`, :func:`DataFrame.rolling().apply() <.Rolling.apply>`, :func:`Series.expanding().apply() <.Expanding.apply>`, and :func:`DataFrame.expanding().apply() <.Expanding.apply>` have deprecated passing an ``np.array`` by default. One will need to pass the new ``raw`` parameter to be explicit about what is passed (:issue:`20584`)
- The ``data``, ``base``, ``strides``, ``flags`` and ``itemsize`` properties
of the ``Series`` and ``Index`` classes have been deprecated and will be
removed in a future version (:issue:`20419`).
@@ -1159,15 +1179,15 @@ Performance improvements
- Improved performance of :func:`MultiIndex.remove_unused_levels` when there are no unused levels, at the cost of a reduction in performance when there are (:issue:`19289`)
- Improved performance of :func:`Index.get_loc` for non-unique indexes (:issue:`19478`)
- Improved performance of pairwise ``.rolling()`` and ``.expanding()`` with ``.cov()`` and ``.corr()`` operations (:issue:`17917`)
-- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` (:issue:`15779`)
+- Improved performance of :func:`.GroupBy.rank` (:issue:`15779`)
- Improved performance of variable ``.rolling()`` on ``.min()`` and ``.max()`` (:issue:`19521`)
-- Improved performance of :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` (:issue:`11296`)
-- Improved performance of :func:`pandas.core.groupby.GroupBy.any` and :func:`pandas.core.groupby.GroupBy.all` (:issue:`15435`)
-- Improved performance of :func:`pandas.core.groupby.GroupBy.pct_change` (:issue:`19165`)
+- Improved performance of :func:`.GroupBy.ffill` and :func:`.GroupBy.bfill` (:issue:`11296`)
+- Improved performance of :func:`.GroupBy.any` and :func:`.GroupBy.all` (:issue:`15435`)
+- Improved performance of :func:`.GroupBy.pct_change` (:issue:`19165`)
- Improved performance of :func:`Series.isin` in the case of categorical dtypes (:issue:`20003`)
- Improved performance of ``getattr(Series, attr)`` when the Series has certain index types. This manifested in slow printing of large Series with a ``DatetimeIndex`` (:issue:`19764`)
- Fixed a performance regression for :func:`GroupBy.nth` and :func:`GroupBy.last` with some object columns (:issue:`19283`)
-- Improved performance of :func:`pandas.core.arrays.Categorical.from_codes` (:issue:`18501`)
+- Improved performance of :func:`.Categorical.from_codes` (:issue:`18501`)
.. _whatsnew_0230.docs:
@@ -1412,13 +1432,13 @@ GroupBy/resample/rolling
- Bug in :func:`DataFrame.groupby` where aggregation by ``first``/``last``/``min``/``max`` was causing timestamps to lose precision (:issue:`19526`)
- Bug in :func:`DataFrame.transform` where particular aggregation functions were being incorrectly cast to match the dtype(s) of the grouped data (:issue:`19200`)
- Bug in :func:`DataFrame.groupby` passing the ``on=`` kwarg, and subsequently using ``.apply()`` (:issue:`17813`)
-- Bug in :func:`DataFrame.resample().aggregate ` not raising a ``KeyError`` when aggregating a non-existent column (:issue:`16766`, :issue:`19566`)
+- Bug in :func:`DataFrame.resample().aggregate <.Resampler.aggregate>` not raising a ``KeyError`` when aggregating a non-existent column (:issue:`16766`, :issue:`19566`)
- Bug in :func:`DataFrameGroupBy.cumsum` and :func:`DataFrameGroupBy.cumprod` when ``skipna`` was passed (:issue:`19806`)
- Bug in :func:`DataFrame.resample` that dropped timezone information (:issue:`13238`)
- Bug in :func:`DataFrame.groupby` where transformations using ``np.all`` and ``np.any`` were raising a ``ValueError`` (:issue:`20653`)
- Bug in :func:`DataFrame.resample` where ``ffill``, ``bfill``, ``pad``, ``backfill``, ``fillna``, ``interpolate``, and ``asfreq`` were ignoring ``loffset``. (:issue:`20744`)
- Bug in :func:`DataFrame.groupby` when applying a function that has mixed data types and the user supplied function can fail on the grouping column (:issue:`20949`)
-- Bug in :func:`DataFrameGroupBy.rolling().apply() ` where operations performed against the associated :class:`DataFrameGroupBy` object could impact the inclusion of the grouped item(s) in the result (:issue:`14013`)
+- Bug in :func:`DataFrameGroupBy.rolling().apply() <.Rolling.apply>` where operations performed against the associated :class:`DataFrameGroupBy` object could impact the inclusion of the grouped item(s) in the result (:issue:`14013`)
Sparse
^^^^^^
diff --git a/doc/source/whatsnew/v0.23.1.rst b/doc/source/whatsnew/v0.23.1.rst
index b51368c87f991..685fe1b3836bf 100644
--- a/doc/source/whatsnew/v0.23.1.rst
+++ b/doc/source/whatsnew/v0.23.1.rst
@@ -100,8 +100,8 @@ Bug fixes
**Groupby/resample/rolling**
- Bug in :func:`DataFrame.agg` where applying multiple aggregation functions to a :class:`DataFrame` with duplicated column names would cause a stack overflow (:issue:`21063`)
-- Bug in :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` where the fill within a grouping would not always be applied as intended due to the implementations' use of a non-stable sort (:issue:`21207`)
-- Bug in :func:`pandas.core.groupby.GroupBy.rank` where results did not scale to 100% when specifying ``method='dense'`` and ``pct=True``
+- Bug in :func:`.GroupBy.ffill` and :func:`.GroupBy.bfill` where the fill within a grouping would not always be applied as intended due to the implementations' use of a non-stable sort (:issue:`21207`)
+- Bug in :func:`.GroupBy.rank` where results did not scale to 100% when specifying ``method='dense'`` and ``pct=True``
- Bug in :func:`pandas.DataFrame.rolling` and :func:`pandas.Series.rolling` which incorrectly accepted a 0 window size rather than raising (:issue:`21286`)
**Data-type specific**
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
index 73a523b14f9f7..cb12962256a55 100644
--- a/doc/source/whatsnew/v0.24.0.rst
+++ b/doc/source/whatsnew/v0.24.0.rst
@@ -286,6 +286,7 @@ value. (:issue:`17054`)
.. ipython:: python
+ from io import StringIO
result = pd.read_html(StringIO("""
@@ -334,7 +335,7 @@ convenient way to apply users' predefined styling functions, and can help reduce
df.style.pipe(format_and_align).set_caption('Summary of results.')
Similar methods already exist for other classes in pandas, including :meth:`DataFrame.pipe`,
-:meth:`GroupBy.pipe() `, and :meth:`Resampler.pipe() `.
+:meth:`GroupBy.pipe() <.GroupBy.pipe>`, and :meth:`Resampler.pipe() <.Resampler.pipe>`.
.. _whatsnew_0240.enhancements.rename_axis:
@@ -404,7 +405,7 @@ Other enhancements
now support an ``ambiguous`` argument for handling datetimes that are rounded to ambiguous times (:issue:`18946`)
and a ``nonexistent`` argument for handling datetimes that are rounded to nonexistent times. See :ref:`timeseries.timezone_nonexistent` (:issue:`22647`)
- The result of :meth:`~DataFrame.resample` is now iterable similar to ``groupby()`` (:issue:`15314`).
-- :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`pandas.core.resample.Resampler.quantile` (:issue:`15023`).
+- :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`.Resampler.quantile` (:issue:`15023`).
- :meth:`DataFrame.resample` and :meth:`Series.resample` with a :class:`PeriodIndex` will now respect the ``base`` argument in the same fashion as with a :class:`DatetimeIndex`. (:issue:`23882`)
- :meth:`pandas.api.types.is_list_like` has gained a keyword ``allow_sets`` which is ``True`` by default; if ``False``,
all instances of ``set`` will not be considered "list-like" anymore (:issue:`23061`)
@@ -556,7 +557,7 @@ You must pass in the ``line_terminator`` explicitly, even in this case.
.. _whatsnew_0240.bug_fixes.nan_with_str_dtype:
-Proper handling of ``np.NaN`` in a string data-typed column with the Python engine
+Proper handling of ``np.nan`` in a string data-typed column with the Python engine
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
There was a bug in :func:`read_excel` and :func:`read_csv` with the Python
@@ -1377,18 +1378,22 @@ the object's ``freq`` attribute (:issue:`21939`, :issue:`23878`).
*New behavior*:
-.. ipython:: python
- :okexcept:
- :okwarning:
+.. code-block:: ipython
+
+ In [108]: ts = pd.Timestamp('1994-05-06 12:15:16', freq=pd.offsets.Hour())
+
+ In [109]: ts + 2 * ts.freq
+ Out[109]: Timestamp('1994-05-06 14:15:16', freq='H')
+
+ In [110]: tdi = pd.timedelta_range('1D', periods=2)
- ts = pd.Timestamp('1994-05-06 12:15:16', freq=pd.offsets.Hour())
- ts + 2 * ts.freq
+ In [111]: tdi - np.array([2 * tdi.freq, 1 * tdi.freq])
+ Out[111]: TimedeltaIndex(['-1 days', '1 days'], dtype='timedelta64[ns]', freq=None)
- tdi = pd.timedelta_range('1D', periods=2)
- tdi - np.array([2 * tdi.freq, 1 * tdi.freq])
+ In [112]: dti = pd.date_range('2001-01-01', periods=2, freq='7D')
- dti = pd.date_range('2001-01-01', periods=2, freq='7D')
- dti + pd.Index([1 * dti.freq, 2 * dti.freq])
+ In [113]: dti + pd.Index([1 * dti.freq, 2 * dti.freq])
+ Out[113]: DatetimeIndex(['2001-01-08', '2001-01-22'], dtype='datetime64[ns]', freq=None)
.. _whatsnew_0240.deprecations.integer_tz:
@@ -1544,7 +1549,7 @@ Performance improvements
shows similar speed improvements as above (:issue:`21659`)
- Improved performance of :meth:`CategoricalIndex.equals` when comparing to another :class:`CategoricalIndex` (:issue:`24023`)
- Improved performance of :func:`Series.describe` in case of numeric dtypes (:issue:`21274`)
-- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`)
+- Improved performance of :func:`.GroupBy.rank` when dealing with tied rankings (:issue:`21237`)
- Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`)
- Improved performance of :meth:`Series.at` and :meth:`Index.get_value` for Extension Arrays values (e.g. :class:`Categorical`) (:issue:`24204`)
- Improved performance of membership checks in :class:`Categorical` and :class:`CategoricalIndex`
@@ -1852,28 +1857,28 @@ Plotting
GroupBy/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
-- Bug in :func:`pandas.core.window.Rolling.min` and :func:`pandas.core.window.Rolling.max` with ``closed='left'``, a datetime-like index and only one entry in the series leading to segfault (:issue:`24718`)
-- Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` with ``as_index=False`` leading to the loss of timezone information (:issue:`15884`)
+- Bug in :func:`.Rolling.min` and :func:`.Rolling.max` with ``closed='left'``, a datetime-like index and only one entry in the series leading to segfault (:issue:`24718`)
+- Bug in :func:`.GroupBy.first` and :func:`.GroupBy.last` with ``as_index=False`` leading to the loss of timezone information (:issue:`15884`)
- Bug in :meth:`DataFrame.resample` when downsampling across a DST boundary (:issue:`8531`)
- Bug in date anchoring for :meth:`DataFrame.resample` with offset :class:`Day` when n > 1 (:issue:`24127`)
-- Bug where ``ValueError`` is wrongly raised when calling :func:`~pandas.core.groupby.SeriesGroupBy.count` method of a
+- Bug where ``ValueError`` is wrongly raised when calling :func:`.SeriesGroupBy.count` method of a
``SeriesGroupBy`` when the grouping variable only contains NaNs and numpy version < 1.13 (:issue:`21956`).
-- Multiple bugs in :func:`pandas.core.window.Rolling.min` with ``closed='left'`` and a
+- Multiple bugs in :func:`.Rolling.min` with ``closed='left'`` and a
datetime-like index leading to incorrect results and also segfault. (:issue:`21704`)
-- Bug in :meth:`pandas.core.resample.Resampler.apply` when passing positional arguments to applied func (:issue:`14615`).
+- Bug in :meth:`.Resampler.apply` when passing positional arguments to applied func (:issue:`14615`).
- Bug in :meth:`Series.resample` when passing ``numpy.timedelta64`` to ``loffset`` kwarg (:issue:`7687`).
-- Bug in :meth:`pandas.core.resample.Resampler.asfreq` when frequency of ``TimedeltaIndex`` is a subperiod of a new frequency (:issue:`13022`).
-- Bug in :meth:`pandas.core.groupby.SeriesGroupBy.mean` when values were integral but could not fit inside of int64, overflowing instead. (:issue:`22487`)
-- :func:`pandas.core.groupby.RollingGroupby.agg` and :func:`pandas.core.groupby.ExpandingGroupby.agg` now support multiple aggregation functions as parameters (:issue:`15072`)
+- Bug in :meth:`.Resampler.asfreq` when frequency of ``TimedeltaIndex`` is a subperiod of a new frequency (:issue:`13022`).
+- Bug in :meth:`.SeriesGroupBy.mean` when values were integral but could not fit inside of int64, overflowing instead. (:issue:`22487`)
+- :func:`.RollingGroupby.agg` and :func:`.ExpandingGroupby.agg` now support multiple aggregation functions as parameters (:issue:`15072`)
- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` when resampling by a weekly offset (``'W'``) across a DST transition (:issue:`9119`, :issue:`21459`)
- Bug in :meth:`DataFrame.expanding` in which the ``axis`` argument was not being respected during aggregations (:issue:`23372`)
-- Bug in :meth:`pandas.core.groupby.GroupBy.transform` which caused missing values when the input function can accept a :class:`DataFrame` but renames it (:issue:`23455`).
-- Bug in :func:`pandas.core.groupby.GroupBy.nth` where column order was not always preserved (:issue:`20760`)
-- Bug in :meth:`pandas.core.groupby.GroupBy.rank` with ``method='dense'`` and ``pct=True`` when a group has only one member would raise a ``ZeroDivisionError`` (:issue:`23666`).
-- Calling :meth:`pandas.core.groupby.GroupBy.rank` with empty groups and ``pct=True`` was raising a ``ZeroDivisionError`` (:issue:`22519`)
+- Bug in :meth:`.GroupBy.transform` which caused missing values when the input function can accept a :class:`DataFrame` but renames it (:issue:`23455`).
+- Bug in :func:`.GroupBy.nth` where column order was not always preserved (:issue:`20760`)
+- Bug in :meth:`.GroupBy.rank` with ``method='dense'`` and ``pct=True`` when a group has only one member would raise a ``ZeroDivisionError`` (:issue:`23666`).
+- Calling :meth:`.GroupBy.rank` with empty groups and ``pct=True`` was raising a ``ZeroDivisionError`` (:issue:`22519`)
- Bug in :meth:`DataFrame.resample` when resampling ``NaT`` in ``TimeDeltaIndex`` (:issue:`13223`).
- Bug in :meth:`DataFrame.groupby` did not respect the ``observed`` argument when selecting a column and instead always used ``observed=False`` (:issue:`23970`)
-- Bug in :func:`pandas.core.groupby.SeriesGroupBy.pct_change` or :func:`pandas.core.groupby.DataFrameGroupBy.pct_change` would previously work across groups when calculating the percent change, where it now correctly works per group (:issue:`21200`, :issue:`21235`).
+- Bug in :func:`.SeriesGroupBy.pct_change` or :func:`.DataFrameGroupBy.pct_change` would previously work across groups when calculating the percent change, where it now correctly works per group (:issue:`21200`, :issue:`21235`).
- Bug preventing hash table creation with very large number (2^32) of rows (:issue:`22805`)
- Bug in groupby when grouping on categorical causes ``ValueError`` and incorrect grouping if ``observed=True`` and ``nan`` is present in categorical column (:issue:`24740`, :issue:`21151`).
@@ -1887,7 +1892,7 @@ Reshaping
- Bug in :meth:`DataFrame.where` with an empty DataFrame and empty ``cond`` having non-bool dtype (:issue:`21947`)
- Bug in :meth:`Series.mask` and :meth:`DataFrame.mask` with ``list`` conditionals (:issue:`21891`)
- Bug in :meth:`DataFrame.replace` raises RecursionError when converting OutOfBounds ``datetime64[ns, tz]`` (:issue:`20380`)
-- :func:`pandas.core.groupby.GroupBy.rank` now raises a ``ValueError`` when an invalid value is passed for argument ``na_option`` (:issue:`22124`)
+- :func:`.GroupBy.rank` now raises a ``ValueError`` when an invalid value is passed for argument ``na_option`` (:issue:`22124`)
- Bug in :func:`get_dummies` with Unicode attributes in Python 2 (:issue:`22084`)
- Bug in :meth:`DataFrame.replace` raises ``RecursionError`` when replacing empty lists (:issue:`22083`)
- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when dict is used as the ``to_replace`` value and one key in the dict is another key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`)
@@ -1905,7 +1910,7 @@ Reshaping
- Bug in :func:`pandas.concat` when joining ``Series`` datetimetz with ``Series`` category would lose timezone (:issue:`23816`)
- Bug in :meth:`DataFrame.join` when joining on partial MultiIndex would drop names (:issue:`20452`).
- :meth:`DataFrame.nlargest` and :meth:`DataFrame.nsmallest` now return the correct n values when keep != 'all' also when tied on the first columns (:issue:`22752`)
-- Constructing a DataFrame with an index argument that wasn't already an instance of :class:`~pandas.core.Index` was broken (:issue:`22227`).
+- Constructing a DataFrame with an index argument that wasn't already an instance of :class:`.Index` was broken (:issue:`22227`).
- Bug in :class:`DataFrame` that prevented list subclasses from being used in construction (:issue:`21226`)
- Bug in :func:`DataFrame.unstack` and :func:`DataFrame.pivot_table` returning a misleading error message when the resulting DataFrame has more elements than int32 can handle. Now, the error message is improved, pointing towards the actual problem (:issue:`20601`)
- Bug in :func:`DataFrame.unstack` where a ``ValueError`` was raised when unstacking timezone aware values (:issue:`18338`)
diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst
index 36684d465373c..9a8c2ee5d00fa 100644
--- a/doc/source/whatsnew/v0.24.2.rst
+++ b/doc/source/whatsnew/v0.24.2.rst
@@ -54,7 +54,7 @@ Bug fixes
**Reshaping**
-- Bug in :meth:`~pandas.core.groupby.GroupBy.transform` where applying a function to a timezone aware column would return a timezone naive result (:issue:`24198`)
+- Bug in :meth:`.GroupBy.transform` where applying a function to a timezone aware column would return a timezone naive result (:issue:`24198`)
- Bug in :func:`DataFrame.join` when joining on a timezone aware :class:`DatetimeIndex` (:issue:`23931`)
**Visualization**
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index c0f169aa6251f..5cf5623f73036 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -89,7 +89,7 @@ GroupBy aggregation with multiple lambdas
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
You can now provide multiple lambda functions to a list-like aggregation in
-:class:`pandas.core.groupby.GroupBy.agg` (:issue:`26430`).
+:class:`.GroupBy.agg` (:issue:`26430`).
.. ipython:: python
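As an editorial aside (not part of the patch): the whatsnew's own ipython example is elided from this hunk, so a minimal hedged sketch of the multi-lambda aggregation described above, assuming a small toy frame, is:

.. code-block:: python

   import pandas as pd

   df = pd.DataFrame({"kind": ["cat", "dog", "cat", "dog"],
                      "height": [9.1, 6.0, 9.5, 34.0]})

   # Two lambdas in one list-like aggregation; pandas deduplicates the
   # resulting column names as <lambda_0> and <lambda_1>.
   result = df.groupby("kind").agg({"height": [lambda s: s.min(),
                                               lambda s: s.max() - s.min()]})
   print(result)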
@@ -225,7 +225,7 @@ Other enhancements
- :class:`datetime.timezone` objects are now supported as arguments to timezone methods and constructors (:issue:`25065`)
- :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`)
- :func:`merge_asof` now gives a more clear error message when merge keys are categoricals that are not equal (:issue:`26136`)
-- :meth:`pandas.core.window.Rolling` supports exponential (or Poisson) window type (:issue:`21303`)
+- :meth:`.Rolling` supports exponential (or Poisson) window type (:issue:`21303`)
- Error message for missing required imports now includes the original import error's text (:issue:`23868`)
- :class:`DatetimeIndex` and :class:`TimedeltaIndex` now have a ``mean`` method (:issue:`24757`)
- :meth:`DataFrame.describe` now formats integer percentiles without decimal point (:issue:`26660`)
@@ -311,7 +311,7 @@ would be reassigned as -1. (:issue:`19387`)
``GroupBy.apply`` on ``DataFrame`` evaluates first group only once
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-The implementation of :meth:`DataFrameGroupBy.apply() `
+The implementation of :meth:`.DataFrameGroupBy.apply`
previously evaluated the supplied function consistently twice on the first group
to infer if it is safe to use a fast code path. Particularly for functions with
side effects, this was an undesired behavior and may have led to surprises. (:issue:`2936`, :issue:`2656`, :issue:`7739`, :issue:`10519`, :issue:`12155`, :issue:`20084`, :issue:`21417`)
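To make the side-effect point concrete, a minimal sketch (not part of the patch; the frame and ``func`` are illustrative) is:

.. code-block:: python

   import pandas as pd

   df = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]})

   def func(group):
       # A visible side effect: under the behavior described above, each
       # group name should print exactly once, since the first group is no
       # longer evaluated twice to infer a fast path.
       print(f"processing group {group.name}")
       return group

   df.groupby("a")[["b"]].apply(func)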
@@ -493,7 +493,7 @@ values are coerced to floating point, which may result in loss of precision. See
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The methods ``ffill``, ``bfill``, ``pad`` and ``backfill`` of
-:class:`DataFrameGroupBy `
+:class:`.DataFrameGroupBy`
previously included the group labels in the return value, which was
inconsistent with other groupby transforms. Now only the filled values
are returned. (:issue:`21521`)
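A small hedged sketch of the trimmed return value (assuming a toy frame with one missing value per group; not part of the patch):

.. code-block:: python

   import numpy as np
   import pandas as pd

   df = pd.DataFrame({"key": ["a", "a", "b", "b"],
                      "val": [1.0, np.nan, 3.0, np.nan]})

   # Group-wise forward fill: only the filled "val" column is returned,
   # not the "key" group labels, matching other groupby transforms.
   print(df.groupby("key").ffill())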
@@ -885,8 +885,8 @@ Other API changes
- :meth:`Timestamp.strptime` will now raise a ``NotImplementedError`` (:issue:`25016`)
- Comparing :class:`Timestamp` with unsupported objects now returns :py:obj:`NotImplemented` instead of raising ``TypeError``. This implies that unsupported rich comparisons are delegated to the other object, and are now consistent with Python 3 behavior for ``datetime`` objects (:issue:`24011`)
- Bug in :meth:`DatetimeIndex.snap` which didn't preserve the ``name`` of the input :class:`Index` (:issue:`25575`)
-- The ``arg`` argument in :meth:`pandas.core.groupby.DataFrameGroupBy.agg` has been renamed to ``func`` (:issue:`26089`)
-- The ``arg`` argument in :meth:`pandas.core.window._Window.aggregate` has been renamed to ``func`` (:issue:`26372`)
+- The ``arg`` argument in :meth:`.DataFrameGroupBy.agg` has been renamed to ``func`` (:issue:`26089`)
+- The ``arg`` argument in :meth:`.Window.aggregate` has been renamed to ``func`` (:issue:`26372`)
- Most pandas classes had a ``__bytes__`` method, which was used for getting a python2-style bytestring representation of the object. This method has been removed as a part of dropping Python2 (:issue:`26447`)
- The ``.str``-accessor has been disabled for 1-level :class:`MultiIndex`, use :meth:`MultiIndex.to_flat_index` if necessary (:issue:`23679`)
- Removed support of gtk package for clipboards (:issue:`26563`)
@@ -991,7 +991,7 @@ Performance improvements
- :meth:`DataFrame.to_stata()` is now faster when outputting data with any string or non-native endian columns (:issue:`25045`)
- Improved performance of :meth:`Series.searchsorted`. The speedup is especially large when the dtype is
int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`)
-- Improved performance of :meth:`pandas.core.groupby.GroupBy.quantile` (:issue:`20405`)
+- Improved performance of :meth:`.GroupBy.quantile` (:issue:`20405`)
- Improved performance of slicing and other selected operation on a :class:`RangeIndex` (:issue:`26565`, :issue:`26617`, :issue:`26722`)
- :class:`RangeIndex` now performs standard lookup without instantiating an actual hashtable, hence saving memory (:issue:`16685`)
- Improved performance of :meth:`read_csv` by faster tokenizing and faster parsing of small float numbers (:issue:`25784`)
@@ -1117,7 +1117,7 @@ Indexing
- Bug in :meth:`DataFrame.loc` and :meth:`Series.loc` where ``KeyError`` was not raised for a ``MultiIndex`` when the key was less than or equal to the number of levels in the :class:`MultiIndex` (:issue:`14885`).
- Bug in which :meth:`DataFrame.append` produced an erroneous warning indicating that a ``KeyError`` will be thrown in the future when the data to be appended contains new columns (:issue:`22252`).
- Bug in which :meth:`DataFrame.to_csv` caused a segfault for a reindexed data frame, when the indices were single-level :class:`MultiIndex` (:issue:`26303`).
-- Fixed bug where assigning a :class:`arrays.PandasArray` to a :class:`pandas.core.frame.DataFrame` would raise error (:issue:`26390`)
+- Fixed bug where assigning a :class:`arrays.PandasArray` to a :class:`.DataFrame` would raise error (:issue:`26390`)
- Allow keyword arguments for callable local reference used in the :meth:`DataFrame.query` string (:issue:`26426`)
- Fixed a ``KeyError`` when indexing a :class:`MultiIndex` level with a list containing exactly one label, which is missing (:issue:`27148`)
- Bug which produced ``AttributeError`` on partial matching :class:`Timestamp` in a :class:`MultiIndex` (:issue:`26944`)
@@ -1190,28 +1190,28 @@ Plotting
GroupBy/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
-- Bug in :meth:`pandas.core.resample.Resampler.agg` with a timezone aware index where ``OverflowError`` would raise when passing a list of functions (:issue:`22660`)
-- Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.nunique` in which the names of column levels were lost (:issue:`23222`)
-- Bug in :func:`pandas.core.groupby.GroupBy.agg` when applying an aggregation function to timezone aware data (:issue:`23683`)
-- Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` where timezone information would be dropped (:issue:`21603`)
-- Bug in :func:`pandas.core.groupby.GroupBy.size` when grouping only NA values (:issue:`23050`)
+- Bug in :meth:`.Resampler.agg` with a timezone aware index where ``OverflowError`` would raise when passing a list of functions (:issue:`22660`)
+- Bug in :meth:`.DataFrameGroupBy.nunique` in which the names of column levels were lost (:issue:`23222`)
+- Bug in :func:`.GroupBy.agg` when applying an aggregation function to timezone aware data (:issue:`23683`)
+- Bug in :func:`.GroupBy.first` and :func:`.GroupBy.last` where timezone information would be dropped (:issue:`21603`)
+- Bug in :func:`.GroupBy.size` when grouping only NA values (:issue:`23050`)
- Bug in :func:`Series.groupby` where ``observed`` kwarg was previously ignored (:issue:`24880`)
- Bug in :func:`Series.groupby` where using ``groupby`` with a :class:`MultiIndex` Series with a list of labels equal to the length of the series caused incorrect grouping (:issue:`25704`)
- Ensured that ordering of outputs in ``groupby`` aggregation functions is consistent across all versions of Python (:issue:`25692`)
- Ensured that result group order is correct when grouping on an ordered ``Categorical`` and specifying ``observed=True`` (:issue:`25871`, :issue:`25167`)
-- Bug in :meth:`pandas.core.window.Rolling.min` and :meth:`pandas.core.window.Rolling.max` that caused a memory leak (:issue:`25893`)
-- Bug in :meth:`pandas.core.window.Rolling.count` and ``pandas.core.window.Expanding.count`` was previously ignoring the ``axis`` keyword (:issue:`13503`)
-- Bug in :meth:`pandas.core.groupby.GroupBy.idxmax` and :meth:`pandas.core.groupby.GroupBy.idxmin` with datetime column would return incorrect dtype (:issue:`25444`, :issue:`15306`)
-- Bug in :meth:`pandas.core.groupby.GroupBy.cumsum`, :meth:`pandas.core.groupby.GroupBy.cumprod`, :meth:`pandas.core.groupby.GroupBy.cummin` and :meth:`pandas.core.groupby.GroupBy.cummax` with categorical column having absent categories, would return incorrect result or segfault (:issue:`16771`)
-- Bug in :meth:`pandas.core.groupby.GroupBy.nth` where NA values in the grouping would return incorrect results (:issue:`26011`)
-- Bug in :meth:`pandas.core.groupby.SeriesGroupBy.transform` where transforming an empty group would raise a ``ValueError`` (:issue:`26208`)
-- Bug in :meth:`pandas.core.frame.DataFrame.groupby` where passing a :class:`pandas.core.groupby.grouper.Grouper` would return incorrect groups when using the ``.groups`` accessor (:issue:`26326`)
-- Bug in :meth:`pandas.core.groupby.GroupBy.agg` where incorrect results are returned for uint64 columns. (:issue:`26310`)
-- Bug in :meth:`pandas.core.window.Rolling.median` and :meth:`pandas.core.window.Rolling.quantile` where MemoryError is raised with empty window (:issue:`26005`)
-- Bug in :meth:`pandas.core.window.Rolling.median` and :meth:`pandas.core.window.Rolling.quantile` where incorrect results are returned with ``closed='left'`` and ``closed='neither'`` (:issue:`26005`)
-- Improved :class:`pandas.core.window.Rolling`, :class:`pandas.core.window.Window` and :class:`pandas.core.window.ExponentialMovingWindow` functions to exclude nuisance columns from results instead of raising errors and raise a ``DataError`` only if all columns are nuisance (:issue:`12537`)
-- Bug in :meth:`pandas.core.window.Rolling.max` and :meth:`pandas.core.window.Rolling.min` where incorrect results are returned with an empty variable window (:issue:`26005`)
-- Raise a helpful exception when an unsupported weighted window function is used as an argument of :meth:`pandas.core.window.Window.aggregate` (:issue:`26597`)
+- Bug in :meth:`.Rolling.min` and :meth:`.Rolling.max` that caused a memory leak (:issue:`25893`)
+- Bug in :meth:`.Rolling.count` and ``.Expanding.count`` was previously ignoring the ``axis`` keyword (:issue:`13503`)
+- Bug in :meth:`.GroupBy.idxmax` and :meth:`.GroupBy.idxmin` with datetime column would return incorrect dtype (:issue:`25444`, :issue:`15306`)
+- Bug in :meth:`.GroupBy.cumsum`, :meth:`.GroupBy.cumprod`, :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` with categorical column having absent categories, would return incorrect result or segfault (:issue:`16771`)
+- Bug in :meth:`.GroupBy.nth` where NA values in the grouping would return incorrect results (:issue:`26011`)
+- Bug in :meth:`.SeriesGroupBy.transform` where transforming an empty group would raise a ``ValueError`` (:issue:`26208`)
+- Bug in :meth:`.DataFrame.groupby` where passing a :class:`.Grouper` would return incorrect groups when using the ``.groups`` accessor (:issue:`26326`)
+- Bug in :meth:`.GroupBy.agg` where incorrect results are returned for uint64 columns. (:issue:`26310`)
+- Bug in :meth:`.Rolling.median` and :meth:`.Rolling.quantile` where MemoryError is raised with empty window (:issue:`26005`)
+- Bug in :meth:`.Rolling.median` and :meth:`.Rolling.quantile` where incorrect results are returned with ``closed='left'`` and ``closed='neither'`` (:issue:`26005`)
+- Improved :class:`.Rolling`, :class:`.Window` and :class:`.ExponentialMovingWindow` functions to exclude nuisance columns from results instead of raising errors and raise a ``DataError`` only if all columns are nuisance (:issue:`12537`)
+- Bug in :meth:`.Rolling.max` and :meth:`.Rolling.min` where incorrect results are returned with an empty variable window (:issue:`26005`)
+- Raise a helpful exception when an unsupported weighted window function is used as an argument of :meth:`.Window.aggregate` (:issue:`26597`)
Reshaping
^^^^^^^^^
diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst
index cc24ba5d6557c..67017d7c9fb29 100644
--- a/doc/source/whatsnew/v0.25.1.rst
+++ b/doc/source/whatsnew/v0.25.1.rst
@@ -86,10 +86,10 @@ GroupBy/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
- Fixed regression in :meth:`pandas.core.groupby.DataFrameGroupBy.quantile` raising when multiple quantiles are given (:issue:`27526`)
-- Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.transform` where applying a timezone conversion lambda function would drop timezone information (:issue:`27496`)
-- Bug in :meth:`pandas.core.groupby.GroupBy.nth` where ``observed=False`` was being ignored for Categorical groupers (:issue:`26385`)
+- Bug in :meth:`.DataFrameGroupBy.transform` where applying a timezone conversion lambda function would drop timezone information (:issue:`27496`)
+- Bug in :meth:`.GroupBy.nth` where ``observed=False`` was being ignored for Categorical groupers (:issue:`26385`)
- Bug in windowing over read-only arrays (:issue:`27766`)
-- Fixed segfault in ``pandas.core.groupby.DataFrameGroupBy.quantile`` when an invalid quantile was passed (:issue:`27470`)
+- Fixed segfault in ``.DataFrameGroupBy.quantile`` when an invalid quantile was passed (:issue:`27470`)
Reshaping
^^^^^^^^^
diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst
index ab6aaebe4ed06..5bd00f17bd3c5 100644
--- a/doc/source/whatsnew/v0.25.2.rst
+++ b/doc/source/whatsnew/v0.25.2.rst
@@ -31,8 +31,8 @@ IO
GroupBy/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
-- Bug incorrectly raising an ``IndexError`` when passing a list of quantiles to :meth:`pandas.core.groupby.DataFrameGroupBy.quantile` (:issue:`28113`).
-- Bug in :meth:`pandas.core.groupby.GroupBy.shift`, :meth:`pandas.core.groupby.GroupBy.bfill` and :meth:`pandas.core.groupby.GroupBy.ffill` where timezone information would be dropped (:issue:`19995`, :issue:`27992`)
+- Bug incorrectly raising an ``IndexError`` when passing a list of quantiles to :meth:`.DataFrameGroupBy.quantile` (:issue:`28113`).
+- Bug in :meth:`.GroupBy.shift`, :meth:`.GroupBy.bfill` and :meth:`.GroupBy.ffill` where timezone information would be dropped (:issue:`19995`, :issue:`27992`)
Other
^^^^^
diff --git a/doc/source/whatsnew/v0.4.x.rst b/doc/source/whatsnew/v0.4.x.rst
index 0ed7bb396674e..83f6a6907f33c 100644
--- a/doc/source/whatsnew/v0.4.x.rst
+++ b/doc/source/whatsnew/v0.4.x.rst
@@ -11,8 +11,7 @@ New features
- Added Python 3 support using 2to3 (:issue:`200`)
- :ref:`Added ` ``name`` attribute to ``Series``, now
prints as part of ``Series.__repr__``
-- :ref:`Added ` instance methods ``isnull`` and ``notnull`` to
- Series (:issue:`209`, :issue:`203`)
+- Added :meth:`Series.isnull` and :meth:`Series.notnull` (:issue:`209`, :issue:`203`)
- :ref:`Added ` ``Series.align`` method for aligning two series
with choice of join method (ENH56_)
- :ref:`Added ` method ``get_level_values`` to
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 5e4559303a8b7..94a8ee7cd1a5d 100755
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -242,7 +242,7 @@ Other enhancements
- :func:`to_parquet` now appropriately handles the ``schema`` argument for user defined schemas in the pyarrow engine. (:issue:`30270`)
- :meth:`DataFrame.to_json` now accepts an ``indent`` integer argument to enable pretty printing of JSON output (:issue:`12004`)
- :meth:`read_stata` can read Stata 119 dta files. (:issue:`28250`)
-- Implemented :meth:`pandas.core.window.Window.var` and :meth:`pandas.core.window.Window.std` functions (:issue:`26597`)
+- Implemented :meth:`.Window.var` and :meth:`.Window.std` functions (:issue:`26597`)
- Added ``encoding`` argument to :meth:`DataFrame.to_string` for non-ascii text (:issue:`28766`)
- Added ``encoding`` argument to :func:`DataFrame.to_html` for non-ascii text (:issue:`28663`)
- :meth:`Styler.background_gradient` now accepts ``vmin`` and ``vmax`` arguments (:issue:`12145`)
@@ -987,7 +987,7 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more.
- The 'outer' method on Numpy ufuncs, e.g. ``np.subtract.outer`` operating on :class:`Series` objects is no longer supported, and will raise ``NotImplementedError`` (:issue:`27198`)
- Removed ``Series.get_dtype_counts`` and ``DataFrame.get_dtype_counts`` (:issue:`27145`)
- Changed the default "fill_value" argument in :meth:`Categorical.take` from ``True`` to ``False`` (:issue:`20841`)
-- Changed the default value for the ``raw`` argument in :func:`Series.rolling().apply() `, :func:`DataFrame.rolling().apply() `, :func:`Series.expanding().apply() `, and :func:`DataFrame.expanding().apply() ` from ``None`` to ``False`` (:issue:`20584`)
+- Changed the default value for the ``raw`` argument in :func:`Series.rolling().apply() <.Rolling.apply>`, :func:`DataFrame.rolling().apply() <.Rolling.apply>`, :func:`Series.expanding().apply() <.Expanding.apply>`, and :func:`DataFrame.expanding().apply() <.Expanding.apply>` from ``None`` to ``False`` (:issue:`20584`)
- Removed deprecated behavior of :meth:`Series.argmin` and :meth:`Series.argmax`, use :meth:`Series.idxmin` and :meth:`Series.idxmax` for the old behavior (:issue:`16955`)
- Passing a tz-aware ``datetime.datetime`` or :class:`Timestamp` into the :class:`Timestamp` constructor with the ``tz`` argument now raises a ``ValueError`` (:issue:`23621`)
- Removed ``Series.base``, ``Index.base``, ``Categorical.base``, ``Series.flags``, ``Index.flags``, ``PeriodArray.flags``, ``Series.strides``, ``Index.strides``, ``Series.itemsize``, ``Index.itemsize``, ``Series.data``, ``Index.data`` (:issue:`20721`)
@@ -1079,7 +1079,7 @@ Datetimelike
- Bug in masking datetime-like arrays with a boolean mask of an incorrect length not raising an ``IndexError`` (:issue:`30308`)
- Bug in :attr:`Timestamp.resolution` being a property instead of a class attribute (:issue:`29910`)
- Bug in :func:`pandas.to_datetime` when called with ``None`` raising ``TypeError`` instead of returning ``NaT`` (:issue:`30011`)
-- Bug in :func:`pandas.to_datetime` failing for ``deques`` when using ``cache=True`` (the default) (:issue:`29403`)
+- Bug in :func:`pandas.to_datetime` failing for ``deque`` objects when using ``cache=True`` (the default) (:issue:`29403`)
- Bug in :meth:`Series.item` with ``datetime64`` or ``timedelta64`` dtype, :meth:`DatetimeIndex.item`, and :meth:`TimedeltaIndex.item` returning an integer instead of a :class:`Timestamp` or :class:`Timedelta` (:issue:`30175`)
- Bug in :class:`DatetimeIndex` addition when adding a non-optimized :class:`DateOffset` incorrectly dropping timezone information (:issue:`30336`)
- Bug in :meth:`DataFrame.drop` where attempting to drop non-existent values from a DatetimeIndex would yield a confusing error message (:issue:`30399`)
@@ -1217,7 +1217,7 @@ GroupBy/resample/rolling
- Bug in :meth:`core.groupby.DataFrameGroupBy.apply` only showing output from a single group when function returns an :class:`Index` (:issue:`28652`)
- Bug in :meth:`DataFrame.groupby` with multiple groups where an ``IndexError`` would be raised if any group contained all NA values (:issue:`20519`)
-- Bug in :meth:`pandas.core.resample.Resampler.size` and :meth:`pandas.core.resample.Resampler.count` returning wrong dtype when used with an empty :class:`Series` or :class:`DataFrame` (:issue:`28427`)
+- Bug in :meth:`.Resampler.size` and :meth:`.Resampler.count` returning wrong dtype when used with an empty :class:`Series` or :class:`DataFrame` (:issue:`28427`)
- Bug in :meth:`DataFrame.rolling` not allowing for rolling over datetimes when ``axis=1`` (:issue:`28192`)
- Bug in :meth:`DataFrame.rolling` not allowing rolling over multi-index levels (:issue:`15584`).
- Bug in :meth:`DataFrame.rolling` not allowing rolling on monotonic decreasing time indexes (:issue:`19248`).
diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst
index 340fa97a513ad..0a5716f52c836 100644
--- a/doc/source/whatsnew/v1.0.2.rst
+++ b/doc/source/whatsnew/v1.0.2.rst
@@ -19,8 +19,8 @@ Fixed regressions
- Fixed regression in :meth:`.DataFrameGroupBy.agg` and :meth:`.SeriesGroupBy.agg` which were failing on frames with :class:`MultiIndex` columns and a custom function (:issue:`31777`)
- Fixed regression in ``groupby(..).rolling(..).apply()`` (``RollingGroupby``) where the ``raw`` parameter was ignored (:issue:`31754`)
-- Fixed regression in :meth:`rolling(..).corr() ` when using a time offset (:issue:`31789`)
-- Fixed regression in :meth:`groupby(..).nunique() ` which was modifying the original values if ``NaN`` values were present (:issue:`31950`)
+- Fixed regression in :meth:`rolling(..).corr() <.Rolling.corr>` when using a time offset (:issue:`31789`)
+- Fixed regression in :meth:`groupby(..).nunique() <.DataFrameGroupBy.nunique>` which was modifying the original values if ``NaN`` values were present (:issue:`31950`)
- Fixed regression in ``DataFrame.groupby`` raising a ``ValueError`` from an internal operation (:issue:`31802`)
- Fixed regression in :meth:`.DataFrameGroupBy.agg` and :meth:`.SeriesGroupBy.agg` calling a user-provided function an extra time on an empty input (:issue:`31760`)
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index dd566eaab1e75..37d021efddf0b 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -313,9 +313,9 @@ Other enhancements
- :meth:`melt` has gained an ``ignore_index`` (default ``True``) argument that, if set to ``False``, prevents the method from dropping the index (:issue:`17440`).
- :meth:`Series.update` now accepts objects that can be coerced to a :class:`Series`,
such as ``dict`` and ``list``, mirroring the behavior of :meth:`DataFrame.update` (:issue:`33215`)
-- :meth:`~pandas.core.groupby.DataFrameGroupBy.transform` and :meth:`~pandas.core.groupby.DataFrameGroupBy.aggregate` have gained ``engine`` and ``engine_kwargs`` arguments that support executing functions with ``Numba`` (:issue:`32854`, :issue:`33388`)
-- :meth:`~pandas.core.resample.Resampler.interpolate` now supports SciPy interpolation method :class:`scipy.interpolate.CubicSpline` as method ``cubicspline`` (:issue:`33670`)
-- :class:`~pandas.core.groupby.DataFrameGroupBy` and :class:`~pandas.core.groupby.SeriesGroupBy` now implement the ``sample`` method for doing random sampling within groups (:issue:`31775`)
+- :meth:`.DataFrameGroupBy.transform` and :meth:`.DataFrameGroupBy.aggregate` have gained ``engine`` and ``engine_kwargs`` arguments that support executing functions with ``Numba`` (:issue:`32854`, :issue:`33388`)
+- :meth:`.Resampler.interpolate` now supports SciPy interpolation method :class:`scipy.interpolate.CubicSpline` as method ``cubicspline`` (:issue:`33670`)
+- :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` now implement the ``sample`` method for doing random sampling within groups (:issue:`31775`)
- :meth:`DataFrame.to_numpy` now supports the ``na_value`` keyword to control the NA sentinel in the output array (:issue:`33820`)
- Added :class:`api.extension.ExtensionArray.equals` to the extension array interface, similar to :meth:`Series.equals` (:issue:`27081`)
- The minimum supported dta version has increased to 105 in :func:`read_stata` and :class:`~pandas.io.stata.StataReader` (:issue:`26667`).
@@ -327,10 +327,10 @@ Other enhancements
and :class:`~pandas.io.stata.StataWriterUTF8` (:issue:`26599`).
- :meth:`HDFStore.put` now accepts a ``track_times`` parameter. This parameter is passed to the ``create_table`` method of ``PyTables`` (:issue:`32682`).
- :meth:`Series.plot` and :meth:`DataFrame.plot` now accepts ``xlabel`` and ``ylabel`` parameters to present labels on x and y axis (:issue:`9093`).
-- Made :class:`pandas.core.window.rolling.Rolling` and :class:`pandas.core.window.expanding.Expanding` iterable(:issue:`11704`)
+- Made :class:`.Rolling` and :class:`.Expanding` iterable (:issue:`11704`)
- Made ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`).
- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now accept an ``errors`` argument (:issue:`22610`)
-- :meth:`~pandas.core.groupby.DataFrameGroupBy.groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`).
+- :meth:`.DataFrameGroupBy.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`).
- :func:`read_json` now accepts an ``nrows`` parameter. (:issue:`33916`).
- :meth:`DataFrame.hist`, :meth:`Series.hist`, :meth:`core.groupby.DataFrameGroupBy.hist`, and :meth:`core.groupby.SeriesGroupBy.hist` have gained the ``legend`` argument. Set to True to show a legend in the histogram. (:issue:`6279`)
- :func:`concat` and :meth:`~DataFrame.append` now preserve extension dtypes, for example
@@ -344,7 +344,7 @@ Other enhancements
- :meth:`~Series.explode` now accepts ``ignore_index`` to reset the index, similar to :meth:`pd.concat` or :meth:`DataFrame.sort_values` (:issue:`34932`).
- :meth:`DataFrame.to_markdown` and :meth:`Series.to_markdown` now accept ``index`` argument as an alias for tabulate's ``showindex`` (:issue:`32667`)
- :meth:`read_csv` now accepts string values like "0", "0.0", "1", "1.0" as convertible to the nullable Boolean dtype (:issue:`34859`)
-- :class:`pandas.core.window.ExponentialMovingWindow` now supports a ``times`` argument that allows ``mean`` to be calculated with observations spaced by the timestamps in ``times`` (:issue:`34839`)
+- :class:`.ExponentialMovingWindow` now supports a ``times`` argument that allows ``mean`` to be calculated with observations spaced by the timestamps in ``times`` (:issue:`34839`)
- :meth:`DataFrame.agg` and :meth:`Series.agg` now accept named aggregation for renaming the output columns/indexes. (:issue:`26513`)
- ``compute.use_numba`` now exists as a configuration option that utilizes the numba engine when available (:issue:`33966`, :issue:`35374`)
- :meth:`Series.plot` now supports asymmetric error bars. Previously, if :meth:`Series.plot` received a "2xN" array with error values for ``yerr`` and/or ``xerr``, the left/lower values (first row) were mirrored, while the right/upper values (second row) were ignored. Now, the first row represents the left/lower error values and the second row the right/upper error values. (:issue:`9536`)
@@ -629,7 +629,7 @@ Using :meth:`DataFrame.groupby` with ``as_index=False`` and the function ``idxma
df.groupby("a", as_index=False).nunique()
-The method :meth:`~pandas.core.groupby.DataFrameGroupBy.size` would previously ignore ``as_index=False``. Now the grouping columns are returned as columns, making the result a :class:`DataFrame` instead of a :class:`Series`. (:issue:`32599`)
+The method :meth:`.DataFrameGroupBy.size` would previously ignore ``as_index=False``. Now the grouping columns are returned as columns, making the result a :class:`DataFrame` instead of a :class:`Series`. (:issue:`32599`)
*Previous behavior*:
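(The whatsnew's own previous/new behavior blocks follow in the file but are elided from this hunk. As a hedged editorial sketch of the new ``as_index=False`` output described above, assuming a tiny frame:)

.. code-block:: python

   import pandas as pd

   df = pd.DataFrame({"a": ["x", "x", "y"], "b": [1, 2, 3]})

   # With as_index=False, size() now returns a DataFrame with the grouping
   # column kept as a regular column alongside the group sizes.
   print(df.groupby("a", as_index=False).size())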
@@ -650,10 +650,10 @@ The method :meth:`~pandas.core.groupby.DataFrameGroupBy.size` would previously i
.. _whatsnew_110.api_breaking.groupby_results_lost_as_index_false:
-:meth:`~pandas.core.groupby.DataFrameGroupby.agg` lost results with ``as_index=False`` when relabeling columns
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+:meth:`.DataFrameGroupBy.agg` lost results with ``as_index=False`` when relabeling columns
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Previously :meth:`~pandas.core.groupby.DataFrameGroupby.agg` lost the result columns, when the ``as_index`` option was
+Previously :meth:`.DataFrameGroupBy.agg` lost the result columns, when the ``as_index`` option was
set to ``False`` and the result columns were relabeled. In this case the result values were replaced with
the previous index (:issue:`32240`).
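For context, a small hedged example (not part of the patch) of column relabeling ("named aggregation") combined with ``as_index=False``, which is the case the fix addresses:

.. code-block:: python

   import pandas as pd

   df = pd.DataFrame({"key": ["x", "x", "y"], "val": [1, 2, 3]})

   # Relabeled aggregation with as_index=False: the grouping column is kept
   # as a regular column and the relabeled result column retains its values.
   out = df.groupby("key", as_index=False).agg(min_val=("val", "min"))
   print(out)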
@@ -878,14 +878,14 @@ Performance improvements
sparse values from ``scipy.sparse`` matrices using the
:meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`,
:issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`).
-- Performance improvement for groupby methods :meth:`~pandas.core.groupby.groupby.Groupby.first`
- and :meth:`~pandas.core.groupby.groupby.Groupby.last` (:issue:`34178`)
+- Performance improvement for groupby methods :meth:`.GroupBy.first`
+  and :meth:`.GroupBy.last` (:issue:`34178`)
- Performance improvement in :func:`factorize` for nullable (integer and Boolean) dtypes (:issue:`33064`).
- Performance improvement when constructing :class:`Categorical` objects (:issue:`33921`)
- Fixed performance regression in :func:`pandas.qcut` and :func:`pandas.cut` (:issue:`33921`)
- Performance improvement in reductions (``sum``, ``prod``, ``min``, ``max``) for nullable (integer and Boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`).
- Performance improvement in arithmetic operations between two :class:`DataFrame` objects (:issue:`32779`)
-- Performance improvement in :class:`pandas.core.groupby.RollingGroupby` (:issue:`34052`)
+- Performance improvement in :class:`.RollingGroupby` (:issue:`34052`)
- Performance improvement in arithmetic operations (``sub``, ``add``, ``mul``, ``div``) for :class:`MultiIndex` (:issue:`34297`)
- Performance improvement in ``DataFrame[bool_indexer]`` when ``bool_indexer`` is a ``list`` (:issue:`33924`)
- Significant performance improvement of :meth:`io.formats.style.Styler.render` with styles added with various ways such as :meth:`io.formats.style.Styler.apply`, :meth:`io.formats.style.Styler.applymap` or :meth:`io.formats.style.Styler.bar` (:issue:`19917`)
diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst
index 77ea67f76f655..176a8b85e76b4 100644
--- a/doc/source/whatsnew/v1.1.1.rst
+++ b/doc/source/whatsnew/v1.1.1.rst
@@ -30,7 +30,7 @@ Fixed regressions
- Fixed regression where :func:`pandas.merge_asof` would raise a ``UnboundLocalError`` when ``left_index``, ``right_index`` and ``tolerance`` were set (:issue:`35558`)
- Fixed regression in ``.groupby(..).rolling(..)`` where a custom ``BaseIndexer`` would be ignored (:issue:`35557`)
- Fixed regression in :meth:`DataFrame.replace` and :meth:`Series.replace` where compiled regular expressions would be ignored during replacement (:issue:`35680`)
-- Fixed regression in :meth:`~pandas.core.groupby.DataFrameGroupBy.aggregate` where a list of functions would produce the wrong results if at least one of the functions did not aggregate (:issue:`35490`)
+- Fixed regression in :meth:`.DataFrameGroupBy.aggregate` where a list of functions would produce the wrong results if at least one of the functions did not aggregate (:issue:`35490`)
- Fixed memory usage issue when instantiating large :class:`pandas.arrays.StringArray` (:issue:`35499`)
.. ---------------------------------------------------------------------------
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index fc8b59e11e001..12ab4f27d1e62 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -793,13 +793,13 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrame.resample` that would throw a ``ValueError`` when resampling from ``"D"`` to ``"24H"`` over a transition into daylight savings time (DST) (:issue:`35219`)
- Bug when combining methods :meth:`DataFrame.groupby` with :meth:`DataFrame.resample` and :meth:`DataFrame.interpolate` raising a ``TypeError`` (:issue:`35325`)
- Bug in :meth:`.DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply`` (:issue:`34656`)
-- Bug when subsetting columns on a :class:`~pandas.core.groupby.DataFrameGroupBy` (e.g. ``df.groupby('a')[['b']])``) would reset the attributes ``axis``, ``dropna``, ``group_keys``, ``level``, ``mutated``, ``sort``, and ``squeeze`` to their default values (:issue:`9959`)
+- Bug when subsetting columns on a :class:`.DataFrameGroupBy` (e.g. ``df.groupby('a')[['b']]``) would reset the attributes ``axis``, ``dropna``, ``group_keys``, ``level``, ``mutated``, ``sort``, and ``squeeze`` to their default values (:issue:`9959`)
- Bug in :meth:`.DataFrameGroupBy.tshift` failing to raise ``ValueError`` when a frequency cannot be inferred for the index of a group (:issue:`35937`)
- Bug in :meth:`DataFrame.groupby` does not always maintain column index name for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`)
- Bug in :meth:`.DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`)
- Bug in :meth:`.Rolling.sum` returned wrong values when dtypes were mixed between float and integer and ``axis=1`` (:issue:`20649`, :issue:`35596`)
- Bug in :meth:`.Rolling.count` returned ``np.nan`` with :class:`~pandas.api.indexers.FixedForwardWindowIndexer` as window, ``min_periods=0`` and only missing values in the window (:issue:`35579`)
-- Bug where :class:`pandas.core.window.Rolling` produces incorrect window sizes when using a ``PeriodIndex`` (:issue:`34225`)
+- Bug where :class:`.Rolling` produces incorrect window sizes when using a ``PeriodIndex`` (:issue:`34225`)
- Bug in :meth:`.DataFrameGroupBy.ffill` and :meth:`.DataFrameGroupBy.bfill` where a ``NaN`` group would return filled values instead of ``NaN`` when ``dropna=True`` (:issue:`34725`)
- Bug in :meth:`.RollingGroupby.count` where a ``ValueError`` was raised when specifying the ``closed`` parameter (:issue:`35869`)
- Bug in :meth:`.DataFrameGroupBy.rolling` returning wrong values with partial centered window (:issue:`36040`)
diff --git a/doc/source/whatsnew/v1.4.2.rst b/doc/source/whatsnew/v1.4.2.rst
index 64c36632bfefe..64d7e683c9d27 100644
--- a/doc/source/whatsnew/v1.4.2.rst
+++ b/doc/source/whatsnew/v1.4.2.rst
@@ -33,7 +33,7 @@ Bug fixes
- Fix some cases for subclasses that define their ``_constructor`` properties as general callables (:issue:`46018`)
- Fixed "longtable" formatting in :meth:`.Styler.to_latex` when ``column_format`` is given in extended format (:issue:`46037`)
- Fixed incorrect rendering in :meth:`.Styler.format` with ``hyperlinks="html"`` when the url contains a colon or other special characters (:issue:`46389`)
-- Improved error message in :class:`~pandas.core.window.Rolling` when ``window`` is a frequency and ``NaT`` is in the rolling axis (:issue:`46087`)
+- Improved error message in :class:`.Rolling` when ``window`` is a frequency and ``NaT`` is in the rolling axis (:issue:`46087`)
.. ---------------------------------------------------------------------------
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 44728e7e552ab..8fa1361cc30c1 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -112,14 +112,33 @@ to the index in the resample when :meth:`.Resampler.apply` is used.
of pandas, not specifying ``group_keys`` will default to
the same behavior as ``group_keys=False``.
-.. ipython:: python
+.. code-block:: ipython
- df = pd.DataFrame(
- {'a': range(6)},
- index=pd.date_range("2021-01-01", periods=6, freq="8H")
- )
- df.resample("D", group_keys=True).apply(lambda x: x)
- df.resample("D", group_keys=False).apply(lambda x: x)
+ In [11]: df = pd.DataFrame(
+ ....: {'a': range(6)},
+ ....: index=pd.date_range("2021-01-01", periods=6, freq="8H")
+ ....: )
+ ....:
+
+ In [12]: df.resample("D", group_keys=True).apply(lambda x: x)
+ Out[12]:
+ a
+ 2021-01-01 2021-01-01 00:00:00 0
+ 2021-01-01 08:00:00 1
+ 2021-01-01 16:00:00 2
+ 2021-01-02 2021-01-02 00:00:00 3
+ 2021-01-02 08:00:00 4
+ 2021-01-02 16:00:00 5
+
+ In [13]: df.resample("D", group_keys=False).apply(lambda x: x)
+ Out[13]:
+ a
+ 2021-01-01 00:00:00 0
+ 2021-01-01 08:00:00 1
+ 2021-01-01 16:00:00 2
+ 2021-01-02 00:00:00 3
+ 2021-01-02 08:00:00 4
+ 2021-01-02 16:00:00 5
Previously, the resulting index would depend upon the values returned by ``apply``,
as seen in the following example.
@@ -461,19 +480,20 @@ upon serialization. (Related issue :issue:`12997`)
*Old Behavior*
-.. ipython:: python
+.. code-block:: ipython
- index = pd.date_range(
- start='2020-12-28 00:00:00',
- end='2020-12-28 02:00:00',
- freq='1H',
- )
- a = pd.Series(
- data=range(3),
- index=index,
- )
+ In [32]: index = pd.date_range(
+ ....: start='2020-12-28 00:00:00',
+ ....: end='2020-12-28 02:00:00',
+ ....: freq='1H',
+ ....: )
+ ....:
-.. code-block:: ipython
+ In [33]: a = pd.Series(
+ ....: data=range(3),
+ ....: index=index,
+ ....: )
+ ....:
In [4]: from io import StringIO
@@ -485,12 +505,16 @@ upon serialization. (Related issue :issue:`12997`)
*New Behavior*
-.. ipython:: python
+.. code-block:: ipython
+
+ In [34]: from io import StringIO
+
+ In [35]: a.to_json(date_format='iso')
+ Out[35]: '{"2020-12-28T00:00:00.000Z":0,"2020-12-28T01:00:00.000Z":1,"2020-12-28T02:00:00.000Z":2}'
- from io import StringIO
- a.to_json(date_format='iso')
# Roundtripping now works
- pd.read_json(StringIO(a.to_json(date_format='iso')), typ="series").index == a.index
+ In [36]: pd.read_json(StringIO(a.to_json(date_format='iso')), typ="series").index == a.index
+ Out[36]: array([ True, True, True])
.. _whatsnew_150.notable_bug_fixes.groupby_value_counts_categorical:
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 7d26005315c33..be63da09846c8 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -76,7 +76,7 @@ Below is a possibly non-exhaustive list of changes:
.. ipython:: python
- idx = pd.date_range(start='1/1/2018', periods=3, freq='M')
+ idx = pd.date_range(start='1/1/2018', periods=3, freq='ME')
idx.array.year
idx.year
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
index 313bf61ecabf9..d4eb5742ef928 100644
--- a/doc/source/whatsnew/v2.1.0.rst
+++ b/doc/source/whatsnew/v2.1.0.rst
@@ -1,6 +1,6 @@
.. _whatsnew_210:
-What's new in 2.1.0 (Aug 11, 2023)
+What's new in 2.1.0 (Aug 30, 2023)
--------------------------------------
These are the changes in pandas 2.1.0. See :ref:`release` for a full changelog
@@ -21,7 +21,7 @@ PyArrow will become a required dependency with pandas 3.0
`PyArrow `_ will become a required
dependency of pandas starting with pandas 3.0. This decision was made based on
-`PDEP 12 `_.
+`PDEP 10 `_.
This will enable more changes that are hugely beneficial to pandas users, including
but not limited to:
@@ -39,12 +39,18 @@ We are collecting feedback on this decision `here ` (:issue:`48347`)
+- Implemented ``__from_arrow__`` on :class:`DatetimeTZDtype` (:issue:`52201`)
+- Implemented ``__pandas_priority__`` to allow custom types to take precedence over :class:`DataFrame`, :class:`Series`, :class:`Index`, or :class:`.ExtensionArray` for arithmetic operations, :ref:`see the developer guide ` (:issue:`48347`)
- Improve error message when having incompatible columns using :meth:`DataFrame.merge` (:issue:`51861`)
- Improve error message when setting :class:`DataFrame` with wrong number of columns through :meth:`DataFrame.isetitem` (:issue:`51701`)
- Improved error handling when using :meth:`DataFrame.to_json` with incompatible ``index`` and ``orient`` arguments (:issue:`52143`)
-- Improved error message when creating a DataFrame with empty data (0 rows), no index and an incorrect number of columns. (:issue:`52084`)
-- Improved error message when providing an invalid ``index`` or ``offset`` argument to :class:`pandas.api.indexers.VariableOffsetWindowIndexer` (:issue:`54379`)
+- Improved error message when creating a DataFrame with empty data (0 rows), no index and an incorrect number of columns (:issue:`52084`)
+- Improved error message when providing an invalid ``index`` or ``offset`` argument to :class:`.VariableOffsetWindowIndexer` (:issue:`54379`)
- Let :meth:`DataFrame.to_feather` accept a non-default :class:`Index` and non-string column names (:issue:`51787`)
- Added a new parameter ``by_row`` to :meth:`Series.apply` and :meth:`DataFrame.apply`. When set to ``False`` the supplied callables will always operate on the whole Series or DataFrame (:issue:`53400`, :issue:`53601`).
- :meth:`DataFrame.shift` and :meth:`Series.shift` now allow shifting by multiple periods by supplying a list of periods (:issue:`44424`)
-- Groupby aggregations (such as :meth:`.DataFrameGroupby.sum`) now can preserve the dtype of the input instead of casting to ``float64`` (:issue:`44952`)
+- Groupby aggregations with ``numba`` (such as :meth:`.DataFrameGroupBy.sum`) now can preserve the dtype of the input instead of casting to ``float64`` (:issue:`44952`)
- Improved error message when :meth:`.DataFrameGroupBy.agg` failed (:issue:`52930`)
-- Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to lzma.LZMAFile (:issue:`52979`)
-- Reductions :meth:`Series.argmax`, :meth:`Series.argmin`, :meth:`Series.idxmax`, :meth:`Series.idxmin`, :meth:`Index.argmax`, :meth:`Index.argmin`, :meth:`DataFrame.idxmax`, :meth:`DataFrame.idxmin` are now supported for object-dtype objects (:issue:`4279`, :issue:`18021`, :issue:`40685`, :issue:`43697`)
+- Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to ``lzma.LZMAFile`` (:issue:`52979`)
+- Reductions :meth:`Series.argmax`, :meth:`Series.argmin`, :meth:`Series.idxmax`, :meth:`Series.idxmin`, :meth:`Index.argmax`, :meth:`Index.argmin`, :meth:`DataFrame.idxmax`, :meth:`DataFrame.idxmin` are now supported for object-dtype (:issue:`4279`, :issue:`18021`, :issue:`40685`, :issue:`43697`)
- :meth:`DataFrame.to_parquet` and :func:`read_parquet` will now write and read ``attrs`` respectively (:issue:`54346`)
+- :meth:`Index.all` and :meth:`Index.any` with floating dtypes and timedelta64 dtypes no longer raise ``TypeError``, matching the :meth:`Series.all` and :meth:`Series.any` behavior (:issue:`54566`)
+- :meth:`Series.cummax`, :meth:`Series.cummin` and :meth:`Series.cumprod` are now supported for pyarrow dtypes with pyarrow version 13.0 and above (:issue:`52085`)
- Added support for the DataFrame Consortium Standard (:issue:`54383`)
-- Performance improvement in :meth:`.GroupBy.quantile` (:issue:`51722`)
-
-.. ---------------------------------------------------------------------------
-.. _whatsnew_210.notable_bug_fixes:
-
-Notable bug fixes
-~~~~~~~~~~~~~~~~~
-
-These are bug fixes that might have notable behavior changes.
+- Performance improvement in :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile` (:issue:`51722`)
+- PyArrow-backed integer dtypes now support bitwise operations (:issue:`54495`)
.. ---------------------------------------------------------------------------
.. _whatsnew_210.api_breaking:
@@ -363,7 +365,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor
Other API changes
^^^^^^^^^^^^^^^^^
-- :class:`arrays.PandasArray` has been renamed ``NumpyExtensionArray`` and the attached dtype name changed from ``PandasDtype`` to ``NumpyEADtype``; importing ``PandasArray`` still works until the next major version (:issue:`53694`)
+- :class:`arrays.PandasArray` has been renamed :class:`.NumpyExtensionArray` and the attached dtype name changed from ``PandasDtype`` to ``NumpyEADtype``; importing ``PandasArray`` still works until the next major version (:issue:`53694`)
.. ---------------------------------------------------------------------------
.. _whatsnew_210.deprecations:
@@ -374,6 +376,8 @@ Deprecations
Deprecated silent upcasting in setitem-like Series operations
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+PDEP-6: https://pandas.pydata.org/pdeps/0006-ban-upcasting.html
+
Setitem-like operations on Series (or DataFrame columns) which silently upcast the dtype are
deprecated and show a warning. Examples of affected operations are:
@@ -428,7 +432,7 @@ In a future version, these will raise an error and you should cast to a common d
In [3]: ser[0] = 'not an int64'
FutureWarning:
- Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas.
+ Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas.
Value 'not an int64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
In [4]: ser
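As a hedged editorial sketch (not part of the patch) of the explicit cast the warning asks for, assuming a small integer Series:

.. code-block:: python

   import pandas as pd

   ser = pd.Series([1, 2, 3])

   # Cast explicitly to a compatible dtype before assigning an incompatible
   # value, instead of relying on the deprecated silent upcast.
   ser = ser.astype("object")
   ser[0] = "not an int64"
   print(ser)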
@@ -506,34 +510,33 @@ and ``datetime.datetime.strptime``:
Other Deprecations
^^^^^^^^^^^^^^^^^^
-- Deprecated 'broadcast_axis' keyword in :meth:`Series.align` and :meth:`DataFrame.align`, upcast before calling ``align`` with ``left = DataFrame({col: left for col in right.columns}, index=right.index)`` (:issue:`51856`)
-- Deprecated 'downcast' keyword in :meth:`Index.fillna` (:issue:`53956`)
-- Deprecated 'fill_method' and 'limit' keywords in :meth:`DataFrame.pct_change`, :meth:`Series.pct_change`, :meth:`.DataFrameGroupBy.pct_change`, and :meth:`.SeriesGroupBy.pct_change`, explicitly call ``ffill`` or ``bfill`` before calling ``pct_change`` instead (:issue:`53491`)
-- Deprecated 'method', 'limit', and 'fill_axis' keywords in :meth:`DataFrame.align` and :meth:`Series.align`, explicitly call ``fillna`` on the alignment results instead (:issue:`51856`)
-- Deprecated 'quantile' keyword in :meth:`.Rolling.quantile` and :meth:`.Expanding.quantile`, renamed as 'q' instead (:issue:`52550`)
- Deprecated :attr:`.DataFrameGroupBy.dtypes`, check ``dtypes`` on the underlying object instead (:issue:`51045`)
- Deprecated :attr:`DataFrame._data` and :attr:`Series._data`, use public APIs instead (:issue:`33333`)
- Deprecated :func:`concat` behavior when any of the objects being concatenated have length 0; in the past the dtypes of empty objects were ignored when determining the resulting dtype, in a future version they will not (:issue:`39122`)
+- Deprecated :meth:`.Categorical.to_list`, use ``obj.tolist()`` instead (:issue:`51254`)
- Deprecated :meth:`.DataFrameGroupBy.all` and :meth:`.DataFrameGroupBy.any` with datetime64 or :class:`PeriodDtype` values, matching the :class:`Series` and :class:`DataFrame` deprecations (:issue:`34479`)
-- Deprecated :meth:`.DataFrameGroupBy.apply` and methods on the objects returned by :meth:`.DataFrameGroupBy.resample` operating on the grouping column(s); select the columns to operate on after groupby to either explicitly include or exclude the groupings and avoid the ``FutureWarning`` (:issue:`7155`)
-- Deprecated :meth:`Categorical.to_list`, use ``obj.tolist()`` instead (:issue:`51254`)
- Deprecated ``axis=1`` in :meth:`DataFrame.ewm`, :meth:`DataFrame.rolling`, :meth:`DataFrame.expanding`, transpose before calling the method instead (:issue:`51778`)
- Deprecated ``axis=1`` in :meth:`DataFrame.groupby` and in :class:`Grouper` constructor, do ``frame.T.groupby(...)`` instead (:issue:`51203`)
+- Deprecated ``broadcast_axis`` keyword in :meth:`Series.align` and :meth:`DataFrame.align`, upcast before calling ``align`` with ``left = DataFrame({col: left for col in right.columns}, index=right.index)`` (:issue:`51856`)
+- Deprecated ``downcast`` keyword in :meth:`Index.fillna` (:issue:`53956`)
+- Deprecated ``fill_method`` and ``limit`` keywords in :meth:`DataFrame.pct_change`, :meth:`Series.pct_change`, :meth:`.DataFrameGroupBy.pct_change`, and :meth:`.SeriesGroupBy.pct_change`, explicitly call e.g. :meth:`DataFrame.ffill` or :meth:`DataFrame.bfill` before calling ``pct_change`` instead (:issue:`53491`)
+- Deprecated ``method``, ``limit``, and ``fill_axis`` keywords in :meth:`DataFrame.align` and :meth:`Series.align`, explicitly call :meth:`DataFrame.fillna` or :meth:`Series.fillna` on the alignment results instead (:issue:`51856`)
+- Deprecated ``quantile`` keyword in :meth:`.Rolling.quantile` and :meth:`.Expanding.quantile`, renamed to ``q`` instead (:issue:`52550`)
- Deprecated accepting slices in :meth:`DataFrame.take`, call ``obj[slicer]`` or pass a sequence of integers instead (:issue:`51539`)
- Deprecated behavior of :meth:`DataFrame.idxmax`, :meth:`DataFrame.idxmin`, :meth:`Series.idxmax`, :meth:`Series.idxmin` with all-NA entries or any-NA and ``skipna=False``; in a future version these will raise ``ValueError`` (:issue:`51276`)
- Deprecated explicit support for subclassing :class:`Index` (:issue:`45289`)
-- Deprecated making functions given to :meth:`Series.agg` attempt to operate on each element in the :class:`Series` and only operate on the whole :class:`Series` if the elementwise operations failed. In the future, functions given to :meth:`Series.agg` will always operate on the whole :class:`Series` only. To keep the current behavior, use :meth:`Series.transform` instead. (:issue:`53325`)
-- Deprecated making the functions in a list of functions given to :meth:`DataFrame.agg` attempt to operate on each element in the :class:`DataFrame` and only operate on the columns of the :class:`DataFrame` if the elementwise operations failed. To keep the current behavior, use :meth:`DataFrame.transform` instead. (:issue:`53325`)
+- Deprecated making functions given to :meth:`Series.agg` attempt to operate on each element in the :class:`Series` and only operate on the whole :class:`Series` if the elementwise operations failed. In the future, functions given to :meth:`Series.agg` will always operate on the whole :class:`Series` only. To keep the current behavior, use :meth:`Series.transform` instead (:issue:`53325`)
+- Deprecated making the functions in a list of functions given to :meth:`DataFrame.agg` attempt to operate on each element in the :class:`DataFrame` and only operate on the columns of the :class:`DataFrame` if the elementwise operations failed. To keep the current behavior, use :meth:`DataFrame.transform` instead (:issue:`53325`)
- Deprecated passing a :class:`DataFrame` to :meth:`DataFrame.from_records`, use :meth:`DataFrame.set_index` or :meth:`DataFrame.drop` instead (:issue:`51353`)
- Deprecated silently dropping unrecognized timezones when parsing strings to datetimes (:issue:`18702`)
-- Deprecated the "downcast" keyword in :meth:`Series.interpolate`, :meth:`DataFrame.interpolate`, :meth:`Series.fillna`, :meth:`DataFrame.fillna`, :meth:`Series.ffill`, :meth:`DataFrame.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.bfill` (:issue:`40988`)
- Deprecated the ``axis`` keyword in :meth:`DataFrame.ewm`, :meth:`Series.ewm`, :meth:`DataFrame.rolling`, :meth:`Series.rolling`, :meth:`DataFrame.expanding`, :meth:`Series.expanding` (:issue:`51778`)
- Deprecated the ``axis`` keyword in :meth:`DataFrame.resample`, :meth:`Series.resample` (:issue:`51778`)
+- Deprecated the ``downcast`` keyword in :meth:`Series.interpolate`, :meth:`DataFrame.interpolate`, :meth:`Series.fillna`, :meth:`DataFrame.fillna`, :meth:`Series.ffill`, :meth:`DataFrame.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.bfill` (:issue:`40988`)
- Deprecated the behavior of :func:`concat` with both ``len(keys) != len(objs)``, in a future version this will raise instead of truncating to the shorter of the two sequences (:issue:`43485`)
- Deprecated the behavior of :meth:`Series.argsort` in the presence of NA values; in a future version these will be sorted at the end instead of giving -1 (:issue:`54219`)
- Deprecated the default of ``observed=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby`; this will default to ``True`` in a future version (:issue:`43999`)
-- Deprecating pinning ``group.name`` to each group in :meth:`SeriesGroupBy.aggregate` aggregations; if your operation requires utilizing the groupby keys, iterate over the groupby object instead (:issue:`41090`)
-- Deprecated the 'axis' keyword in :meth:`.DataFrameGroupBy.idxmax`, :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.fillna`, :meth:`.DataFrameGroupBy.take`, :meth:`.DataFrameGroupBy.skew`, :meth:`.DataFrameGroupBy.rank`, :meth:`.DataFrameGroupBy.cumprod`, :meth:`.DataFrameGroupBy.cumsum`, :meth:`.DataFrameGroupBy.cummax`, :meth:`.DataFrameGroupBy.cummin`, :meth:`.DataFrameGroupBy.pct_change`, :meth:`DataFrameGroupBy.diff`, :meth:`.DataFrameGroupBy.shift`, and :meth:`DataFrameGroupBy.corrwith`; for ``axis=1`` operate on the underlying :class:`DataFrame` instead (:issue:`50405`, :issue:`51046`)
+- Deprecated pinning ``group.name`` to each group in :meth:`.SeriesGroupBy.aggregate` aggregations; if your operation requires utilizing the groupby keys, iterate over the groupby object instead (:issue:`41090`)
+- Deprecated the ``axis`` keyword in :meth:`.DataFrameGroupBy.idxmax`, :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.fillna`, :meth:`.DataFrameGroupBy.take`, :meth:`.DataFrameGroupBy.skew`, :meth:`.DataFrameGroupBy.rank`, :meth:`.DataFrameGroupBy.cumprod`, :meth:`.DataFrameGroupBy.cumsum`, :meth:`.DataFrameGroupBy.cummax`, :meth:`.DataFrameGroupBy.cummin`, :meth:`.DataFrameGroupBy.pct_change`, :meth:`.DataFrameGroupBy.diff`, :meth:`.DataFrameGroupBy.shift`, and :meth:`.DataFrameGroupBy.corrwith`; for ``axis=1`` operate on the underlying :class:`DataFrame` instead (:issue:`50405`, :issue:`51046`)
- Deprecated :class:`.DataFrameGroupBy` with ``as_index=False`` not including groupings in the result when they are not columns of the DataFrame (:issue:`49519`)
- Deprecated :func:`is_categorical_dtype`, use ``isinstance(obj.dtype, pd.CategoricalDtype)`` instead (:issue:`52527`)
- Deprecated :func:`is_datetime64tz_dtype`, check ``isinstance(dtype, pd.DatetimeTZDtype)`` instead (:issue:`52607`)
@@ -545,49 +548,49 @@ Other Deprecations
- Deprecated :meth:`.Styler.applymap`. Use the new :meth:`.Styler.map` method instead (:issue:`52708`)
- Deprecated :meth:`DataFrame.applymap`. Use the new :meth:`DataFrame.map` method instead (:issue:`52353`)
- Deprecated :meth:`DataFrame.swapaxes` and :meth:`Series.swapaxes`, use :meth:`DataFrame.transpose` or :meth:`Series.transpose` instead (:issue:`51946`)
-- Deprecated ``freq`` parameter in :class:`PeriodArray` constructor, pass ``dtype`` instead (:issue:`52462`)
-- Deprecated allowing non-standard inputs in :func:`take`, pass either a ``numpy.ndarray``, :class:`ExtensionArray`, :class:`Index`, or :class:`Series` (:issue:`52981`)
-- Deprecated allowing non-standard sequences for :func:`isin`, :func:`value_counts`, :func:`unique`, :func:`factorize`, case to one of ``numpy.ndarray``, :class:`Index`, :class:`ExtensionArray`, or :class:`Series` before calling (:issue:`52986`)
+- Deprecated ``freq`` parameter in :class:`.PeriodArray` constructor, pass ``dtype`` instead (:issue:`52462`)
+- Deprecated allowing non-standard inputs in :func:`take`, pass either a ``numpy.ndarray``, :class:`.ExtensionArray`, :class:`Index`, or :class:`Series` (:issue:`52981`)
+- Deprecated allowing non-standard sequences for :func:`isin`, :func:`value_counts`, :func:`unique`, :func:`factorize`, cast to one of ``numpy.ndarray``, :class:`Index`, :class:`.ExtensionArray`, or :class:`Series` before calling (:issue:`52986`)
- Deprecated behavior of :class:`DataFrame` reductions ``sum``, ``prod``, ``std``, ``var``, ``sem`` with ``axis=None``, in a future version this will operate over both axes returning a scalar instead of behaving like ``axis=0``; note this also affects numpy functions e.g. ``np.sum(df)`` (:issue:`21597`)
- Deprecated behavior of :func:`concat` when :class:`DataFrame` has columns that are all-NA, in a future version these will not be discarded when determining the resulting dtype (:issue:`40893`)
-- Deprecated behavior of :meth:`Series.dt.to_pydatetime`, in a future version this will return a :class:`Series` containing python ``datetime`` objects instead of an ``ndarray`` of datetimes; this matches the behavior of other :meth:`Series.dt` properties (:issue:`20306`)
-- Deprecated logical operations (``|``, ``&``, ``^``) between pandas objects and dtype-less sequences (e.g. ``list``, ``tuple``), wrap a sequence in a :class:`Series` or numpy array before operating instead (:issue:`51521`)
-- Deprecated making :meth:`Series.apply` return a :class:`DataFrame` when the passed-in callable returns a :class:`Series` object. In the future this will return a :class:`Series` whose values are themselves :class:`Series`. This pattern was very slow and it's recommended to use alternative methods to archive the same goal (:issue:`52116`)
+- Deprecated behavior of :meth:`Series.dt.to_pydatetime`, in a future version this will return a :class:`Series` containing python ``datetime`` objects instead of an ``ndarray`` of datetimes; this matches the behavior of other :attr:`Series.dt` properties (:issue:`20306`)
+- Deprecated logical operations (``|``, ``&``, ``^``) between pandas objects and dtype-less sequences (e.g. ``list``, ``tuple``), wrap a sequence in a :class:`Series` or NumPy array before operating instead (:issue:`51521`)
- Deprecated parameter ``convert_type`` in :meth:`Series.apply` (:issue:`52140`)
- Deprecated passing a dictionary to :meth:`.SeriesGroupBy.agg`; pass a list of aggregations instead (:issue:`50684`)
-- Deprecated the "fastpath" keyword in :class:`Categorical` constructor, use :meth:`Categorical.from_codes` instead (:issue:`20110`)
+- Deprecated the ``fastpath`` keyword in :class:`Categorical` constructor, use :meth:`Categorical.from_codes` instead (:issue:`20110`)
- Deprecated the behavior of :func:`is_bool_dtype` returning ``True`` for object-dtype :class:`Index` of bool objects (:issue:`52680`)
- Deprecated the methods :meth:`Series.bool` and :meth:`DataFrame.bool` (:issue:`51749`)
-- Deprecated unused "closed" and "normalize" keywords in the :class:`DatetimeIndex` constructor (:issue:`52628`)
-- Deprecated unused "closed" keyword in the :class:`TimedeltaIndex` constructor (:issue:`52628`)
-- Deprecated logical operation between two non boolean :class:`Series` with different indexes always coercing the result to bool dtype. In a future version, this will maintain the return type of the inputs. (:issue:`52500`, :issue:`52538`)
+- Deprecated unused ``closed`` and ``normalize`` keywords in the :class:`DatetimeIndex` constructor (:issue:`52628`)
+- Deprecated unused ``closed`` keyword in the :class:`TimedeltaIndex` constructor (:issue:`52628`)
+- Deprecated logical operations between two non-boolean :class:`Series` with different indexes always coercing the result to bool dtype. In a future version, this will maintain the return type of the inputs (:issue:`52500`, :issue:`52538`)
- Deprecated :class:`Period` and :class:`PeriodDtype` with ``BDay`` freq, use a :class:`DatetimeIndex` with ``BDay`` freq instead (:issue:`53446`)
- Deprecated :func:`value_counts`, use ``pd.Series(obj).value_counts()`` instead (:issue:`47862`)
-- Deprecated :meth:`Series.first` and :meth:`DataFrame.first` (please create a mask and filter using ``.loc`` instead) (:issue:`45908`)
+- Deprecated :meth:`Series.first` and :meth:`DataFrame.first`; create a mask and filter using ``.loc`` instead (:issue:`45908`)
- Deprecated :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` for object-dtype (:issue:`53631`)
-- Deprecated :meth:`Series.last` and :meth:`DataFrame.last` (please create a mask and filter using ``.loc`` instead) (:issue:`53692`)
+- Deprecated :meth:`Series.last` and :meth:`DataFrame.last`; create a mask and filter using ``.loc`` instead (:issue:`53692`)
- Deprecated allowing arbitrary ``fill_value`` in :class:`SparseDtype`, in a future version the ``fill_value`` will need to be compatible with the ``dtype.subtype``, either a scalar that can be held by that subtype or ``NaN`` for integer or bool subtypes (:issue:`23124`)
- Deprecated allowing bool dtype in :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile`, consistent with the :meth:`Series.quantile` and :meth:`DataFrame.quantile` behavior (:issue:`51424`)
- Deprecated behavior of :func:`.testing.assert_series_equal` and :func:`.testing.assert_frame_equal` considering NA-like values (e.g. ``NaN`` vs ``None`` as equivalent) (:issue:`52081`)
-- Deprecated bytes input to :func:`read_excel`. To read a file path, use a string or path-like object. (:issue:`53767`)
-- Deprecated constructing :class:`SparseArray` from scalar data, pass a sequence instead (:issue:`53039`)
+- Deprecated bytes input to :func:`read_excel`. To read a file path, use a string or path-like object (:issue:`53767`)
+- Deprecated constructing :class:`.SparseArray` from scalar data, pass a sequence instead (:issue:`53039`)
- Deprecated falling back to filling when ``value`` is not specified in :meth:`DataFrame.replace` and :meth:`Series.replace` with non-dict-like ``to_replace`` (:issue:`33302`)
-- Deprecated literal json input to :func:`read_json`. Wrap literal json string input in ``io.StringIO`` instead. (:issue:`53409`)
-- Deprecated literal string input to :func:`read_xml`. Wrap literal string/bytes input in ``io.StringIO`` / ``io.BytesIO`` instead. (:issue:`53767`)
-- Deprecated literal string/bytes input to :func:`read_html`. Wrap literal string/bytes input in ``io.StringIO`` / ``io.BytesIO`` instead. (:issue:`53767`)
-- Deprecated option "mode.use_inf_as_na", convert inf entries to ``NaN`` before instead (:issue:`51684`)
+- Deprecated literal json input to :func:`read_json`. Wrap literal json string input in ``io.StringIO`` instead (:issue:`53409`)
+- Deprecated literal string input to :func:`read_xml`. Wrap literal string/bytes input in ``io.StringIO`` / ``io.BytesIO`` instead (:issue:`53767`)
+- Deprecated literal string/bytes input to :func:`read_html`. Wrap literal string/bytes input in ``io.StringIO`` / ``io.BytesIO`` instead (:issue:`53767`)
+- Deprecated option ``mode.use_inf_as_na``, convert inf entries to ``NaN`` beforehand instead (:issue:`51684`)
- Deprecated parameter ``obj`` in :meth:`.DataFrameGroupBy.get_group` (:issue:`53545`)
- Deprecated positional indexing on :class:`Series` with :meth:`Series.__getitem__` and :meth:`Series.__setitem__`, in a future version ``ser[item]`` will *always* interpret ``item`` as a label, not a position (:issue:`50617`)
- Deprecated replacing builtin and NumPy functions in ``.agg``, ``.apply``, and ``.transform``; use the corresponding string alias (e.g. ``"sum"`` for ``sum`` or ``np.sum``) instead (:issue:`53425`)
- Deprecated strings ``T``, ``t``, ``L`` and ``l`` denoting units in :func:`to_timedelta` (:issue:`52536`)
-- Deprecated the "method" and "limit" keywords in ``ExtensionArray.fillna``, implement and use ``pad_or_backfill`` instead (:issue:`53621`)
-- Deprecated the "method" and "limit" keywords on :meth:`Series.fillna`, :meth:`DataFrame.fillna`, :meth:`.SeriesGroupBy.fillna`, :meth:`.DataFrameGroupBy.fillna`, and :meth:`.Resampler.fillna`, use ``obj.bfill()`` or ``obj.ffill()`` instead (:issue:`53394`)
+- Deprecated the "method" and "limit" keywords in ``.ExtensionArray.fillna``, implement ``_pad_or_backfill`` instead (:issue:`53621`)
- Deprecated the ``method`` and ``limit`` keywords in :meth:`DataFrame.replace` and :meth:`Series.replace` (:issue:`33302`)
+- Deprecated the ``method`` and ``limit`` keywords on :meth:`Series.fillna`, :meth:`DataFrame.fillna`, :meth:`.SeriesGroupBy.fillna`, :meth:`.DataFrameGroupBy.fillna`, and :meth:`.Resampler.fillna`, use ``obj.bfill()`` or ``obj.ffill()`` instead (:issue:`53394`)
- Deprecated the behavior of :meth:`Series.__getitem__`, :meth:`Series.__setitem__`, :meth:`DataFrame.__getitem__`, :meth:`DataFrame.__setitem__` with an integer slice on objects with a floating-dtype index, in a future version this will be treated as *positional* indexing (:issue:`49612`)
- Deprecated the use of non-supported datetime64 and timedelta64 resolutions with :func:`pandas.array`. Supported resolutions are: "s", "ms", "us", "ns" (:issue:`53058`)
-- Deprecated values "pad", "ffill", "bfill", "backfill" for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate`, use ``obj.ffill()`` or ``obj.bfill()`` instead (:issue:`53581`)
-- Deprecated the behavior of :meth:`Index.argmax`, :meth:`Index.argmin`, :meth:`Series.argmax`, :meth:`Series.argmin` with either all-NAs and skipna=True or any-NAs and skipna=False returning -1; in a future version this will raise ``ValueError`` (:issue:`33941`, :issue:`33942`)
-- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_sql` except ``name``. (:issue:`54229`)
+- Deprecated values ``"pad"``, ``"ffill"``, ``"bfill"``, ``"backfill"`` for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate`, use ``obj.ffill()`` or ``obj.bfill()`` instead (:issue:`53581`)
+- Deprecated the behavior of :meth:`Index.argmax`, :meth:`Index.argmin`, :meth:`Series.argmax`, :meth:`Series.argmin` with either all-NAs and ``skipna=True`` or any-NAs and ``skipna=False`` returning -1; in a future version this will raise ``ValueError`` (:issue:`33941`, :issue:`33942`)
+- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_sql` except ``name`` and ``con`` (:issue:`54229`)
+- Deprecated silently ignoring ``fill_value`` when passing both ``freq`` and ``fill_value`` to :meth:`DataFrame.shift`, :meth:`Series.shift` and :meth:`.DataFrameGroupBy.shift`; in a future version this will raise ``ValueError`` (:issue:`53832`)
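+
+A compact, hedged sketch of some of the migrations suggested above (the ``fillna(method=...)``,
+``mode.use_inf_as_na`` and positional-``[]`` deprecations); the toy ``Series`` below is made up
+for illustration and does not come from the issues themselves:
+
+.. code-block:: python
+
+    import numpy as np
+    import pandas as pd
+
+    ser = pd.Series([1.0, np.nan, np.nan, 4.0])
+
+    # Previously: ser.fillna(method="ffill", limit=1)
+    ser.ffill(limit=1)
+
+    # Previously relied on the "mode.use_inf_as_na" option; replace inf explicitly instead
+    ser.replace([np.inf, -np.inf], np.nan).isna()
+
+    # ser[0] will eventually mean label-based lookup; be explicit about positions
+    ser.iloc[0]
+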
.. ---------------------------------------------------------------------------
.. _whatsnew_210.performance:
@@ -596,7 +599,7 @@ Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`)
- Performance improvement in :func:`factorize` for object columns not containing strings (:issue:`51921`)
-- Performance improvement in :func:`read_orc` when reading a remote URI file path. (:issue:`51609`)
+- Performance improvement in :func:`read_orc` when reading a remote URI file path (:issue:`51609`)
- Performance improvement in :func:`read_parquet` and :meth:`DataFrame.to_parquet` when reading a remote file with ``engine="pyarrow"`` (:issue:`51609`)
- Performance improvement in :func:`read_parquet` on string columns when using ``use_nullable_dtypes=True`` (:issue:`47345`)
- Performance improvement in :meth:`DataFrame.clip` and :meth:`Series.clip` (:issue:`51472`)
@@ -611,9 +614,10 @@ Performance improvements
- Performance improvement when parsing strings to ``boolean[pyarrow]`` dtype (:issue:`51730`)
- Performance improvement when searching an :class:`Index` sliced from other indexes (:issue:`51738`)
- Performance improvement in :func:`concat` (:issue:`52291`, :issue:`52290`)
-- :class:`Period`'s default formatter (`period_format`) is now significantly (~twice) faster. This improves performance of ``str(Period)``, ``repr(Period)``, and :meth:`Period.strftime(fmt=None)`, as well as ``PeriodArray.strftime(fmt=None)``, ``PeriodIndex.strftime(fmt=None)`` and ``PeriodIndex.format(fmt=None)``. Finally, ``to_csv`` operations involving :class:`PeriodArray` or :class:`PeriodIndex` with default ``date_format`` are also significantly accelerated. (:issue:`51459`)
+- :class:`Period`'s default formatter (``period_format``) is now significantly (~twice) faster. This improves performance of ``str(Period)``, ``repr(Period)``, and :meth:`.Period.strftime(fmt=None)`, as well as ``PeriodArray.strftime(fmt=None)``, ``PeriodIndex.strftime(fmt=None)`` and ``PeriodIndex.format(fmt=None)``. ``to_csv`` operations involving :class:`.PeriodArray` or :class:`PeriodIndex` with default ``date_format`` are also significantly accelerated (:issue:`51459`)
- Performance improvement accessing :attr:`arrays.IntegerArray.dtype` & :attr:`arrays.FloatingArray.dtype` (:issue:`52998`)
-- Performance improvement for :class:`DataFrameGroupBy`/:class:`SeriesGroupBy` aggregations (e.g. :meth:`DataFrameGroupBy.sum`) with ``engine="numba"`` (:issue:`53731`)
+- Performance improvement for :class:`.DataFrameGroupBy`/:class:`.SeriesGroupBy` aggregations (e.g. :meth:`.DataFrameGroupBy.sum`) with ``engine="numba"`` (:issue:`53731`)
+- Performance improvement in :class:`DataFrame` reductions with ``axis=1`` and extension dtypes (:issue:`54341`)
- Performance improvement in :class:`DataFrame` reductions with ``axis=None`` and extension dtypes (:issue:`54308`)
- Performance improvement in :class:`MultiIndex` and multi-column operations (e.g. :meth:`DataFrame.sort_values`, :meth:`DataFrame.groupby`, :meth:`Series.unstack`) when index/column values are already sorted (:issue:`53806`)
- Performance improvement in :class:`Series` reductions (:issue:`52341`)
@@ -621,24 +625,26 @@ Performance improvements
- Performance improvement in :func:`concat` when the concatenation axis is a :class:`MultiIndex` (:issue:`53574`)
- Performance improvement in :func:`merge` for PyArrow backed strings (:issue:`54443`)
- Performance improvement in :func:`read_csv` with ``engine="c"`` (:issue:`52632`)
+- Performance improvement in :meth:`.ArrowExtensionArray.to_numpy` (:issue:`52525`)
- Performance improvement in :meth:`.DataFrameGroupBy.groups` (:issue:`53088`)
- Performance improvement in :meth:`DataFrame.astype` when ``dtype`` is an extension dtype (:issue:`54299`)
+- Performance improvement in :meth:`DataFrame.iloc` when input is a single integer and the DataFrame is backed by extension dtypes (:issue:`54508`)
- Performance improvement in :meth:`DataFrame.isin` for extension dtypes (:issue:`53514`)
- Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`)
+- Performance improvement in :meth:`DataFrame.transpose` when transposing a DataFrame with a single PyArrow dtype (:issue:`54224`)
- Performance improvement in :meth:`DataFrame.transpose` when transposing a DataFrame with a single masked dtype, e.g. :class:`Int64` (:issue:`52836`)
-- Performance improvement in :meth:`DataFrame.transpose` when transposing a DataFrame with a single pyarrow dtype (:issue:`54224`)
-- Performance improvement in :meth:`Series.add` for pyarrow string and binary dtypes (:issue:`53150`)
+- Performance improvement in :meth:`Series.add` for PyArrow string and binary dtypes (:issue:`53150`)
- Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`)
-- Performance improvement in :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, :meth:`DataFrame.bfill` with pyarrow dtypes (:issue:`53950`)
-- Performance improvement in :meth:`Series.str.get_dummies` for pyarrow-backed strings (:issue:`53655`)
-- Performance improvement in :meth:`Series.str.get` for pyarrow-backed strings (:issue:`53152`)
-- Performance improvement in :meth:`Series.str.split` with ``expand=True`` for pyarrow-backed strings (:issue:`53585`)
-- Performance improvement in :meth:`Series.to_numpy` when dtype is a numpy float dtype and ``na_value`` is ``np.nan`` (:issue:`52430`)
-- Performance improvement in :meth:`~arrays.ArrowExtensionArray.astype` when converting from a pyarrow timestamp or duration dtype to numpy (:issue:`53326`)
-- Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`52525`)
+- Performance improvement in :meth:`Series.drop_duplicates` for ``ArrowDtype`` (:issue:`54667`)
+- Performance improvement in :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, :meth:`DataFrame.bfill` with PyArrow dtypes (:issue:`53950`)
+- Performance improvement in :meth:`Series.str.get_dummies` for PyArrow-backed strings (:issue:`53655`)
+- Performance improvement in :meth:`Series.str.get` for PyArrow-backed strings (:issue:`53152`)
+- Performance improvement in :meth:`Series.str.split` with ``expand=True`` for PyArrow-backed strings (:issue:`53585`)
+- Performance improvement in :meth:`Series.to_numpy` when dtype is a NumPy float dtype and ``na_value`` is ``np.nan`` (:issue:`52430`)
+- Performance improvement in :meth:`~arrays.ArrowExtensionArray.astype` when converting from a PyArrow timestamp or duration dtype to NumPy (:issue:`53326`)
- Performance improvement in various :class:`MultiIndex` set and indexing operations (:issue:`53955`)
-- Performance improvement when doing various reshaping operations on :class:`arrays.IntegerArrays` & :class:`arrays.FloatingArray` by avoiding doing unnecessary validation (:issue:`53013`)
-- Performance improvement when indexing with pyarrow timestamp and duration dtypes (:issue:`53368`)
+- Performance improvement when doing various reshaping operations on :class:`arrays.IntegerArray` & :class:`arrays.FloatingArray` by avoiding doing unnecessary validation (:issue:`53013`)
+- Performance improvement when indexing with PyArrow timestamp and duration dtypes (:issue:`53368`)
- Performance improvement when passing an array to :meth:`RangeIndex.take`, :meth:`DataFrame.loc`, or :meth:`DataFrame.iloc` and the DataFrame is using a RangeIndex (:issue:`53387`)
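+
+As a hedged illustration of one of the accelerated paths above (GH 52430), assuming a toy
+nullable-float ``Series``:
+
+.. code-block:: python
+
+    import numpy as np
+    import pandas as pd
+
+    ser = pd.Series([1.0, pd.NA, 3.0], dtype="Float64")
+    # Conversion to a NumPy float dtype with na_value=np.nan is the case that was sped up
+    arr = ser.to_numpy(dtype="float64", na_value=np.nan)
+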
.. ---------------------------------------------------------------------------
@@ -655,28 +661,29 @@ Categorical
Datetimelike
^^^^^^^^^^^^
-- :meth:`DatetimeIndex.map` with ``na_action="ignore"`` now works as expected. (:issue:`51644`)
-- :meth:`DatetimeIndex.slice_indexer` now raises ``KeyError`` for non-monotonic indexes if either of the slice bounds is not in the index, this behaviour was previously deprecated but inconsistently handled. (:issue:`53983`)
+- :meth:`DatetimeIndex.map` with ``na_action="ignore"`` now works as expected (:issue:`51644`)
+- :meth:`DatetimeIndex.slice_indexer` now raises ``KeyError`` for non-monotonic indexes if either of the slice bounds is not in the index; this behaviour was previously deprecated but inconsistently handled (:issue:`53983`)
- Bug in :class:`DateOffset` which had inconsistent behavior when multiplying a :class:`DateOffset` object by a constant (:issue:`47953`)
- Bug in :func:`date_range` when ``freq`` was a :class:`DateOffset` with ``nanoseconds`` (:issue:`46877`)
-- Bug in :func:`to_datetime` converting :class:`Series` or :class:`DataFrame` containing :class:`arrays.ArrowExtensionArray` of ``pyarrow`` timestamps to numpy datetimes (:issue:`52545`)
-- Bug in :meth:`DataFrame.to_sql` raising ``ValueError`` for pyarrow-backed date like dtypes (:issue:`53854`)
+- Bug in :func:`to_datetime` converting :class:`Series` or :class:`DataFrame` containing :class:`arrays.ArrowExtensionArray` of PyArrow timestamps to NumPy datetimes (:issue:`52545`)
+- Bug in :meth:`.DatetimeArray.map` and :meth:`DatetimeIndex.map`, where the supplied callable operated array-wise instead of element-wise (:issue:`51977`)
+- Bug in :meth:`DataFrame.to_sql` raising ``ValueError`` for PyArrow-backed date-like dtypes (:issue:`53854`)
- Bug in :meth:`Timestamp.date`, :meth:`Timestamp.isocalendar`, :meth:`Timestamp.timetuple`, and :meth:`Timestamp.toordinal` returning incorrect results for inputs outside those supported by the Python standard library's datetime module (:issue:`53668`)
- Bug in :meth:`Timestamp.round` with values close to the implementation bounds returning incorrect results instead of raising ``OutOfBoundsDatetime`` (:issue:`51494`)
-- Bug in :meth:`arrays.DatetimeArray.map` and :meth:`DatetimeIndex.map`, where the supplied callable operated array-wise instead of element-wise (:issue:`51977`)
- Bug in constructing a :class:`Series` or :class:`DataFrame` from a datetime or timedelta scalar always inferring nanosecond resolution instead of inferring from the input (:issue:`52212`)
- Bug in constructing a :class:`Timestamp` from a string representing a time without a date inferring an incorrect unit (:issue:`54097`)
- Bug in constructing a :class:`Timestamp` with ``ts_input=pd.NA`` raising ``TypeError`` (:issue:`45481`)
- Bug in parsing datetime strings with weekday but no day e.g. "2023 Sept Thu" incorrectly raising ``AttributeError`` instead of ``ValueError`` (:issue:`52659`)
+- Bug in the repr for :class:`Series` when dtype is a timezone aware datetime with non-nanosecond resolution raising ``OutOfBoundsDatetime`` (:issue:`54623`)
Timedelta
^^^^^^^^^
-- :meth:`TimedeltaIndex.map` with ``na_action="ignore"`` now works as expected (:issue:`51644`)
- Bug in :class:`TimedeltaIndex` division or multiplication leading to ``.freq`` of "0 Days" instead of ``None`` (:issue:`51575`)
-- Bug in :class:`Timedelta` with Numpy timedelta64 objects not properly raising ``ValueError`` (:issue:`52806`)
-- Bug in :func:`to_timedelta` converting :class:`Series` or :class:`DataFrame` containing :class:`ArrowDtype` of ``pyarrow.duration`` to numpy ``timedelta64`` (:issue:`54298`)
+- Bug in :class:`Timedelta` with NumPy ``timedelta64`` objects not properly raising ``ValueError`` (:issue:`52806`)
+- Bug in :func:`to_timedelta` converting :class:`Series` or :class:`DataFrame` containing :class:`ArrowDtype` of ``pyarrow.duration`` to NumPy ``timedelta64`` (:issue:`54298`)
- Bug in :meth:`Timedelta.__hash__`, raising an ``OutOfBoundsTimedelta`` on certain large values of second resolution (:issue:`54037`)
- Bug in :meth:`Timedelta.round` with values close to the implementation bounds returning incorrect results instead of raising ``OutOfBoundsTimedelta`` (:issue:`51494`)
+- Bug in :meth:`TimedeltaIndex.map` with ``na_action="ignore"`` (:issue:`51644`)
- Bug in :meth:`arrays.TimedeltaArray.map` and :meth:`TimedeltaIndex.map`, where the supplied callable operated array-wise instead of element-wise (:issue:`51977`)
Timezones
@@ -688,10 +695,10 @@ Numeric
^^^^^^^
- Bug in :class:`RangeIndex` setting ``step`` incorrectly when being the subtrahend with minuend a numeric value (:issue:`53255`)
- Bug in :meth:`Series.corr` and :meth:`Series.cov` raising ``AttributeError`` for masked dtypes (:issue:`51422`)
-- Bug when calling :meth:`Series.kurt` and :meth:`Series.skew` on numpy data of all zero returning a python type instead of a numpy type (:issue:`53482`)
+- Bug when calling :meth:`Series.kurt` and :meth:`Series.skew` on NumPy data of all zero returning a Python type instead of a NumPy type (:issue:`53482`)
- Bug in :meth:`Series.mean`, :meth:`DataFrame.mean` with object-dtype values containing strings that can be converted to numbers (e.g. "2") returning incorrect numeric results; these now raise ``TypeError`` (:issue:`36703`, :issue:`44008`)
-- Bug in :meth:`DataFrame.corrwith` raising ``NotImplementedError`` for pyarrow-backed dtypes (:issue:`52314`)
-- Bug in :meth:`DataFrame.size` and :meth:`Series.size` returning 64-bit integer instead of int (:issue:`52897`)
+- Bug in :meth:`DataFrame.corrwith` raising ``NotImplementedError`` for PyArrow-backed dtypes (:issue:`52314`)
+- Bug in :meth:`DataFrame.size` and :meth:`Series.size` returning 64-bit integer instead of a Python int (:issue:`52897`)
- Bug in :meth:`DataFrame.dot` returning ``object`` dtype for :class:`ArrowDtype` data (:issue:`53979`)
- Bug in :meth:`Series.any`, :meth:`Series.all`, :meth:`DataFrame.any`, and :meth:`DataFrame.all` had the default value of ``bool_only`` set to ``None`` instead of ``False``; this change should have no impact on users (:issue:`53258`)
- Bug in :meth:`Series.corr` and :meth:`Series.cov` raising ``AttributeError`` for masked dtypes (:issue:`51422`)
@@ -702,9 +709,9 @@ Numeric
Conversion
^^^^^^^^^^
- Bug in :func:`DataFrame.style.to_latex` and :func:`DataFrame.style.to_html` if the DataFrame contains integers with more digits than can be represented by floating point double precision (:issue:`52272`)
-- Bug in :func:`array` when given a ``datetime64`` or ``timedelta64`` dtype with unit of "s", "us", or "ms" returning :class:`NumpyExtensionArray` instead of :class:`DatetimeArray` or :class:`TimedeltaArray` (:issue:`52859`)
-- Bug in :func:`array` when given an empty list and no dtype returning :class:`NumpyExtensionArray` instead of :class:`FloatingArray` (:issue:`54371`)
-- Bug in :meth:`ArrowDtype.numpy_dtype` returning nanosecond units for non-nanosecond ``pyarrow.timestamp`` and ``pyarrow.duration`` types (:issue:`51800`)
+- Bug in :func:`array` when given a ``datetime64`` or ``timedelta64`` dtype with unit of "s", "us", or "ms" returning :class:`.NumpyExtensionArray` instead of :class:`.DatetimeArray` or :class:`.TimedeltaArray` (:issue:`52859`)
+- Bug in :func:`array` when given an empty list and no dtype returning :class:`.NumpyExtensionArray` instead of :class:`.FloatingArray` (:issue:`54371`)
+- Bug in :meth:`.ArrowDtype.numpy_dtype` returning nanosecond units for non-nanosecond ``pyarrow.timestamp`` and ``pyarrow.duration`` types (:issue:`51800`)
- Bug in :meth:`DataFrame.__repr__` incorrectly raising a ``TypeError`` when the dtype of a column is ``np.record`` (:issue:`48526`)
- Bug in :meth:`DataFrame.info` raising ``ValueError`` when ``use_numba`` is set (:issue:`51922`)
- Bug in :meth:`DataFrame.insert` raising ``TypeError`` if ``loc`` is ``np.int64`` (:issue:`53193`)
@@ -714,6 +721,7 @@ Conversion
Strings
^^^^^^^
- Bug in :meth:`Series.str` that did not raise a ``TypeError`` when iterated (:issue:`54173`)
+- Bug in ``repr`` for :class:`DataFrame` with string-dtype columns (:issue:`54797`)
Interval
^^^^^^^^
@@ -726,13 +734,14 @@ Indexing
- Bug in :meth:`DataFrame.__setitem__` losing dtype when setting a :class:`DataFrame` into duplicated columns (:issue:`53143`)
- Bug in :meth:`DataFrame.__setitem__` with a boolean mask and :meth:`DataFrame.putmask` with mixed non-numeric dtypes and a value other than ``NaN`` incorrectly raising ``TypeError`` (:issue:`53291`)
- Bug in :meth:`DataFrame.iloc` when using ``nan`` as the only element (:issue:`52234`)
+- Bug in :meth:`Series.loc` casting :class:`Series` to ``np.ndarray`` when assigning a :class:`Series` at a predefined index of an ``object`` dtype :class:`Series` (:issue:`48933`)
Missing
^^^^^^^
-- Bug in :meth:`DataFrame.interpolate` failing to fill across multiblock data when ``method`` is "pad", "ffill", "bfill", or "backfill" (:issue:`53898`)
+- Bug in :meth:`DataFrame.interpolate` failing to fill across data when ``method`` is ``"pad"``, ``"ffill"``, ``"bfill"``, or ``"backfill"`` (:issue:`53898`)
- Bug in :meth:`DataFrame.interpolate` ignoring ``inplace`` when :class:`DataFrame` is empty (:issue:`53199`)
- Bug in :meth:`Series.idxmin`, :meth:`Series.idxmax`, :meth:`DataFrame.idxmin`, :meth:`DataFrame.idxmax` with a :class:`DatetimeIndex` index containing ``NaT`` incorrectly returning ``NaN`` instead of ``NaT`` (:issue:`43587`)
-- Bug in :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` failing to raise on invalid ``downcast`` keyword, which can be only ``None`` or "infer" (:issue:`53103`)
+- Bug in :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` failing to raise on invalid ``downcast`` keyword, which can be only ``None`` or ``"infer"`` (:issue:`53103`)
- Bug in :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` with complex dtype incorrectly failing to fill ``NaN`` entries (:issue:`53635`)
MultiIndex
@@ -744,25 +753,23 @@ I/O
^^^
- :meth:`DataFrame.to_orc` now raising ``ValueError`` when non-default :class:`Index` is given (:issue:`51828`)
- :meth:`DataFrame.to_sql` now raising ``ValueError`` when the name param is left empty while using SQLAlchemy to connect (:issue:`52675`)
-- Added ``filters`` parameter to :func:`read_parquet` to filter out data, compatible with both ``engines`` (:issue:`53212`)
-- Bug in :func:`json_normalize`, fix json_normalize cannot parse metadata fields list type (:issue:`37782`)
+- Bug in :func:`json_normalize` where it could not parse metadata fields of list type (:issue:`37782`)
- Bug in :func:`read_csv` where it would error when ``parse_dates`` was set to a list or dictionary with ``engine="pyarrow"`` (:issue:`47961`)
-- Bug in :func:`read_csv`, with ``engine="pyarrow"`` erroring when specifying a ``dtype`` with ``index_col`` (:issue:`53229`)
-- Bug in :func:`read_hdf` not properly closing store after a ``IndexError`` is raised (:issue:`52781`)
-- Bug in :func:`read_html`, style elements were read into DataFrames (:issue:`52197`)
-- Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`)
+- Bug in :func:`read_csv` with ``engine="pyarrow"`` raising when specifying a ``dtype`` with ``index_col`` (:issue:`53229`)
+- Bug in :func:`read_hdf` not properly closing store after an ``IndexError`` is raised (:issue:`52781`)
+- Bug in :func:`read_html` where style elements were read into DataFrames (:issue:`52197`)
+- Bug in :func:`read_html` where tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`)
- Bug in :func:`read_sql_table` raising an exception when reading a view (:issue:`52969`)
- Bug in :func:`read_sql` when reading multiple timezone aware columns with the same column name (:issue:`44421`)
- Bug in :func:`read_xml` stripping whitespace in string data (:issue:`53811`)
- Bug in :meth:`DataFrame.to_html` where ``colspace`` was incorrectly applied in case of multi index columns (:issue:`53885`)
- Bug in :meth:`DataFrame.to_html` where conversion for an empty :class:`DataFrame` with complex dtype raised a ``ValueError`` (:issue:`54167`)
-- Bug in :meth:`DataFrame.to_json` where :class:`DateTimeArray`/:class:`DateTimeIndex` with non nanosecond precision could not be serialized correctly (:issue:`53686`)
+- Bug in :meth:`DataFrame.to_json` where :class:`.DatetimeArray`/:class:`DatetimeIndex` with non-nanosecond precision could not be serialized correctly (:issue:`53686`)
- Bug when writing and reading empty Stata dta files where dtype information was lost (:issue:`46240`)
- Bug where ``bz2`` was treated as a hard requirement (:issue:`53857`)
Period
^^^^^^
-- :meth:`PeriodIndex.map` with ``na_action="ignore"`` now works as expected (:issue:`51644`)
- Bug in :class:`PeriodDtype` constructor failing to raise ``TypeError`` when no argument is passed or when ``None`` is passed (:issue:`27388`)
- Bug in :class:`PeriodDtype` constructor incorrectly returning the same ``normalize`` for different :class:`DateOffset` ``freq`` inputs (:issue:`24121`)
- Bug in :class:`PeriodDtype` constructor raising ``ValueError`` instead of ``TypeError`` when an invalid type is passed (:issue:`51790`)
@@ -770,6 +777,7 @@ Period
- Bug in :func:`read_csv` not processing empty strings as a null value, with ``engine="pyarrow"`` (:issue:`52087`)
- Bug in :func:`read_csv` returning ``object`` dtype columns instead of ``float64`` dtype columns with ``engine="pyarrow"`` for columns that are all null with ``engine="pyarrow"`` (:issue:`52087`)
- Bug in :meth:`Period.now` not accepting the ``freq`` parameter as a keyword argument (:issue:`53369`)
+- Bug in :meth:`PeriodIndex.map` with ``na_action="ignore"`` (:issue:`51644`)
- Bug in :meth:`arrays.PeriodArray.map` and :meth:`PeriodIndex.map`, where the supplied callable operated array-wise instead of element-wise (:issue:`51977`)
- Bug in incorrectly allowing construction of :class:`Period` or :class:`PeriodDtype` with :class:`CustomBusinessDay` freq; use :class:`BusinessDay` instead (:issue:`52534`)
@@ -780,29 +788,30 @@ Plotting
Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
-- Bug in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmax` return wrong dtype when used on empty DataFrameGroupBy or SeriesGroupBy (:issue:`51423`)
-- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` :class:`Datetimelike` ``origin`` has no effect in resample when values are outside of axis (:issue:`53662`)
+- Bug in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmax` returning the wrong dtype when used on an empty DataFrameGroupBy or SeriesGroupBy (:issue:`51423`)
+- Bug in :meth:`.DataFrameGroupBy.rank` on nullable dtypes when passing ``na_option="bottom"`` or ``na_option="top"`` (:issue:`54206`)
- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` in incorrectly allowing non-fixed ``freq`` when resampling on a :class:`TimedeltaIndex` (:issue:`51896`)
- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` losing time zone when resampling empty data (:issue:`53664`)
+- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` where ``origin`` has no effect in resample when values are outside of the axis (:issue:`53662`)
- Bug in weighted rolling aggregations when specifying ``min_periods=0`` (:issue:`51449`)
-- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby`, where, when the index of the
+- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` where, when the index of the
grouped :class:`Series` or :class:`DataFrame` was a :class:`DatetimeIndex`, :class:`TimedeltaIndex`
or :class:`PeriodIndex`, and the ``groupby`` method was given a function as its first argument,
- the function operated on the whole index rather than each element of the index. (:issue:`51979`)
+ the function operated on the whole index rather than each element of the index (:issue:`51979`)
- Bug in :meth:`.DataFrameGroupBy.agg` with lists not respecting ``as_index=False`` (:issue:`52849`)
-- Bug in :meth:`.DataFrameGroupBy.apply` causing an error to be raised when the input :class:`DataFrame` was subset as a :class:`DataFrame` after groupby (``[['a']]`` and not ``['a']``) and the given callable returned :class:`Series` that were not all indexed the same. (:issue:`52444`)
+- Bug in :meth:`.DataFrameGroupBy.apply` causing an error to be raised when the input :class:`DataFrame` was subset as a :class:`DataFrame` after groupby (``[['a']]`` and not ``['a']``) and the given callable returned :class:`Series` that were not all indexed the same (:issue:`52444`)
- Bug in :meth:`.DataFrameGroupBy.apply` raising a ``TypeError`` when selecting multiple columns and providing a function that returns ``np.ndarray`` results (:issue:`18930`)
-- Bug in :meth:`.GroupBy.groups` with a datetime key in conjunction with another key produced incorrect number of group keys (:issue:`51158`)
-- Bug in :meth:`.GroupBy.quantile` may implicitly sort the result index with ``sort=False`` (:issue:`53009`)
+- Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupBy.groups` with a datetime key in conjunction with another key produced an incorrect number of group keys (:issue:`51158`)
+- Bug in :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile` may implicitly sort the result index with ``sort=False`` (:issue:`53009`)
- Bug in :meth:`.SeriesGroupBy.size` where the dtype would be ``np.int64`` for data with :class:`ArrowDtype` or masked dtypes (e.g. ``Int64``) (:issue:`53831`)
-- Bug in :meth:`DataFrame.groupby` with column selection on the resulting groupby object not returning names as tuples when grouping by a list of a single element. (:issue:`53500`)
-- Bug in :meth:`.GroupBy.var` failing to raise ``TypeError`` when called with datetime64, timedelta64 or :class:`PeriodDtype` values (:issue:`52128`, :issue:`53045`)
-- Bug in :meth:`.DataFrameGroupby.resample` with ``kind="period"`` raising ``AttributeError`` (:issue:`24103`)
+- Bug in :meth:`DataFrame.groupby` with column selection on the resulting groupby object not returning names as tuples when grouping by a list consisting of a single element (:issue:`53500`)
+- Bug in :meth:`.DataFrameGroupBy.var` and :meth:`.SeriesGroupBy.var` failing to raise ``TypeError`` when called with datetime64, timedelta64 or :class:`PeriodDtype` values (:issue:`52128`, :issue:`53045`)
+- Bug in :meth:`.DataFrameGroupBy.resample` with ``kind="period"`` raising ``AttributeError`` (:issue:`24103`)
- Bug in :meth:`.Resampler.ohlc` with empty object returning a :class:`Series` instead of empty :class:`DataFrame` (:issue:`42902`)
- Bug in :meth:`.SeriesGroupBy.count` and :meth:`.DataFrameGroupBy.count` where the dtype would be ``np.int64`` for data with :class:`ArrowDtype` or masked dtypes (e.g. ``Int64``) (:issue:`53831`)
- Bug in :meth:`.SeriesGroupBy.nth` and :meth:`.DataFrameGroupBy.nth` after performing column selection when using ``dropna="any"`` or ``dropna="all"`` would not subset columns (:issue:`53518`)
- Bug in :meth:`.SeriesGroupBy.nth` and :meth:`.DataFrameGroupBy.nth` raised after performing column selection when using ``dropna="any"`` or ``dropna="all"`` resulted in rows being dropped (:issue:`53518`)
-- Bug in :meth:`.SeriesGroupBy.sum` and :meth:`.DataFrameGroupby.sum` summing ``np.inf + np.inf`` and ``(-np.inf) + (-np.inf)`` to ``np.nan`` (:issue:`53606`)
+- Bug in :meth:`.SeriesGroupBy.sum` and :meth:`.DataFrameGroupBy.sum` summing ``np.inf + np.inf`` and ``(-np.inf) + (-np.inf)`` to ``np.nan`` instead of ``np.inf`` and ``-np.inf`` respectively (:issue:`53606`)
- Bug in :meth:`Series.groupby` raising an error when grouped :class:`Series` has a :class:`DatetimeIndex` index and a :class:`Series` with a name that is a month is given to the ``by`` argument (:issue:`48509`)
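+
+A minimal sketch of the corrected :meth:`DataFrame.groupby` / :meth:`Series.groupby` behavior
+described above, using hypothetical daily data:
+
+.. code-block:: python
+
+    import pandas as pd
+
+    ser = pd.Series(range(4), index=pd.date_range("2023-01-01", periods=4, freq="D"))
+    # The callable now receives each index element (a Timestamp), not the whole index
+    ser.groupby(lambda ts: ts.day_name()).sum()
+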
Reshaping
@@ -813,6 +822,7 @@ Reshaping
- Bug in :func:`merge_asof` raising ``KeyError`` for extension dtypes (:issue:`52904`)
- Bug in :func:`merge_asof` raising ``ValueError`` for data backed by read-only ndarrays (:issue:`53513`)
- Bug in :func:`merge_asof` with ``left_index=True`` or ``right_index=True`` with mismatched index dtypes giving incorrect results in some cases instead of raising ``MergeError`` (:issue:`53870`)
+- Bug in :func:`merge` when merging on integer ``ExtensionDtype`` and float NumPy dtype raising ``TypeError`` (:issue:`46178`)
- Bug in :meth:`DataFrame.agg` and :meth:`Series.agg` on non-unique columns would return incorrect type when a dict-like argument is passed in (:issue:`51099`)
- Bug in :meth:`DataFrame.combine_first` ignoring other's columns if ``other`` is empty (:issue:`53792`)
- Bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax`, where the axis dtype would be lost for empty frames (:issue:`53265`)
@@ -825,23 +835,26 @@ Reshaping
Sparse
^^^^^^
-- Bug in :class:`SparseDtype` constructor failing to raise ``TypeError`` when given an incompatible ``dtype`` for its subtype, which must be a ``numpy`` dtype (:issue:`53160`)
+- Bug in :class:`SparseDtype` constructor failing to raise ``TypeError`` when given an incompatible ``dtype`` for its subtype, which must be a NumPy dtype (:issue:`53160`)
- Bug in :meth:`arrays.SparseArray.map` allowed the fill value to be included in the sparse values (:issue:`52095`)
ExtensionArray
^^^^^^^^^^^^^^
-- Bug in :class:`ArrowStringArray` constructor raises ``ValueError`` with dictionary types of strings (:issue:`54074`)
+- Bug in :class:`.ArrowStringArray` constructor raising ``ValueError`` with dictionary types of strings (:issue:`54074`)
- Bug in :class:`DataFrame` constructor not copying :class:`Series` with extension dtype when given in dict (:issue:`53744`)
- Bug in :class:`~arrays.ArrowExtensionArray` converting pandas non-nanosecond temporal objects from non-zero values to zero values (:issue:`53171`)
-- Bug in :meth:`Series.quantile` for pyarrow temporal types raising ArrowInvalid (:issue:`52678`)
+- Bug in :meth:`Series.quantile` for PyArrow temporal types raising ``ArrowInvalid`` (:issue:`52678`)
- Bug in :meth:`Series.rank` returning wrong order for small values with ``Float64`` dtype (:issue:`52471`)
+- Bug in :meth:`Series.unique` for boolean ``ArrowDtype`` with ``NA`` values (:issue:`54667`)
- Bug in :meth:`~arrays.ArrowExtensionArray.__iter__` and :meth:`~arrays.ArrowExtensionArray.__getitem__` returning python datetime and timedelta objects for non-nano dtypes (:issue:`53326`)
-- Bug where the :class:`DataFrame` repr would not work when a column would have an :class:`ArrowDtype` with an ``pyarrow.ExtensionDtype`` (:issue:`54063`)
-- Bug where the ``__from_arrow__`` method of masked ExtensionDtypes(e.g. :class:`Float64Dtype`, :class:`BooleanDtype`) would not accept pyarrow arrays of type ``pyarrow.null()`` (:issue:`52223`)
+- Bug in :meth:`~arrays.ArrowExtensionArray.factorize` returning incorrect uniques for a ``pyarrow.dictionary`` type ``pyarrow.chunked_array`` with more than one chunk (:issue:`54844`)
+- Bug when passing an :class:`ExtensionArray` subclass to ``dtype`` keywords. This will now raise a ``UserWarning`` to encourage passing an instance instead (:issue:`31356`, :issue:`54592`)
+- Bug where the :class:`DataFrame` repr would not work when a column had an :class:`ArrowDtype` with a ``pyarrow.ExtensionDtype`` (:issue:`54063`)
+- Bug where the ``__from_arrow__`` method of masked ExtensionDtypes (e.g. :class:`Float64Dtype`, :class:`BooleanDtype`) would not accept PyArrow arrays of type ``pyarrow.null()`` (:issue:`52223`)
Styler
^^^^^^
-- Bug in :meth:`Styler._copy` calling overridden methods in subclasses of :class:`Styler` (:issue:`52728`)
+- Bug in :meth:`.Styler._copy` calling overridden methods in subclasses of :class:`.Styler` (:issue:`52728`)
Metadata
^^^^^^^^
@@ -851,21 +864,20 @@ Metadata
Other
^^^^^
+- Bug in :meth:`.FloatingArray.__contains__` with ``NaN`` item incorrectly returning ``False`` when ``NaN`` values are present (:issue:`52840`)
- Bug in :class:`DataFrame` and :class:`Series` raising for data of complex dtype when ``NaN`` values are present (:issue:`53627`)
- Bug in :class:`DatetimeIndex` where the ``repr`` of an index passed with time does not print the time when it is midnight and the freq is non-day based (:issue:`53470`)
-- Bug in :class:`FloatingArray.__contains__` with ``NaN`` item incorrectly returning ``False`` when ``NaN`` values are present (:issue:`52840`)
-- Bug in :func:`.testing.assert_almost_equal` now throwing assertion error for two unequal sets (:issue:`51727`)
+- Bug in :func:`.testing.assert_frame_equal` and :func:`.testing.assert_series_equal` now throw an assertion error for two unequal sets (:issue:`51727`)
- Bug in :func:`.testing.assert_frame_equal` checks category dtypes even when asked not to check index type (:issue:`52126`)
- Bug in :func:`api.interchange.from_dataframe` was not respecting ``allow_copy`` argument (:issue:`54322`)
- Bug in :func:`api.interchange.from_dataframe` was raising during interchanging from non-pandas tz-aware data containing null values (:issue:`54287`)
- Bug in :func:`api.interchange.from_dataframe` when converting an empty DataFrame object (:issue:`53155`)
- Bug in :func:`from_dummies` where the resulting :class:`Index` did not match the original :class:`Index` (:issue:`54300`)
- Bug in :func:`from_dummies` where the resulting data would always be ``object`` dtype instead of the dtype of the columns (:issue:`54300`)
+- Bug in :meth:`.DataFrameGroupBy.first`, :meth:`.DataFrameGroupBy.last`, :meth:`.SeriesGroupBy.first`, and :meth:`.SeriesGroupBy.last` where an empty group would return ``np.nan`` instead of the corresponding :class:`.ExtensionArray` NA value (:issue:`39098`)
- Bug in :meth:`DataFrame.pivot_table` with casting the mean of ints back to an int (:issue:`16676`)
- Bug in :meth:`DataFrame.reindex` with a ``fill_value`` that should be inferred with a :class:`ExtensionDtype` incorrectly inferring ``object`` dtype (:issue:`52586`)
-- Bug in :meth:`DataFrame.shift` and :meth:`Series.shift` and :meth:`DataFrameGroupBy.shift` when passing both "freq" and "fill_value" silently ignoring "fill_value" instead of raising ``ValueError`` (:issue:`53832`)
- Bug in :meth:`DataFrame.shift` with ``axis=1`` on a :class:`DataFrame` with a single :class:`ExtensionDtype` column giving incorrect results (:issue:`53832`)
-- Bug in :meth:`GroupBy.first` and :meth:`GroupBy.last` where an empty group would return ``np.nan`` instead of a an ExtensionArray's NA value (:issue:`39098`)
- Bug in :meth:`Index.sort_values` when a ``key`` is passed (:issue:`52764`)
- Bug in :meth:`Series.align`, :meth:`DataFrame.align`, :meth:`Series.reindex`, :meth:`DataFrame.reindex`, :meth:`Series.interpolate`, :meth:`DataFrame.interpolate`, incorrectly failing to raise with method="asfreq" (:issue:`53620`)
- Bug in :meth:`Series.argsort` failing to raise when an invalid ``axis`` is passed (:issue:`54257`)
@@ -879,3 +891,5 @@ Other
Contributors
~~~~~~~~~~~~
+
+.. contributors:: v2.0.3..v2.1.0
diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst
new file mode 100644
index 0000000000000..6101333ae760c
--- /dev/null
+++ b/doc/source/whatsnew/v2.1.1.rst
@@ -0,0 +1,55 @@
+.. _whatsnew_211:
+
+What's new in 2.1.1 (September 20, 2023)
+----------------------------------------
+
+These are the changes in pandas 2.1.1. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+{{ header }}
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_211.regressions:
+
+Fixed regressions
+~~~~~~~~~~~~~~~~~
+- Fixed regression in :func:`concat` when :class:`DataFrame` objects have two different extension dtypes (:issue:`54848`)
+- Fixed regression in :func:`merge` when merging over a PyArrow string index (:issue:`54894`)
+- Fixed regression in :func:`read_csv` when ``usecols`` is given and ``dtypes`` is a dict for ``engine="python"`` (:issue:`54868`)
+- Fixed regression in :func:`read_csv` when ``delim_whitespace`` is True (:issue:`54918`, :issue:`54931`)
+- Fixed regression in :meth:`.GroupBy.get_group` raising for ``axis=1`` (:issue:`54858`)
+- Fixed regression in :meth:`DataFrame.__setitem__` raising ``AssertionError`` when setting a :class:`Series` with a partial :class:`MultiIndex` (:issue:`54875`)
+- Fixed regression in :meth:`DataFrame.filter` not respecting the order of elements for ``filter`` (:issue:`54980`)
+- Fixed regression in :meth:`DataFrame.to_sql` not roundtripping datetime columns correctly for sqlite (:issue:`54877`)
+- Fixed regression in :meth:`DataFrameGroupBy.agg` when aggregating a DataFrame with duplicate column names using a dictionary (:issue:`55006`)
+- Fixed regression in :meth:`MultiIndex.append` raising when appending overlapping :class:`IntervalIndex` levels (:issue:`54934`)
+- Fixed regression in :meth:`Series.drop_duplicates` for PyArrow strings (:issue:`54904`)
+- Fixed regression in :meth:`Series.interpolate` raising when ``fill_value`` was given (:issue:`54920`)
+- Fixed regression in :meth:`Series.value_counts` raising for numeric data if ``bins`` was specified (:issue:`54857`)
+- Fixed regression in comparison operations for PyArrow backed columns not propagating exceptions correctly (:issue:`54944`)
+- Fixed regression when comparing a :class:`Series` with ``datetime64`` dtype with ``None`` (:issue:`54870`)
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_211.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+- Fixed bug for :class:`ArrowDtype` raising ``NotImplementedError`` for fixed-size list (:issue:`55000`)
+- Fixed bug in :meth:`DataFrame.stack` with ``future_stack=True`` and columns a non-:class:`MultiIndex` consisting of tuples (:issue:`54948`)
+- Fixed bug in :meth:`Series.dt.tz` with :class:`ArrowDtype` where a string was returned instead of a ``tzinfo`` object (:issue:`55003`)
+- Fixed bug in :meth:`Series.pct_change` and :meth:`DataFrame.pct_change` showing unnecessary ``FutureWarning`` (:issue:`54981`)
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_211.other:
+
+Other
+~~~~~
+- Reverted the deprecation that disallowed :meth:`Series.apply` returning a :class:`DataFrame` when the passed-in callable returns a :class:`Series` object (:issue:`52116`)
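+
+A hedged sketch of the restored behavior (toy data, not taken from the original issue):
+
+.. code-block:: python
+
+    import pandas as pd
+
+    ser = pd.Series([1, 2, 3])
+    # The callable returns a Series, so the result expands to a DataFrame again
+    result = ser.apply(lambda x: pd.Series({"value": x, "squared": x ** 2}))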
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_211.contributors:
+
+Contributors
+~~~~~~~~~~~~
+
+.. contributors:: v2.1.0..v2.1.1
diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst
new file mode 100644
index 0000000000000..a13a273b01257
--- /dev/null
+++ b/doc/source/whatsnew/v2.1.2.rst
@@ -0,0 +1,70 @@
+.. _whatsnew_212:
+
+What's new in 2.1.2 (October 26, 2023)
+---------------------------------------
+
+These are the changes in pandas 2.1.2. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+{{ header }}
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_212.deprecations:
+
+Deprecations
+~~~~~~~~~~~~
+
+- Reverted deprecation of ``fill_method=None`` in :meth:`DataFrame.pct_change`, :meth:`Series.pct_change`, :meth:`DataFrameGroupBy.pct_change`, and :meth:`SeriesGroupBy.pct_change`; the values ``'backfill'``, ``'bfill'``, ``'pad'``, and ``'ffill'`` are still deprecated (:issue:`53491`)
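+
+A minimal sketch of what this means in practice, assuming a toy ``Series`` with a missing value:
+
+.. code-block:: python
+
+    import numpy as np
+    import pandas as pd
+
+    ser = pd.Series([1.0, np.nan, 3.0])
+    ser.pct_change(fill_method=None)       # allowed again without a warning
+    # ser.pct_change(fill_method="ffill")  # string fill methods remain deprecated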
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_212.regressions:
+
+Fixed regressions
+~~~~~~~~~~~~~~~~~
+- Fixed regression in :meth:`DataFrame.join` where result has missing values and dtype is arrow backed string (:issue:`55348`)
+- Fixed regression in :meth:`~DataFrame.rolling` where non-nanosecond index or ``on`` column would produce incorrect results (:issue:`55026`, :issue:`55106`, :issue:`55299`)
+- Fixed regression in :meth:`DataFrame.resample` which was extrapolating back to ``origin`` when ``origin`` was outside its bounds (:issue:`55064`)
+- Fixed regression in :meth:`DataFrame.sort_index` which was not sorting correctly when the index was a sliced :class:`MultiIndex` (:issue:`55379`)
+- Fixed regression in :meth:`DataFrameGroupBy.agg` and :meth:`SeriesGroupBy.agg` where if the option ``compute.use_numba`` was set to True, groupby methods not supported by the numba engine would raise a ``TypeError`` (:issue:`55520`)
+- Fixed performance regression with wide DataFrames, typically involving methods where all columns were accessed individually (:issue:`55256`, :issue:`55245`)
+- Fixed regression in :func:`merge_asof` raising ``TypeError`` for ``by`` with datetime and timedelta dtypes (:issue:`55453`)
+- Fixed regression in :func:`read_parquet` when reading a file with a string column consisting of more than 2 GB of string data and using the ``"string"`` dtype (:issue:`55606`)
+- Fixed regression in :meth:`DataFrame.to_sql` not roundtripping datetime columns correctly for sqlite when using ``detect_types`` (:issue:`55554`)
+- Fixed regression in construction of certain DataFrame or Series subclasses (:issue:`54922`)
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_212.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+- Fixed bug in :class:`.DataFrameGroupBy` reductions not preserving object dtype when ``infer_string`` is set (:issue:`55620`)
+- Fixed bug in :meth:`.SeriesGroupBy.value_counts` returning incorrect dtype for string columns (:issue:`55627`)
+- Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`)
+- Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`)
+- Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`)
+- Fixed bug in :meth:`DataFrame.interpolate` raising incorrect error message (:issue:`55347`)
+- Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`)
+- Fixed bug in :meth:`Series.all` and :meth:`Series.any` not treating missing values correctly for ``dtype="string[pyarrow_numpy]"`` (:issue:`55367`)
+- Fixed bug in :meth:`Series.floordiv` for :class:`ArrowDtype` (:issue:`55561`)
+- Fixed bug in :meth:`Series.mode` not sorting values for arrow backed string dtype (:issue:`55621`)
+- Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`55362`)
+- Fixed bug in :meth:`Series.str.extractall` for :class:`ArrowDtype` dtype being converted to object (:issue:`53846`)
+- Fixed bug where PDEP-6 warning about setting an item of an incompatible dtype was being shown when creating a new conditional column (:issue:`55025`)
+- Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`)
+- Fixed bug in :class:`Series` constructor not inferring string dtype when ``NA`` is the first value and ``infer_string`` is set (:issue:`55655`)
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_212.other:
+
+Other
+~~~~~
+- Fixed non-working installation of optional dependency group ``output_formatting``. Replacing the underscore ``_`` with a dash ``-`` fixes broken dependency resolution; the correct usage is now ``pip install pandas[output-formatting]``.
+-
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_212.contributors:
+
+Contributors
+~~~~~~~~~~~~
+
+.. contributors:: v2.1.1..v2.1.2
diff --git a/doc/source/whatsnew/v2.1.3.rst b/doc/source/whatsnew/v2.1.3.rst
new file mode 100644
index 0000000000000..af626895a9e0e
--- /dev/null
+++ b/doc/source/whatsnew/v2.1.3.rst
@@ -0,0 +1,33 @@
+.. _whatsnew_213:
+
+What's new in 2.1.3 (November 10, 2023)
+---------------------------------------
+
+These are the changes in pandas 2.1.3. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+{{ header }}
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_213.regressions:
+
+Fixed regressions
+~~~~~~~~~~~~~~~~~
+- Fixed infinite recursion from operations that return a new object on some DataFrame subclasses (:issue:`55763`)
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_213.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+- Bug in :meth:`DatetimeIndex.diff` raising ``TypeError`` (:issue:`55080`)
+- Bug in :meth:`Index.isin` raising for Arrow backed string and ``None`` value (:issue:`55821`)
+- Fix :func:`read_parquet` and :func:`read_feather` for `CVE-2023-47248 `__ (:issue:`55894`)
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_213.contributors:
+
+Contributors
+~~~~~~~~~~~~
+
+.. contributors:: v2.1.2..v2.1.3|HEAD
diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst
new file mode 100644
index 0000000000000..73b1103c1bd37
--- /dev/null
+++ b/doc/source/whatsnew/v2.1.4.rst
@@ -0,0 +1,45 @@
+.. _whatsnew_214:
+
+What's new in 2.1.4 (December 8, 2023)
+---------------------------------------
+
+These are the changes in pandas 2.1.4. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+{{ header }}
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_214.regressions:
+
+Fixed regressions
+~~~~~~~~~~~~~~~~~
+- Fixed regression when trying to read a pickled pandas :class:`DataFrame` from pandas 1.3 (:issue:`55137`)
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_214.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+- Bug in :class:`Series` constructor raising DeprecationWarning when ``index`` is a list of :class:`Series` (:issue:`55228`)
+- Bug in :class:`Series` when trying to cast date-like string inputs to :class:`ArrowDtype` of ``pyarrow.timestamp`` (:issue:`56266`)
+- Bug in :class:`Timestamp` construction with ``ts_input="now"`` or ``ts_input="today"`` giving a different unit from :meth:`Timestamp.now` or :meth:`Timestamp.today` (:issue:`55879`)
+- Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`)
+- Fixed bug in :func:`read_csv` not respecting object dtype when ``infer_string`` option is set (:issue:`56047`)
+- Fixed bug in :func:`to_numeric` converting to extension dtype for ``string[pyarrow_numpy]`` dtype (:issue:`56179`)
+- Fixed bug in :meth:`.DataFrameGroupBy.min` and :meth:`.DataFrameGroupBy.max` not preserving extension dtype for empty object (:issue:`55619`)
+- Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
+- Fixed bug in :meth:`DataFrame.to_hdf` raising when columns have ``StringDtype`` (:issue:`55088`)
+- Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
+- Fixed bug in :meth:`Series.__ne__` resulting in False for comparison between ``NA`` and string value for ``dtype="string[pyarrow_numpy]"`` (:issue:`56122`)
+- Fixed bug in :meth:`Series.mode` not keeping object dtype when ``infer_string`` is set (:issue:`56183`)
+- Fixed bug in :meth:`Series.reset_index` not preserving object dtype when ``infer_string`` is set (:issue:`56160`)
+- Fixed bug in :meth:`Series.str.split` and :meth:`Series.str.rsplit` when ``pat=None`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56271`)
+- Fixed bug in :meth:`Series.str.translate` losing object dtype when string option is set (:issue:`56152`)
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_214.contributors:
+
+Contributors
+~~~~~~~~~~~~
+
+.. contributors:: v2.1.3..v2.1.4
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
new file mode 100644
index 0000000000000..d9ab0452c8334
--- /dev/null
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -0,0 +1,949 @@
+.. _whatsnew_220:
+
+What's new in 2.2.0 (January 19, 2024)
+--------------------------------------
+
+These are the changes in pandas 2.2.0. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+{{ header }}
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_220.upcoming_changes:
+
+Upcoming changes in pandas 3.0
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+pandas 3.0 will bring two bigger changes to the default behavior of pandas.
+
+Copy-on-Write
+^^^^^^^^^^^^^
+
+The currently optional mode Copy-on-Write will be enabled by default in pandas 3.0. There
+won't be an option to keep the current behavior enabled. The new behavioral semantics are
+explained in the :ref:`user guide about Copy-on-Write <copy_on_write>`.
+
+The new behavior has been available since pandas 2.0 and can be enabled with the following option:
+
+.. code-block:: ipython
+
+ pd.options.mode.copy_on_write = True
+
+Copy-on-Write changes how pandas operates with respect to copies and views. Some of these
+changes allow a clear deprecation, like the changes in chained assignment. Other changes
+are more subtle; their warnings are therefore hidden behind an option that can be enabled
+in pandas 2.2.
+
+.. code-block:: ipython
+
+ pd.options.mode.copy_on_write = "warn"
+
+This mode will warn in many different scenarios that aren't actually relevant to
+most queries. We recommend exploring this mode, but it is not necessary to get rid
+of all of these warnings. The :ref:`migration guide `
+explains the upgrade process in more detail.
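+
+As an illustrative sketch (hypothetical values) of the kind of pattern the warning mode
+flags:
+
+.. code-block:: python
+
+    import pandas as pd
+
+    pd.options.mode.copy_on_write = "warn"
+
+    df = pd.DataFrame({"a": [1, 2, 3]})
+    subset = df["a"]
+
+    # Under Copy-on-Write, modifying ``subset`` will no longer update ``df``;
+    # the warning mode points out that the behavior of this pattern changes in pandas 3.0.
+    subset.iloc[0] = 10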
+
+Dedicated string data type (backed by Arrow) by default
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Historically, pandas represented string columns with NumPy object data type. This
+representation has numerous problems, including slow performance and a large memory
+footprint. This will change in pandas 3.0. pandas will start inferring string columns
+as a new ``string`` data type, backed by Arrow, which represents strings contiguously in memory. This brings
+a huge performance and memory improvement.
+
+Old behavior:
+
+.. code-block:: ipython
+
+    In [1]: ser = pd.Series(["a", "b"])
+
+    In [2]: ser
+    Out[2]:
+    0    a
+    1    b
+    dtype: object
+
+New behavior:
+
+
+.. code-block:: ipython
+
+    In [1]: ser = pd.Series(["a", "b"])
+
+    In [2]: ser
+    Out[2]:
+    0    a
+    1    b
+    dtype: string
+
+The string data type that is used in these scenarios will mostly behave as NumPy
+object would, including missing value semantics and general operations on these
+columns.
+
+This change includes a few additional changes across the API:
+
+- Currently, specifying ``dtype="string"`` creates a dtype that is backed by Python strings
+  which are stored in a NumPy array. In pandas 3.0, this dtype will instead create an
+  Arrow backed string column.
+- The column names and the Index will also be backed by Arrow strings.
+- PyArrow will become a required dependency with pandas 3.0 to accommodate this change.
+
+This future dtype inference logic can be enabled with:
+
+.. code-block:: ipython
+
+ pd.options.future.infer_string = True
+
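+As an illustrative sketch (the exact dtype repr may differ between pandas 2.x and 3.0):
+
+.. code-block:: python
+
+    import pandas as pd
+
+    pd.options.future.infer_string = True
+
+    ser = pd.Series(["a", "b"])
+    ser.dtype  # an Arrow backed string dtype instead of ``object``
+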
+.. _whatsnew_220.enhancements:
+
+Enhancements
+~~~~~~~~~~~~
+
+.. _whatsnew_220.enhancements.adbc_support:
+
+ADBC Driver support in to_sql and read_sql
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:func:`read_sql` and :meth:`~DataFrame.to_sql` now work with `Apache Arrow ADBC
+`_ drivers. Compared to
+traditional drivers used via SQLAlchemy, ADBC drivers should provide
+significant performance improvements, better type support and cleaner
+nullability handling.
+
+.. code-block:: ipython
+
+ import adbc_driver_postgresql.dbapi as pg_dbapi
+
+ df = pd.DataFrame(
+ [
+ [1, 2, 3],
+ [4, 5, 6],
+ ],
+ columns=['a', 'b', 'c']
+ )
+ uri = "postgresql://postgres:postgres@localhost/postgres"
+ with pg_dbapi.connect(uri) as conn:
+ df.to_sql("pandas_table", conn, index=False)
+
+ # for round-tripping
+ with pg_dbapi.connect(uri) as conn:
+ df2 = pd.read_sql("pandas_table", conn)
+
+The Arrow type system offers a wider array of types that can more closely match
+what databases like PostgreSQL can offer. To illustrate, note this (non-exhaustive)
+listing of types available in different databases and pandas backends:
+
++-----------------+-----------------------+----------------+---------+
+|numpy/pandas |arrow |postgres |sqlite |
++=================+=======================+================+=========+
+|int16/Int16 |int16 |SMALLINT |INTEGER |
++-----------------+-----------------------+----------------+---------+
+|int32/Int32 |int32 |INTEGER |INTEGER |
++-----------------+-----------------------+----------------+---------+
+|int64/Int64 |int64 |BIGINT |INTEGER |
++-----------------+-----------------------+----------------+---------+
+|float32 |float32 |REAL |REAL |
++-----------------+-----------------------+----------------+---------+
+|float64 |float64 |DOUBLE PRECISION|REAL |
++-----------------+-----------------------+----------------+---------+
+|object |string |TEXT |TEXT |
++-----------------+-----------------------+----------------+---------+
+|bool |``bool_`` |BOOLEAN | |
++-----------------+-----------------------+----------------+---------+
+|datetime64[ns] |timestamp(us) |TIMESTAMP | |
++-----------------+-----------------------+----------------+---------+
+|datetime64[ns,tz]|timestamp(us,tz) |TIMESTAMPTZ | |
++-----------------+-----------------------+----------------+---------+
+| |date32 |DATE | |
++-----------------+-----------------------+----------------+---------+
+| |month_day_nano_interval|INTERVAL | |
++-----------------+-----------------------+----------------+---------+
+| |binary |BINARY |BLOB |
++-----------------+-----------------------+----------------+---------+
+| |decimal128 |DECIMAL [#f1]_ | |
++-----------------+-----------------------+----------------+---------+
+| |list |ARRAY [#f1]_ | |
++-----------------+-----------------------+----------------+---------+
+| |struct |COMPOSITE TYPE | |
+| | | [#f1]_ | |
++-----------------+-----------------------+----------------+---------+
+
+.. rubric:: Footnotes
+
+.. [#f1] Not implemented as of writing, but theoretically possible
+
+If you are interested in preserving database types as faithfully as possible
+throughout the lifecycle of your DataFrame, you are encouraged to
+leverage the ``dtype_backend="pyarrow"`` argument of :func:`~pandas.read_sql`
+
+.. code-block:: ipython
+
+ # for round-tripping
+ with pg_dbapi.connect(uri) as conn:
+ df2 = pd.read_sql("pandas_table", conn, dtype_backend="pyarrow")
+
+This will prevent your data from being converted to the traditional pandas/NumPy
+type system, which often converts SQL types in ways that make them impossible to
+round-trip.
+
+For a full list of ADBC drivers and their development status, see the `ADBC Driver
+Implementation Status `_
+documentation.
+
+.. _whatsnew_220.enhancements.case_when:
+
+Create a pandas Series based on one or more conditions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The :meth:`Series.case_when` function has been added to create a Series object based on one or more conditions. (:issue:`39154`)
+
+.. ipython:: python
+
+ import pandas as pd
+
+ df = pd.DataFrame(dict(a=[1, 2, 3], b=[4, 5, 6]))
+    default = pd.Series('default', index=df.index)
+ default.case_when(
+ caselist=[
+ (df.a == 1, 'first'), # condition, replacement
+ (df.a.gt(1) & df.b.eq(5), 'second'), # condition, replacement
+ ],
+ )
+
+.. _whatsnew_220.enhancements.to_numpy_ea:
+
+``to_numpy`` for NumPy nullable and Arrow types converts to suitable NumPy dtype
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``to_numpy`` for NumPy nullable and Arrow types will now convert to a
+suitable NumPy dtype instead of ``object`` dtype.
+
+*Old behavior:*
+
+.. code-block:: ipython
+
+ In [1]: ser = pd.Series([1, 2, 3], dtype="Int64")
+ In [2]: ser.to_numpy()
+ Out[2]: array([1, 2, 3], dtype=object)
+
+*New behavior:*
+
+.. ipython:: python
+
+ ser = pd.Series([1, 2, 3], dtype="Int64")
+ ser.to_numpy()
+
+ ser = pd.Series([1, 2, 3], dtype="timestamp[ns][pyarrow]")
+ ser.to_numpy()
+
+The default NumPy dtype (without any arguments) is determined as follows:
+
+- float dtypes are cast to NumPy floats
+- integer dtypes without missing values are cast to NumPy integer dtypes
+- integer dtypes with missing values are cast to NumPy float dtypes and ``NaN`` is used as missing value indicator
+- boolean dtypes without missing values are cast to NumPy bool dtype
+- boolean dtypes with missing values keep object dtype
+- datetime and timedelta types are cast to NumPy datetime64 and timedelta64 types respectively and ``NaT`` is used as missing value indicator
+
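+As an illustrative sketch of the rules above (hypothetical values):
+
+.. code-block:: python
+
+    import pandas as pd
+
+    # integer dtype with missing values -> NumPy float dtype, NaN as missing value indicator
+    pd.Series([1, 2, None], dtype="Int64").to_numpy()
+
+    # boolean dtype with missing values -> object dtype, since NumPy bool cannot hold missing values
+    pd.Series([True, None], dtype="boolean").to_numpy()
+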
+.. _whatsnew_220.enhancements.struct_accessor:
+
+Series.struct accessor for PyArrow structured data
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``Series.struct`` accessor provides attributes and methods for processing
+data with ``struct[pyarrow]`` dtype Series. For example,
+:meth:`Series.struct.explode` converts PyArrow structured data to a pandas
+DataFrame. (:issue:`54938`)
+
+.. ipython:: python
+
+ import pyarrow as pa
+ series = pd.Series(
+ [
+ {"project": "pandas", "version": "2.2.0"},
+ {"project": "numpy", "version": "1.25.2"},
+ {"project": "pyarrow", "version": "13.0.0"},
+ ],
+ dtype=pd.ArrowDtype(
+ pa.struct([
+ ("project", pa.string()),
+ ("version", pa.string()),
+ ])
+ ),
+ )
+ series.struct.explode()
+
+Use :meth:`Series.struct.field` to index into a (possibly nested)
+struct field.
+
+
+.. ipython:: python
+
+ series.struct.field("project")
+
+.. _whatsnew_220.enhancements.list_accessor:
+
+Series.list accessor for PyArrow list data
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``Series.list`` accessor provides attributes and methods for processing
+data with ``list[pyarrow]`` dtype Series. For example,
+:meth:`Series.list.__getitem__` allows indexing pyarrow lists in
+a Series. (:issue:`55323`)
+
+.. ipython:: python
+
+ import pyarrow as pa
+ series = pd.Series(
+ [
+ [1, 2, 3],
+ [4, 5],
+ [6],
+ ],
+ dtype=pd.ArrowDtype(
+ pa.list_(pa.int64())
+ ),
+ )
+ series.list[0]
+
+.. _whatsnew_220.enhancements.calamine:
+
+Calamine engine for :func:`read_excel`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``calamine`` engine was added to :func:`read_excel`.
+It uses ``python-calamine``, which provides Python bindings for the Rust library `calamine `__.
+This engine supports Excel files (``.xlsx``, ``.xlsm``, ``.xls``, ``.xlsb``) and OpenDocument spreadsheets (``.ods``) (:issue:`50395`).
+
+There are two advantages of this engine:
+
+1. Calamine is often faster than other engines; some benchmarks show results up to 5x faster
+   than 'openpyxl', 20x faster than 'odf', 4x faster than 'pyxlsb', and 1.5x faster than 'xlrd'.
+   However, 'openpyxl' and 'pyxlsb' are faster at reading a few rows from large files because of lazy iteration over rows.
+2. Calamine supports recognition of datetime values in ``.xlsb`` files, unlike 'pyxlsb', the only other engine in pandas that can read ``.xlsb`` files.
+
+.. code-block:: python
+
+ pd.read_excel("path_to_file.xlsb", engine="calamine")
+
+
+For more, see :ref:`io.calamine` in the user guide on IO tools.
+
+.. _whatsnew_220.enhancements.other:
+
+Other enhancements
+^^^^^^^^^^^^^^^^^^
+
+- :meth:`~DataFrame.to_sql` with the ``method`` parameter set to ``multi`` now works with the Oracle backend
+- :attr:`Series.attrs` / :attr:`DataFrame.attrs` now uses a deepcopy for propagating ``attrs`` (:issue:`54134`).
+- :func:`get_dummies` now returning extension dtypes ``boolean`` or ``bool[pyarrow]`` that are compatible with the input dtype (:issue:`56273`)
+- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"`` (:issue:`54480`)
+- :func:`read_sas` returns ``datetime64`` dtypes with resolutions better matching those stored natively in SAS, and avoids returning object-dtype in cases that cannot be stored with ``datetime64[ns]`` dtype (:issue:`56127`)
+- :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs` (:issue:`54264`)
+- :func:`tseries.api.guess_datetime_format` is now part of the public API (:issue:`54727`)
+- :meth:`DataFrame.apply` now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
+- :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
+- :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`)
+- :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area``; 3rd party :class:`.ExtensionArray` authors need to add this argument to the method ``_pad_or_backfill`` (:issue:`56492`)
+- Allow passing ``read_only``, ``data_only`` and ``keep_links`` arguments to openpyxl using ``engine_kwargs`` of :func:`read_excel` (:issue:`55027`)
+- Implement :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` for :class:`ArrowDtype` and masked dtypes (:issue:`56267`)
+- Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`)
+- Implemented :meth:`Series.dt` methods and attributes for :class:`ArrowDtype` with ``pyarrow.duration`` type (:issue:`52284`)
+- Implemented :meth:`Series.str.extract` for :class:`ArrowDtype` (:issue:`56268`)
+- Improved error message that appears in :meth:`DatetimeIndex.to_period` with frequencies which are not supported as period frequencies, such as ``"BMS"`` (:issue:`56243`)
+- Improved error message when constructing :class:`Period` with invalid offsets such as ``"QS"`` (:issue:`55785`)
+- The dtypes ``string[pyarrow]`` and ``string[pyarrow_numpy]`` now both utilize the ``large_string`` type from PyArrow to avoid overflow for long columns (:issue:`56259`)
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_220.notable_bug_fixes:
+
+Notable bug fixes
+~~~~~~~~~~~~~~~~~
+
+These are bug fixes that might have notable behavior changes.
+
+.. _whatsnew_220.notable_bug_fixes.merge_sort_behavior:
+
+:func:`merge` and :meth:`DataFrame.join` now consistently follow documented sort behavior
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In previous versions of pandas, :func:`merge` and :meth:`DataFrame.join` did not
+always return a result that followed the documented sort behavior. pandas now
+follows the documented sort behavior in merge and join operations (:issue:`54611`, :issue:`56426`, :issue:`56443`).
+
+As documented, ``sort=True`` sorts the join keys lexicographically in the resulting
+:class:`DataFrame`. With ``sort=False``, the order of the join keys depends on the
+join type (``how`` keyword):
+
+- ``how="left"``: preserve the order of the left keys
+- ``how="right"``: preserve the order of the right keys
+- ``how="inner"``: preserve the order of the left keys
+- ``how="outer"``: sort keys lexicographically
+
+One example with changing behavior is inner joins with non-unique left join keys
+and ``sort=False``:
+
+.. ipython:: python
+
+ left = pd.DataFrame({"a": [1, 2, 1]})
+ right = pd.DataFrame({"a": [1, 2]})
+ result = pd.merge(left, right, how="inner", on="a", sort=False)
+
+*Old Behavior*
+
+.. code-block:: ipython
+
+ In [5]: result
+ Out[5]:
+ a
+ 0 1
+ 1 1
+ 2 2
+
+*New Behavior*
+
+.. ipython:: python
+
+ result
+
+.. _whatsnew_220.notable_bug_fixes.multiindex_join_different_levels:
+
+:func:`merge` and :meth:`DataFrame.join` no longer reorder levels when levels differ
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In previous versions of pandas, :func:`merge` and :meth:`DataFrame.join` would reorder
+index levels when joining on two indexes with different levels (:issue:`34133`).
+
+.. ipython:: python
+
+ left = pd.DataFrame({"left": 1}, index=pd.MultiIndex.from_tuples([("x", 1), ("x", 2)], names=["A", "B"]))
+ right = pd.DataFrame({"right": 2}, index=pd.MultiIndex.from_tuples([(1, 1), (2, 2)], names=["B", "C"]))
+ left
+ right
+ result = left.join(right)
+
+*Old Behavior*
+
+.. code-block:: ipython
+
+ In [5]: result
+ Out[5]:
+ left right
+ B A C
+ 1 x 1 1 2
+ 2 x 2 1 2
+
+*New Behavior*
+
+.. ipython:: python
+
+ result
+
+.. _whatsnew_220.api_breaking.deps:
+
+Increased minimum versions for dependencies
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+For `optional dependencies `_ the general recommendation is to use the latest version.
+Optional dependencies below the lowest tested version may still work but are not considered supported.
+The following table lists the optional dependencies that have had their minimum tested version increased.
+
++-----------------+---------------------+
+| Package | New Minimum Version |
++=================+=====================+
+| beautifulsoup4 | 4.11.2 |
++-----------------+---------------------+
+| blosc | 1.21.3 |
++-----------------+---------------------+
+| bottleneck | 1.3.6 |
++-----------------+---------------------+
+| fastparquet | 2022.12.0 |
++-----------------+---------------------+
+| fsspec | 2022.11.0 |
++-----------------+---------------------+
+| gcsfs | 2022.11.0 |
++-----------------+---------------------+
+| lxml | 4.9.2 |
++-----------------+---------------------+
+| matplotlib | 3.6.3 |
++-----------------+---------------------+
+| numba | 0.56.4 |
++-----------------+---------------------+
+| numexpr | 2.8.4 |
++-----------------+---------------------+
+| qtpy | 2.3.0 |
++-----------------+---------------------+
+| openpyxl | 3.1.0 |
++-----------------+---------------------+
+| psycopg2 | 2.9.6 |
++-----------------+---------------------+
+| pyreadstat | 1.2.0 |
++-----------------+---------------------+
+| pytables | 3.8.0 |
++-----------------+---------------------+
+| pyxlsb | 1.0.10 |
++-----------------+---------------------+
+| s3fs | 2022.11.0 |
++-----------------+---------------------+
+| scipy | 1.10.0 |
++-----------------+---------------------+
+| sqlalchemy | 2.0.0 |
++-----------------+---------------------+
+| tabulate | 0.9.0 |
++-----------------+---------------------+
+| xarray | 2022.12.0 |
++-----------------+---------------------+
+| xlsxwriter | 3.0.5 |
++-----------------+---------------------+
+| zstandard | 0.19.0 |
++-----------------+---------------------+
+| pyqt5 | 5.15.8 |
++-----------------+---------------------+
+| tzdata | 2022.7 |
++-----------------+---------------------+
+
+See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more.
+
+.. _whatsnew_220.api_breaking.other:
+
+Other API changes
+^^^^^^^^^^^^^^^^^
+- The hash values of nullable extension dtypes changed to improve the performance of the hashing operation (:issue:`56507`)
+- ``check_exact`` now only takes effect for floating-point dtypes in :func:`testing.assert_frame_equal` and :func:`testing.assert_series_equal`. In particular, integer dtypes are always checked exactly; see the example below (:issue:`55882`)
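+
+A minimal sketch of the new ``check_exact`` semantics (hypothetical values):
+
+.. code-block:: python
+
+    import pandas as pd
+    import pandas.testing as tm
+
+    # Integer dtypes are always compared exactly, regardless of ``check_exact``
+    tm.assert_series_equal(pd.Series([1, 2]), pd.Series([1, 2]), check_exact=False)
+
+    # ``check_exact`` only matters for floating-point dtypes: a tiny difference
+    # passes with ``check_exact=False`` but would fail with ``check_exact=True``
+    tm.assert_series_equal(
+        pd.Series([1.0, 2.0]),
+        pd.Series([1.0 + 1e-9, 2.0]),
+        check_exact=False,
+    )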
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_220.deprecations:
+
+Deprecations
+~~~~~~~~~~~~
+
+Chained assignment
+^^^^^^^^^^^^^^^^^^
+
+In preparation for larger upcoming changes to the copy / view behaviour in pandas 3.0
+(:ref:`copy_on_write`, PDEP-7), we started deprecating *chained assignment*.
+
+Chained assignment occurs when you try to update a pandas DataFrame or Series through
+two subsequent indexing operations. Depending on the type and order of those operations,
+this currently may or may not work.
+
+A typical example is as follows:
+
+.. code-block:: python
+
+ df = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
+
+ # first selecting rows with a mask, then assigning values to a column
+ # -> this has never worked and raises a SettingWithCopyWarning
+ df[df["bar"] > 5]["foo"] = 100
+
+ # first selecting the column, and then assigning to a subset of that column
+ # -> this currently works
+ df["foo"][df["bar"] > 5] = 100
+
+This second example of chained assignment currently works to update the original ``df``.
+This will no longer work in pandas 3.0, and therefore we started deprecating this:
+
+.. code-block:: python
+
+ >>> df["foo"][df["bar"] > 5] = 100
+ FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!
+ You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
+ A typical example is when you are setting values in a column of a DataFrame, like:
+
+ df["col"][row_indexer] = value
+
+ Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.
+
+ See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
+
+You can fix this warning and ensure your code is ready for pandas 3.0 by removing
+the usage of chained assignment. Typically, this can be done by doing the assignment
+in a single step using for example ``.loc``. For the example above, we can do:
+
+.. code-block:: python
+
+ df.loc[df["bar"] > 5, "foo"] = 100
+
+The same deprecation applies to inplace methods that are done in a chained manner, such as:
+
+.. code-block:: python
+
+ >>> df["foo"].fillna(0, inplace=True)
+ FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
+ The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
+
+ For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
+
+When the goal is to update the column in the DataFrame ``df``, the alternative here is
+to call the method on ``df`` itself, such as ``df.fillna({"foo": 0}, inplace=True)``.
+
+See more details in the :ref:`migration guide `.
+
+
+Deprecate aliases ``M``, ``Q``, ``Y``, etc. in favour of ``ME``, ``QE``, ``YE``, etc. for offsets
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Deprecated the following frequency aliases (:issue:`9586`):
+
++-------------------------------+------------------+------------------+
+|offsets |deprecated aliases|new aliases |
++===============================+==================+==================+
+|:class:`MonthEnd` | ``M`` | ``ME`` |
++-------------------------------+------------------+------------------+
+|:class:`BusinessMonthEnd` | ``BM`` | ``BME`` |
++-------------------------------+------------------+------------------+
+|:class:`SemiMonthEnd` | ``SM`` | ``SME`` |
++-------------------------------+------------------+------------------+
+|:class:`CustomBusinessMonthEnd`| ``CBM`` | ``CBME`` |
++-------------------------------+------------------+------------------+
+|:class:`QuarterEnd` | ``Q`` | ``QE`` |
++-------------------------------+------------------+------------------+
+|:class:`BQuarterEnd` | ``BQ`` | ``BQE`` |
++-------------------------------+------------------+------------------+
+|:class:`YearEnd` | ``Y`` | ``YE`` |
++-------------------------------+------------------+------------------+
+|:class:`BYearEnd` | ``BY`` | ``BYE`` |
++-------------------------------+------------------+------------------+
+
+For example:
+
+*Previous behavior*:
+
+.. code-block:: ipython
+
+ In [8]: pd.date_range('2020-01-01', periods=3, freq='Q-NOV')
+ Out[8]:
+ DatetimeIndex(['2020-02-29', '2020-05-31', '2020-08-31'],
+ dtype='datetime64[ns]', freq='Q-NOV')
+
+*Future behavior*:
+
+.. ipython:: python
+
+ pd.date_range('2020-01-01', periods=3, freq='QE-NOV')
+
+Deprecated automatic downcasting
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Deprecated the automatic downcasting of object dtype results in a number of
+methods. These would silently change the dtype in a hard-to-predict manner since the
+behavior was value dependent. Additionally, pandas is moving away from silent dtype
+changes (:issue:`54710`, :issue:`54261`).
+
+These methods are:
+
+- :meth:`Series.replace` and :meth:`DataFrame.replace`
+- :meth:`DataFrame.fillna`, :meth:`Series.fillna`
+- :meth:`DataFrame.ffill`, :meth:`Series.ffill`
+- :meth:`DataFrame.bfill`, :meth:`Series.bfill`
+- :meth:`DataFrame.mask`, :meth:`Series.mask`
+- :meth:`DataFrame.where`, :meth:`Series.where`
+- :meth:`DataFrame.clip`, :meth:`Series.clip`
+
+Explicitly call :meth:`DataFrame.infer_objects` to replicate the current behavior in the future.
+
+.. code-block:: ipython
+
+ result = result.infer_objects(copy=False)
+
+Alternatively, if all the floats are round numbers, cast them to ints explicitly using ``astype``.
+
+Set the following option to opt into the future behavior:
+
+.. code-block:: ipython
+
+ In [9]: pd.set_option("future.no_silent_downcasting", True)
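+
+As an illustrative sketch of an operation that triggers this deprecation (hypothetical
+values; whether downcasting would have occurred is value dependent):
+
+.. code-block:: python
+
+    import numpy as np
+    import pandas as pd
+
+    ser = pd.Series([True, False, np.nan], dtype=object)
+
+    # The object-dtype result may previously have been silently downcast to bool;
+    # this now emits a ``FutureWarning`` instead.
+    filled = ser.fillna(False)
+
+    # Explicitly opt into the old downcasting behavior:
+    filled = filled.infer_objects(copy=False)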
+
+Other Deprecations
+^^^^^^^^^^^^^^^^^^
+- Changed :meth:`Timedelta.resolution_string` to return ``h``, ``min``, ``s``, ``ms``, ``us``, and ``ns`` instead of ``H``, ``T``, ``S``, ``L``, ``U``, and ``N``, for compatibility with respective deprecations in frequency aliases (:issue:`52536`)
+- Deprecated :attr:`offsets.Day.delta`, :attr:`offsets.Hour.delta`, :attr:`offsets.Minute.delta`, :attr:`offsets.Second.delta`, :attr:`offsets.Milli.delta`, :attr:`offsets.Micro.delta`, :attr:`offsets.Nano.delta`, use ``pd.Timedelta(obj)`` instead (:issue:`55498`)
+- Deprecated :func:`pandas.api.types.is_interval` and :func:`pandas.api.types.is_period`, use ``isinstance(obj, pd.Interval)`` and ``isinstance(obj, pd.Period)`` instead (:issue:`55264`)
+- Deprecated :func:`read_gbq` and :meth:`DataFrame.to_gbq`. Use ``pandas_gbq.read_gbq`` and ``pandas_gbq.to_gbq`` instead https://pandas-gbq.readthedocs.io/en/latest/api.html (:issue:`55525`)
+- Deprecated :meth:`.DataFrameGroupBy.fillna` and :meth:`.SeriesGroupBy.fillna`; use :meth:`.DataFrameGroupBy.ffill`, :meth:`.DataFrameGroupBy.bfill` for forward and backward filling or :meth:`.DataFrame.fillna` to fill with a single value (or the Series equivalents) (:issue:`55718`)
+- Deprecated :meth:`DateOffset.is_anchored`, use ``obj.n == 1`` for non-Tick subclasses (for Tick this was always False) (:issue:`55388`)
+- Deprecated :meth:`DatetimeArray.__init__` and :meth:`TimedeltaArray.__init__`, use :func:`array` instead (:issue:`55623`)
+- Deprecated :meth:`Index.format`, use ``index.astype(str)`` or ``index.map(formatter)`` instead (:issue:`55413`)
+- Deprecated :meth:`Series.ravel`, the underlying array is already 1D, so ravel is not necessary (:issue:`52511`)
+- Deprecated :meth:`Series.resample` and :meth:`DataFrame.resample` with a :class:`PeriodIndex` (and the 'convention' keyword), convert to :class:`DatetimeIndex` (with ``.to_timestamp()``) before resampling instead (:issue:`53481`)
+- Deprecated :meth:`Series.view`, use :meth:`Series.astype` instead to change the dtype (:issue:`20251`)
+- Deprecated :meth:`offsets.Tick.is_anchored`, use ``False`` instead (:issue:`55388`)
+- Deprecated ``core.internals`` members ``Block``, ``ExtensionBlock``, and ``DatetimeTZBlock``, use public APIs instead (:issue:`55139`)
+- Deprecated ``year``, ``month``, ``quarter``, ``day``, ``hour``, ``minute``, and ``second`` keywords in the :class:`PeriodIndex` constructor, use :meth:`PeriodIndex.from_fields` instead (:issue:`55960`)
+- Deprecated accepting a type as an argument in :meth:`Index.view`, call without any arguments instead (:issue:`55709`)
+- Deprecated allowing non-integer ``periods`` argument in :func:`date_range`, :func:`timedelta_range`, :func:`period_range`, and :func:`interval_range` (:issue:`56036`)
+- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_clipboard` (:issue:`54229`)
+- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_csv` except ``path_or_buf`` (:issue:`54229`)
+- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_dict` (:issue:`54229`)
+- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_excel` except ``excel_writer`` (:issue:`54229`)
+- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_gbq` except ``destination_table`` (:issue:`54229`)
+- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_hdf` except ``path_or_buf`` (:issue:`54229`)
+- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_html` except ``buf`` (:issue:`54229`)
+- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_json` except ``path_or_buf`` (:issue:`54229`)
+- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_latex` except ``buf`` (:issue:`54229`)
+- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_markdown` except ``buf`` (:issue:`54229`)
+- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_parquet` except ``path`` (:issue:`54229`)
+- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_pickle` except ``path`` (:issue:`54229`)
+- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_string` except ``buf`` (:issue:`54229`)
+- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_xml` except ``path_or_buffer`` (:issue:`54229`)
+- Deprecated allowing passing :class:`BlockManager` objects to :class:`DataFrame` or :class:`SingleBlockManager` objects to :class:`Series` (:issue:`52419`)
+- Deprecated behavior of :meth:`Index.insert` with an object-dtype index silently performing type inference on the result, explicitly call ``result.infer_objects(copy=False)`` for the old behavior instead (:issue:`51363`)
+- Deprecated casting non-datetimelike values (mainly strings) in :meth:`Series.isin` and :meth:`Index.isin` with ``datetime64``, ``timedelta64``, and :class:`PeriodDtype` dtypes (:issue:`53111`)
+- Deprecated dtype inference in :class:`Index`, :class:`Series` and :class:`DataFrame` constructors when giving a pandas input, call ``.infer_objects`` on the input to keep the current behavior (:issue:`56012`)
+- Deprecated dtype inference when setting a :class:`Index` into a :class:`DataFrame`, cast explicitly instead (:issue:`56102`)
+- Deprecated including the groups in computations when using :meth:`.DataFrameGroupBy.apply` and :meth:`.DataFrameGroupBy.resample`; pass ``include_groups=False`` to exclude the groups (:issue:`7155`)
+- Deprecated indexing an :class:`Index` with a boolean indexer of length zero (:issue:`55820`)
+- Deprecated not passing a tuple to :class:`.DataFrameGroupBy.get_group` or :class:`.SeriesGroupBy.get_group` when grouping by a length-1 list-like (:issue:`25971`)
+- Deprecated string ``AS`` denoting frequency in :class:`YearBegin` and strings ``AS-DEC``, ``AS-JAN``, etc. denoting annual frequencies with various fiscal year starts (:issue:`54275`)
+- Deprecated string ``A`` denoting frequency in :class:`YearEnd` and strings ``A-DEC``, ``A-JAN``, etc. denoting annual frequencies with various fiscal year ends (:issue:`54275`)
+- Deprecated string ``BAS`` denoting frequency in :class:`BYearBegin` and strings ``BAS-DEC``, ``BAS-JAN``, etc. denoting annual frequencies with various fiscal year starts (:issue:`54275`)
+- Deprecated string ``BA`` denoting frequency in :class:`BYearEnd` and strings ``BA-DEC``, ``BA-JAN``, etc. denoting annual frequencies with various fiscal year ends (:issue:`54275`)
+- Deprecated strings ``H``, ``BH``, and ``CBH`` denoting frequencies in :class:`Hour`, :class:`BusinessHour`, :class:`CustomBusinessHour` (:issue:`52536`)
+- Deprecated strings ``H``, ``S``, ``U``, and ``N`` denoting units in :func:`to_timedelta` (:issue:`52536`)
+- Deprecated strings ``H``, ``T``, ``S``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`52536`)
+- Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`52536`)
+- Deprecated support for combining parsed datetime columns in :func:`read_csv` along with the ``keep_date_col`` keyword (:issue:`55569`)
+- Deprecated the :attr:`.DataFrameGroupBy.grouper` and :attr:`SeriesGroupBy.grouper`; these attributes will be removed in a future version of pandas (:issue:`56521`)
+- Deprecated the :class:`.Grouping` attributes ``group_index``, ``result_index``, and ``group_arraylike``; these will be removed in a future version of pandas (:issue:`56148`)
+- Deprecated the ``delim_whitespace`` keyword in :func:`read_csv` and :func:`read_table`, use ``sep="\\s+"`` instead (:issue:`55569`)
+- Deprecated the ``errors="ignore"`` option in :func:`to_datetime`, :func:`to_timedelta`, and :func:`to_numeric`; explicitly catch exceptions instead (:issue:`54467`)
+- Deprecated the ``fastpath`` keyword in the :class:`Series` constructor (:issue:`20110`)
+- Deprecated the ``kind`` keyword in :meth:`Series.resample` and :meth:`DataFrame.resample`, explicitly cast the object's ``index`` instead (:issue:`55895`)
+- Deprecated the ``ordinal`` keyword in :class:`PeriodIndex`, use :meth:`PeriodIndex.from_ordinals` instead (:issue:`55960`)
+- Deprecated the ``unit`` keyword in :class:`TimedeltaIndex` construction, use :func:`to_timedelta` instead (:issue:`55499`)
+- Deprecated the ``verbose`` keyword in :func:`read_csv` and :func:`read_table` (:issue:`55569`)
+- Deprecated the behavior of :meth:`DataFrame.replace` and :meth:`Series.replace` with :class:`CategoricalDtype`; in a future version replace will change the values while preserving the categories. To change the categories, use ``ser.cat.rename_categories`` instead (:issue:`55147`)
+- Deprecated the behavior of :meth:`Series.value_counts` and :meth:`Index.value_counts` with object dtype; in a future version these will not perform dtype inference on the resulting :class:`Index`, do ``result.index = result.index.infer_objects()`` to retain the old behavior (:issue:`56161`)
+- Deprecated the default of ``observed=False`` in :meth:`DataFrame.pivot_table`; will be ``True`` in a future version (:issue:`56236`)
+- Deprecated the extension test classes ``BaseNoReduceTests``, ``BaseBooleanReduceTests``, and ``BaseNumericReduceTests``, use ``BaseReduceTests`` instead (:issue:`54663`)
+- Deprecated the option ``mode.data_manager`` and the ``ArrayManager``; only the ``BlockManager`` will be available in future versions (:issue:`55043`)
+- Deprecated the previous implementation of :class:`DataFrame.stack`; specify ``future_stack=True`` to adopt the future version (:issue:`53515`)
+-
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_220.performance:
+
+Performance improvements
+~~~~~~~~~~~~~~~~~~~~~~~~
+- Performance improvement in :func:`.testing.assert_frame_equal` and :func:`.testing.assert_series_equal` (:issue:`55949`, :issue:`55971`)
+- Performance improvement in :func:`concat` with ``axis=1`` and objects with unaligned indexes (:issue:`55084`)
+- Performance improvement in :func:`get_dummies` (:issue:`56089`)
+- Performance improvement in :func:`merge` and :func:`merge_ordered` when joining on sorted ascending keys (:issue:`56115`)
+- Performance improvement in :func:`merge_asof` when ``by`` is not ``None`` (:issue:`55580`, :issue:`55678`)
+- Performance improvement in :func:`read_stata` for files with many variables (:issue:`55515`)
+- Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`)
+- Performance improvement in :meth:`DataFrame.join` when joining on unordered categorical indexes (:issue:`56345`)
+- Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` when indexing with a :class:`MultiIndex` (:issue:`56062`)
+- Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`)
+- Performance improvement in :meth:`DataFrame.to_dict` on converting DataFrame to dictionary (:issue:`50990`)
+- Performance improvement in :meth:`Index.difference` (:issue:`55108`)
+- Performance improvement in :meth:`Index.sort_values` when index is already sorted (:issue:`56128`)
+- Performance improvement in :meth:`MultiIndex.get_indexer` when ``method`` is not ``None`` (:issue:`55839`)
+- Performance improvement in :meth:`Series.duplicated` for pyarrow dtypes (:issue:`55255`)
+- Performance improvement in :meth:`Series.str.get_dummies` when dtype is ``"string[pyarrow]"`` or ``"string[pyarrow_numpy]"`` (:issue:`56110`)
+- Performance improvement in :meth:`Series.str` methods (:issue:`55736`)
+- Performance improvement in :meth:`Series.value_counts` and :meth:`Series.mode` for masked dtypes (:issue:`54984`, :issue:`55340`)
+- Performance improvement in :meth:`.DataFrameGroupBy.nunique` and :meth:`.SeriesGroupBy.nunique` (:issue:`55972`)
+- Performance improvement in :meth:`.SeriesGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.DataFrameGroupBy.idxmin` (:issue:`54234`)
+- Performance improvement when hashing a nullable extension array (:issue:`56507`)
+- Performance improvement when indexing into a non-unique index (:issue:`55816`)
+- Performance improvement when indexing with more than 4 keys (:issue:`54550`)
+- Performance improvement when localizing time to UTC (:issue:`55241`)
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_220.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+
+Categorical
+^^^^^^^^^^^
+- :meth:`Categorical.isin` raising ``InvalidIndexError`` for categorical containing overlapping :class:`Interval` values (:issue:`34974`)
+- Bug in :meth:`CategoricalDtype.__eq__` returning ``False`` for unordered categorical data with mixed types (:issue:`55468`)
+- Bug when casting ``pa.dictionary`` to :class:`CategoricalDtype` using a ``pa.DictionaryArray`` as categories (:issue:`56672`)
+
+Datetimelike
+^^^^^^^^^^^^
+- Bug in :class:`DatetimeIndex` construction when passing both a ``tz`` and either ``dayfirst`` or ``yearfirst`` ignoring dayfirst/yearfirst (:issue:`55813`)
+- Bug in :class:`DatetimeIndex` when passing an object-dtype ndarray of float objects and a ``tz`` incorrectly localizing the result (:issue:`55780`)
+- Bug in :func:`Series.isin` with :class:`DatetimeTZDtype` dtype and comparison values that are all ``NaT`` incorrectly returning all-``False`` even if the series contains ``NaT`` entries (:issue:`56427`)
+- Bug in :func:`concat` raising ``AttributeError`` when concatenating all-NA DataFrame with :class:`DatetimeTZDtype` dtype DataFrame (:issue:`52093`)
+- Bug in :func:`testing.assert_extension_array_equal` that could use the wrong unit when comparing resolutions (:issue:`55730`)
+- Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing a list of mixed-string-and-numeric types incorrectly raising (:issue:`55780`)
+- Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing mixed-type objects with a mix of timezones or mix of timezone-awareness failing to raise ``ValueError`` (:issue:`55693`)
+- Bug in :meth:`.Tick.delta` with very large ticks raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`)
+- Bug in :meth:`DatetimeIndex.shift` with non-nanosecond resolution incorrectly returning with nanosecond resolution (:issue:`56117`)
+- Bug in :meth:`DatetimeIndex.union` returning object dtype for tz-aware indexes with the same timezone but different units (:issue:`55238`)
+- Bug in :meth:`Index.is_monotonic_increasing` and :meth:`Index.is_monotonic_decreasing` always caching :meth:`Index.is_unique` as ``True`` when first value in index is ``NaT`` (:issue:`55755`)
+- Bug in :meth:`Index.view` to a datetime64 dtype with non-supported resolution incorrectly raising (:issue:`55710`)
+- Bug in :meth:`Series.dt.round` with non-nanosecond resolution and ``NaT`` entries incorrectly raising ``OverflowError`` (:issue:`56158`)
+- Bug in :meth:`Series.fillna` with non-nanosecond resolution dtypes and higher-resolution vector values returning incorrect (internally-corrupted) results (:issue:`56410`)
+- Bug in :meth:`Timestamp.unit` being inferred incorrectly from an ISO8601 format string with minute or hour resolution and a timezone offset (:issue:`56208`)
+- Bug in ``.astype`` converting from a higher-resolution ``datetime64`` dtype to a lower-resolution ``datetime64`` dtype (e.g. ``datetime64[us]->datetime64[ms]``) silently overflowing with values near the lower implementation bound (:issue:`55979`)
+- Bug in adding or subtracting a :class:`Week` offset to a ``datetime64`` :class:`Series`, :class:`Index`, or :class:`DataFrame` column with non-nanosecond resolution returning incorrect results (:issue:`55583`)
+- Bug in addition or subtraction of :class:`BusinessDay` offset with ``offset`` attribute to non-nanosecond :class:`Index`, :class:`Series`, or :class:`DataFrame` column giving incorrect results (:issue:`55608`)
+- Bug in addition or subtraction of :class:`DateOffset` objects with microsecond components to ``datetime64`` :class:`Index`, :class:`Series`, or :class:`DataFrame` columns with non-nanosecond resolution (:issue:`55595`)
+- Bug in addition or subtraction of very large :class:`.Tick` objects with :class:`Timestamp` or :class:`Timedelta` objects raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`)
+- Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond :class:`DatetimeTZDtype` and inputs that would be out of bounds with nanosecond resolution incorrectly raising ``OutOfBoundsDatetime`` (:issue:`54620`)
+- Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` (or :class:`DatetimeTZDtype`) from mixed-numeric inputs treating those as nanoseconds instead of as multiples of the dtype's unit (which would happen with non-mixed numeric inputs) (:issue:`56004`)
+- Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`)
+- Bug in parsing datetime strings with nanosecond resolution with non-ISO8601 formats incorrectly truncating sub-microsecond components (:issue:`56051`)
+- Bug in parsing datetime strings with sub-second resolution and trailing zeros incorrectly inferring second or millisecond resolution (:issue:`55737`)
+- Bug in the results of :func:`to_datetime` with a floating-dtype argument with ``unit`` not matching the pointwise results of :class:`Timestamp` (:issue:`56037`)
+- Fixed regression where :func:`concat` would raise an error when concatenating ``datetime64`` columns with differing resolutions (:issue:`53641`)
+
+Timedelta
+^^^^^^^^^
+- Bug in :class:`Timedelta` construction raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`)
+- Bug in rendering (``__repr__``) of :class:`TimedeltaIndex` and :class:`Series` with timedelta64 values with non-nanosecond resolution entries that are all multiples of 24 hours failing to use the compact representation used in the nanosecond cases (:issue:`55405`)
+
+Timezones
+^^^^^^^^^
+- Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`)
+- Bug in :class:`Timestamp` construction with an ambiguous value and a ``pytz`` timezone failing to raise ``pytz.AmbiguousTimeError`` (:issue:`55657`)
+- Bug in :meth:`Timestamp.tz_localize` with ``nonexistent="shift_forward"`` around UTC+0 during DST (:issue:`51501`)
+
+Numeric
+^^^^^^^
+- Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`)
+- Bug in :meth:`Series.__floordiv__` and :meth:`Series.__truediv__` for :class:`ArrowDtype` with integral dtypes raising for large divisors (:issue:`56706`)
+- Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` with integral dtypes raising for large values (:issue:`56645`)
+- Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`)
+- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` matching float ``0.0`` with ``False`` and vice versa (:issue:`55398`)
+- Bug in :meth:`Series.round` raising for nullable boolean dtype (:issue:`55936`)
+
+Conversion
+^^^^^^^^^^
+- Bug in :meth:`DataFrame.astype` when called with ``str`` on unpickled array - the array might change in-place (:issue:`54654`)
+- Bug in :meth:`DataFrame.astype` where ``errors="ignore"`` had no effect for extension types (:issue:`54654`)
+- Bug in :meth:`Series.convert_dtypes` not converting all NA column to ``null[pyarrow]`` (:issue:`55346`)
+- Bug in :meth:`DataFrame.loc` not raising the "incompatible dtype warning" (see `PDEP6 `_) when assigning a ``Series`` with a different dtype using a full column setter (e.g. ``df.loc[:, 'a'] = incompatible_value``) (:issue:`39584`)
+
+Strings
+^^^^^^^
+- Bug in :func:`pandas.api.types.is_string_dtype` when checking whether an object array with no elements is of the string dtype (:issue:`54661`)
+- Bug in :meth:`DataFrame.apply` failing when ``engine="numba"`` and columns or index have ``StringDtype`` (:issue:`56189`)
+- Bug in :meth:`DataFrame.reindex` not matching :class:`Index` with ``string[pyarrow_numpy]`` dtype (:issue:`56106`)
+- Bug in :meth:`Index.str.cat` always casting result to object dtype (:issue:`56157`)
+- Bug in :meth:`Series.__mul__` for :class:`ArrowDtype` with ``pyarrow.string`` dtype and ``string[pyarrow]`` for the pyarrow backend (:issue:`51970`)
+- Bug in :meth:`Series.str.find` when ``start < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56411`)
+- Bug in :meth:`Series.str.fullmatch` when ``dtype=pandas.ArrowDtype(pyarrow.string())`` allows partial matches when regex ends in literal //$ (:issue:`56652`)
+- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56404`)
+- Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for :class:`ArrowDtype` with ``pyarrow.string`` dtype (:issue:`56579`)
+- Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`)
+- Bug in comparison operations for ``dtype="string[pyarrow_numpy]"`` raising if dtypes can't be compared (:issue:`56008`)
+
+Interval
+^^^^^^^^
+- Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds. Additionally the hour, minute and second components will now be shown (:issue:`55015`)
+- Bug in :meth:`IntervalIndex.factorize` and :meth:`Series.factorize` with :class:`IntervalDtype` with datetime64 or timedelta64 intervals not preserving non-nanosecond units (:issue:`56099`)
+- Bug in :meth:`IntervalIndex.from_arrays` when passed ``datetime64`` or ``timedelta64`` arrays with mismatched resolutions constructing an invalid ``IntervalArray`` object (:issue:`55714`)
+- Bug in :meth:`IntervalIndex.from_tuples` raising if subtype is a nullable extension dtype (:issue:`56765`)
+- Bug in :meth:`IntervalIndex.get_indexer` with datetime or timedelta intervals incorrectly matching on integer targets (:issue:`47772`)
+- Bug in :meth:`IntervalIndex.get_indexer` with timezone-aware datetime intervals incorrectly matching on a sequence of timezone-naive targets (:issue:`47772`)
+- Bug in setting values on a :class:`Series` with an :class:`IntervalIndex` using a slice incorrectly raising (:issue:`54722`)
+
+Indexing
+^^^^^^^^
+- Bug in :meth:`DataFrame.loc` mutating a boolean indexer when :class:`DataFrame` has a :class:`MultiIndex` (:issue:`56635`)
+- Bug in :meth:`DataFrame.loc` when setting :class:`Series` with extension dtype into NumPy dtype (:issue:`55604`)
+- Bug in :meth:`Index.difference` not returning a unique set of values when ``other`` is empty or ``other`` is considered non-comparable (:issue:`55113`)
+- Bug in setting :class:`Categorical` values into a :class:`DataFrame` with numpy dtypes raising ``RecursionError`` (:issue:`52927`)
+- Fixed bug when creating new column with missing values when setting a single string value (:issue:`56204`)
+
+Missing
+^^^^^^^
+- Bug in :meth:`DataFrame.update` not updating in-place for tz-aware datetime64 dtypes (:issue:`56227`)
+
+MultiIndex
+^^^^^^^^^^
+- Bug in :meth:`MultiIndex.get_indexer` not raising ``ValueError`` when ``method`` provided and index is non-monotonic (:issue:`53452`)
+
+I/O
+^^^
+- Bug in :func:`read_csv` where ``engine="python"`` did not respect ``chunksize`` arg when ``skiprows`` was specified (:issue:`56323`)
+- Bug in :func:`read_csv` where ``engine="python"`` was causing a ``TypeError`` when a callable ``skiprows`` and a chunk size was specified (:issue:`55677`)
+- Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raising a Python warning; this now yields a :class:`.errors.ParserWarning` (:issue:`54296`)
+- Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``quotechar`` was ignored (:issue:`52266`)
+- Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a CSV with no headers (:issue:`54459`)
+- Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when the file contains ``NaN`` or ``Inf`` (:issue:`54564`)
+- Bug in :func:`read_json` not handling dtype conversion properly if ``infer_string`` is set (:issue:`56195`)
+- Bug in :meth:`DataFrame.to_excel`, with ``OdsWriter`` (``ods`` files) writing Boolean/string values (:issue:`54994`)
+- Bug in :meth:`DataFrame.to_hdf` and :func:`read_hdf` with ``datetime64`` dtypes with non-nanosecond resolution failing to round-trip correctly (:issue:`55622`)
+- Bug in :meth:`DataFrame.to_stata` raising for extension dtypes (:issue:`54671`)
+- Bug in :meth:`~pandas.read_excel` with ``engine="odf"`` (``ods`` files) when a string cell contains an annotation (:issue:`55200`)
+- Bug in :meth:`~pandas.read_excel` with an ODS file without cached formatted cell for float values (:issue:`55219`)
+- Bug where :meth:`DataFrame.to_json` would raise an ``OverflowError`` instead of a ``TypeError`` with unsupported NumPy types (:issue:`55403`)
+
+Period
+^^^^^^
+- Bug in :class:`PeriodIndex` construction when more than one of ``data``, ``ordinal`` and ``**fields`` are passed failing to raise ``ValueError`` (:issue:`55961`)
+- Bug in :class:`Period` addition silently wrapping around instead of raising ``OverflowError`` (:issue:`55503`)
+- Bug in casting from :class:`PeriodDtype` with ``astype`` to ``datetime64`` or :class:`DatetimeTZDtype` with non-nanosecond unit incorrectly returning with nanosecond unit (:issue:`55958`)
+
+Plotting
+^^^^^^^^
+- Bug in :meth:`DataFrame.plot.box` with ``vert=False`` and a Matplotlib ``Axes`` created with ``sharey=True`` (:issue:`54941`)
+- Bug in :meth:`DataFrame.plot.scatter` discarding string columns (:issue:`56142`)
+- Bug in :meth:`Series.plot` when reusing an ``ax`` object failing to raise when a ``how`` keyword is passed (:issue:`55953`)
+
+Groupby/resample/rolling
+^^^^^^^^^^^^^^^^^^^^^^^^
+- Bug in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, and :meth:`.SeriesGroupBy.idxmax` would not retain :class:`.Categorical` dtype when the index was a :class:`.CategoricalIndex` that contained NA values (:issue:`54234`)
+- Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` when ``observed=False`` and ``f="idxmin"`` or ``f="idxmax"`` would incorrectly raise on unobserved categories (:issue:`54234`)
+- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_counts` could result in incorrect sorting if the columns of the DataFrame or name of the Series are integers (:issue:`55951`)
+- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_counts` would not respect ``sort=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` (:issue:`55951`)
+- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_counts` would sort by proportions rather than frequencies when ``sort=True`` and ``normalize=True`` (:issue:`55951`)
+- Bug in :meth:`DataFrame.asfreq` and :meth:`Series.asfreq` with a :class:`DatetimeIndex` with non-nanosecond resolution incorrectly converting to nanosecond resolution (:issue:`55958`)
+- Bug in :meth:`DataFrame.ewm` when passed ``times`` with non-nanosecond ``datetime64`` or :class:`DatetimeTZDtype` dtype (:issue:`56262`)
+- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` where grouping by a combination of ``Decimal`` and NA values would fail when ``sort=True`` (:issue:`54847`)
+- Bug in :meth:`DataFrame.groupby` for DataFrame subclasses when selecting a subset of columns to apply the function to (:issue:`56761`)
+- Bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`)
+- Bug in :meth:`DataFrame.resample` when resampling on an :class:`ArrowDtype` of ``pyarrow.timestamp`` or ``pyarrow.duration`` type (:issue:`55989`)
+- Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`)
+- Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`)
+- Bug in :meth:`DataFrame.rolling` and :meth:`Series.rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`)
+- Bug in :meth:`DataFrame.rolling` and :meth:`Series.rolling` where either the ``index`` or ``on`` column was :class:`ArrowDtype` with ``pyarrow.timestamp`` type (:issue:`55849`)
+
+Reshaping
+^^^^^^^^^
+- Bug in :func:`concat` ignoring ``sort`` parameter when passed :class:`DatetimeIndex` indexes (:issue:`54769`)
+- Bug in :func:`concat` renaming :class:`Series` when ``ignore_index=False`` (:issue:`15047`)
+- Bug in :func:`merge_asof` raising ``TypeError`` when ``by`` dtype is not ``object``, ``int64``, or ``uint64`` (:issue:`22794`)
+- Bug in :func:`merge_asof` raising incorrect error for string dtype (:issue:`56444`)
+- Bug in :func:`merge_asof` when using a :class:`Timedelta` tolerance on an :class:`ArrowDtype` column (:issue:`56486`)
+- Bug in :func:`merge` not raising when merging datetime columns with timedelta columns (:issue:`56455`)
+- Bug in :func:`merge` not raising when merging string columns with numeric columns (:issue:`56441`)
+- Bug in :func:`merge` not sorting for new string dtype (:issue:`56442`)
+- Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`)
+- Bug in :meth:`DataFrame.melt` where an exception was raised if ``var_name`` was not a string (:issue:`55948`)
+- Bug in :meth:`DataFrame.melt` where it would not preserve the datetime (:issue:`55254`)
+- Bug in :meth:`DataFrame.pivot_table` where the row margin is incorrect when the columns have numeric names (:issue:`26568`)
+- Bug in :meth:`DataFrame.pivot` with numeric columns and extension dtype for data (:issue:`56528`)
+- Bug in :meth:`DataFrame.stack` with ``future_stack=True`` would not preserve NA values in the index (:issue:`56573`)
+
+Sparse
+^^^^^^
+- Bug in :meth:`arrays.SparseArray.take` when using a different fill value than the array's fill value (:issue:`55181`)
+
+Other
+^^^^^
+- :meth:`DataFrame.__dataframe__` did not support pyarrow large strings (:issue:`56702`)
+- Bug in :func:`DataFrame.describe` where the 99.999% percentile was rounded to 100% when formatting the percentile labels in the result (:issue:`55765`)
+- Bug in :func:`api.interchange.from_dataframe` where it raised ``NotImplementedError`` when handling empty string columns (:issue:`56703`)
+- Bug in :func:`cut` and :func:`qcut` with ``datetime64`` dtype values with non-nanosecond units incorrectly returning nanosecond-unit bins (:issue:`56101`)
+- Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`)
+- Bug in :func:`infer_freq` and :meth:`DatetimeIndex.inferred_freq` with weekly frequencies and non-nanosecond resolutions (:issue:`55609`)
+- Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`)
+- Bug in :meth:`DataFrame.from_dict` which would always sort the rows of the created :class:`DataFrame` (:issue:`55683`)
+- Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` raising a ``ValueError`` (:issue:`56478`)
+- Bug in rendering ``inf`` values inside a :class:`DataFrame` with the ``use_inf_as_na`` option enabled (:issue:`55483`)
+- Bug in rendering a :class:`Series` with a :class:`MultiIndex` where an index level named ``0`` did not have its name displayed (:issue:`55415`)
+- Bug in the error message when assigning an empty :class:`DataFrame` to a column (:issue:`55956`)
+- Bug when time-like strings were being cast to :class:`ArrowDtype` with ``pyarrow.time64`` type (:issue:`56463`)
+- Fixed a spurious deprecation warning from ``numba`` >= 0.58.0 when passing a numpy ufunc in :class:`core.window.Rolling.apply` with ``engine="numba"`` (:issue:`55247`)
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_220.contributors:
+
+Contributors
+~~~~~~~~~~~~
+
+.. contributors:: v2.1.4..v2.2.0|HEAD
diff --git a/environment.yml b/environment.yml
index 3a0da0bfc703d..58eb69ad1f070 100644
--- a/environment.yml
+++ b/environment.yml
@@ -8,56 +8,56 @@ dependencies:
# build dependencies
- versioneer[toml]
- - cython=0.29.33
- - meson[ninja]=1.0.1
+ - cython=3.0.5
+ - meson[ninja]=1.2.1
- meson-python=0.13.1
# test dependencies
- pytest>=7.3.2
- pytest-cov
- pytest-xdist>=2.2.0
- - pytest-asyncio>=0.17.0
+ - pytest-qt>=4.2.0
+ - pyqt>=5.15.9
- coverage
# required dependencies
- python-dateutil
- - numpy
+ - numpy<2
- pytz
# optional dependencies
- - beautifulsoup4>=4.11.1
+ - beautifulsoup4>=4.11.2
- blosc
- - brotlipy>=0.7.0
- - bottleneck>=1.3.4
- - fastparquet>=0.8.1
- - fsspec>=2022.05.0
+ - bottleneck>=1.3.6
+ - fastparquet>=2022.12.0
+ - fsspec>=2022.11.0
- html5lib>=1.1
- hypothesis>=6.46.1
- - gcsfs>=2022.05.0
+ - gcsfs>=2022.11.0
- ipython
- jinja2>=3.1.2
- - lxml>=4.8.0
- - matplotlib>=3.6.1
- - numba>=0.55.2
- - numexpr>=2.8.0
- - openpyxl>=3.0.10
+ - lxml>=4.9.2
+ - matplotlib>=3.6.3
+ - numba>=0.56.4
+ - numexpr>=2.8.4
+ - openpyxl>=3.1.0
- odfpy>=1.4.1
- py
- - psycopg2>=2.9.3
- - pyarrow>=7.0.0
+ - psycopg2>=2.9.6
+ - pyarrow>=10.0.1
- pymysql>=1.0.2
- - pyreadstat>=1.1.5
- - pytables>=3.7.0
- - python-snappy>=0.6.1
- - pyxlsb>=1.0.9
- - s3fs>=2022.05.0
- - scipy>=1.8.1
- - sqlalchemy>=1.4.36
- - tabulate>=0.8.10
- - xarray>=2022.03.0
+ - pyreadstat>=1.2.0
+ - pytables>=3.8.0
+ - python-calamine>=0.1.7
+ - pyxlsb>=1.0.10
+ - s3fs>=2022.11.0
+ - scipy>=1.10.0
+ - sqlalchemy>=2.0.0
+ - tabulate>=0.9.0
+ - xarray>=2022.12.0
- xlrd>=2.0.1
- - xlsxwriter>=3.0.3
- - zstandard>=0.17.0
+ - xlsxwriter>=3.0.5
+ - zstandard>=0.19.0
# downstream packages
- dask-core
@@ -68,17 +68,17 @@ dependencies:
- flask
# benchmarks
- - asv>=0.5.1
+ - asv>=0.6.1
## The compiler packages are meta-packages and install the correct compiler (activation) packages on the respective platforms.
- c-compiler
- cxx-compiler
# code checks
- - flake8=6.0.0 # run in subprocess over docstring examples
- - mypy=1.4.1 # pre-commit uses locally installed mypy
+ - flake8=6.1.0 # run in subprocess over docstring examples
+ - mypy=1.8.0 # pre-commit uses locally installed mypy
- tokenize-rt # scripts/check_for_inconsistent_pandas_namespace.py
- - pre-commit>=2.15.0
+ - pre-commit>=3.6.0
# documentation
- gitpython # obtain contributors from git for whatsnew
@@ -86,7 +86,7 @@ dependencies:
- google-auth
- natsort # DataFrame.sort_values doctest
- numpydoc
- - pydata-sphinx-theme
+ - pydata-sphinx-theme=0.14
- pytest-cython # doctest
- sphinx
- sphinx-design
@@ -98,16 +98,16 @@ dependencies:
- types-setuptools
# documentation (jupyter notebooks)
- - nbconvert>=6.4.5
+ - nbconvert>=7.11.0
- nbsphinx
- pandoc
- ipywidgets
- nbformat
- - notebook>=6.0.3
+ - notebook>=7.0.6
- ipykernel
# web
- - jinja2 # in optional dependencies, but documented here as needed
+ # - jinja2 # already listed in optional dependencies, but documented here for reference
- markdown
- feedparser
- pyyaml
@@ -115,7 +115,8 @@ dependencies:
- pygments # Code highlighting
- pip:
+ - adbc-driver-postgresql>=0.8.0
+ - adbc-driver-sqlite>=0.8.0
- dataframe-api-compat>=0.1.7
- - sphinx-toggleprompt # conda-forge version has stricter pins on jinja2
- typing_extensions; python_version<"3.11"
- - tzdata>=2022.1
+ - tzdata>=2022.7
diff --git a/generate_pxi.py b/generate_pxi.py
index 47648a3937b4c..e7e85900aa6e9 100644
--- a/generate_pxi.py
+++ b/generate_pxi.py
@@ -4,7 +4,7 @@
from Cython import Tempita
-def process_tempita(pxifile, outfile):
+def process_tempita(pxifile, outfile) -> None:
with open(pxifile, encoding="utf-8") as f:
tmpl = f.read()
pyxcontent = Tempita.sub(tmpl)
@@ -13,7 +13,7 @@ def process_tempita(pxifile, outfile):
f.write(pyxcontent)
-def main():
+def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument("infile", type=str, help="Path to the input file")
parser.add_argument("-o", "--outdir", type=str, help="Path to the output directory")
diff --git a/generate_version.py b/generate_version.py
index 5534f49c0ea58..7dc1bdcf329d0 100644
--- a/generate_version.py
+++ b/generate_version.py
@@ -1,21 +1,35 @@
+#!/usr/bin/env python3
+
# Note: This file has to live next to setup.py or versioneer will not work
import argparse
import os
+import sys
import versioneer
+sys.path.insert(0, "")
+
+
+def write_version_info(path) -> None:
+ version = None
+ git_version = None
-def write_version_info(path):
+ try:
+ import _version_meson
+
+ version = _version_meson.__version__
+ git_version = _version_meson.__git_version__
+ except ImportError:
+ version = versioneer.get_version()
+ git_version = versioneer.get_versions()["full-revisionid"]
if os.environ.get("MESON_DIST_ROOT"):
path = os.path.join(os.environ.get("MESON_DIST_ROOT"), path)
with open(path, "w", encoding="utf-8") as file:
- file.write(f'__version__="{versioneer.get_version()}"\n')
- file.write(
- f'__git_version__="{versioneer.get_versions()["full-revisionid"]}"\n'
- )
+ file.write(f'__version__="{version}"\n')
+ file.write(f'__git_version__="{git_version}"\n')
-def main():
+def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument(
"-o",
@@ -43,7 +57,13 @@ def main():
write_version_info(args.outfile)
if args.print:
- print(versioneer.get_version())
+ try:
+ import _version_meson
+
+ version = _version_meson.__version__
+ except ImportError:
+ version = versioneer.get_version()
+ print(version)
main()
diff --git a/meson.build b/meson.build
index a927b59abeaf9..06623a305ab54 100644
--- a/meson.build
+++ b/meson.build
@@ -2,24 +2,18 @@
project(
'pandas',
'c', 'cpp', 'cython',
- version: run_command(['python', 'generate_version.py', '--print'], check: true).stdout().strip(),
+ version: run_command(['generate_version.py', '--print'], check: true).stdout().strip(),
license: 'BSD-3',
- meson_version: '>=1.0.1',
+ meson_version: '>=1.2.1',
default_options: [
- # TODO: investigate, does meson try to compile against debug Python
- # when buildtype = debug, this seems to be causing problems on CI
- # where provided Python is not compiled in debug mode
'buildtype=release',
- # TODO: Reactivate werror, some warnings on Windows
- #'werror=true',
- 'c_std=c99'
+ 'c_std=c11',
+ 'warning_level=2',
]
)
-py_mod = import('python')
fs = import('fs')
-py = py_mod.find_installation('python')
-py_dep = py.dependency()
+py = import('python').find_installation(pure: false)
tempita = files('generate_pxi.py')
versioneer = files('generate_version.py')
@@ -35,7 +29,7 @@ add_project_arguments('-DNPY_TARGET_VERSION=NPY_1_21_API_VERSION', language : 'c
if fs.exists('_version_meson.py')
- py.install_sources('_version_meson.py', pure: false, subdir: 'pandas')
+ py.install_sources('_version_meson.py', subdir: 'pandas')
else
custom_target('write_version_file',
output: '_version_meson.py',
@@ -45,11 +39,15 @@ else
build_by_default: true,
build_always_stale: true,
install: true,
- install_dir: py.get_install_dir(pure: false) / 'pandas'
+ install_dir: py.get_install_dir() / 'pandas'
)
meson.add_dist_script(py, versioneer, '-o', '_version_meson.py')
endif
# Needed by pandas.test() when it looks for the pytest ini options
-py.install_sources('pyproject.toml', pure: false, subdir: 'pandas')
+py.install_sources(
+ 'pyproject.toml',
+ subdir: 'pandas'
+)
+
subdir('pandas')
diff --git a/pandas/__init__.py b/pandas/__init__.py
index d11a429987ac4..ed524c2bb3619 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -1,5 +1,8 @@
from __future__ import annotations
+import os
+import warnings
+
__docformat__ = "restructuredtext"
# Let users know if they're missing any of our hard dependencies
@@ -21,7 +24,7 @@
try:
# numpy compat
from pandas.compat import (
- is_numpy_dev as _is_numpy_dev, # pyright: ignore[reportUnusedImport] # noqa: F401,E501
+ is_numpy_dev as _is_numpy_dev, # pyright: ignore[reportUnusedImport] # noqa: F401
)
except ImportError as _err: # pragma: no cover
_module = _err.name
@@ -190,6 +193,46 @@
__git_version__ = v.get("full-revisionid")
del get_versions, v
+# GH#55043 - deprecation of the data_manager option
+if "PANDAS_DATA_MANAGER" in os.environ:
+ warnings.warn(
+ "The env variable PANDAS_DATA_MANAGER is set. The data_manager option is "
+ "deprecated and will be removed in a future version. Only the BlockManager "
+ "will be available. Unset this environment variable to silence this warning.",
+ FutureWarning,
+ stacklevel=2,
+ )
+
+# DeprecationWarning for missing pyarrow
+from pandas.compat.pyarrow import pa_version_under10p1, pa_not_found
+
+if pa_version_under10p1:
+ # pyarrow is either too old or nonexistent, warn
+ from pandas.compat._optional import VERSIONS
+
+ if pa_not_found:
+ pa_msg = "was not found to be installed on your system."
+ else:
+ pa_msg = (
+ f"was too old on your system - pyarrow {VERSIONS['pyarrow']} "
+ "is the current minimum supported version as of this release."
+ )
+
+ warnings.warn(
+ f"""
+Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
+(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
+but {pa_msg}
+If this would cause problems for you,
+please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
+ """, # noqa: E501
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ del VERSIONS, pa_msg
+
+# Delete all unnecessary imported modules
+del pa_version_under10p1, pa_not_found, warnings, os
# module level doc-string
__doc__ = """
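A minimal, hypothetical sketch (not part of this patch) of how a user who cannot install pyarrow yet could silence the import-time notice added above; it only assumes the message is emitted as a ``DeprecationWarning`` during ``import pandas``, as the hunk shows.

    import warnings

    with warnings.catch_warnings():
        # Suppress DeprecationWarning only while importing pandas; the pyarrow
        # notice above is emitted at import time with this category.
        warnings.simplefilter("ignore", DeprecationWarning)
        import pandas as pd
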
diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py
index daeb135f5bcf7..97784c924dab4 100644
--- a/pandas/_config/__init__.py
+++ b/pandas/_config/__init__.py
@@ -15,6 +15,7 @@
"option_context",
"options",
"using_copy_on_write",
+ "warn_copy_on_write",
]
from pandas._config import config
from pandas._config import dates # pyright: ignore[reportUnusedImport] # noqa: F401
@@ -32,7 +33,18 @@
def using_copy_on_write() -> bool:
_mode_options = _global_config["mode"]
- return _mode_options["copy_on_write"] and _mode_options["data_manager"] == "block"
+ return (
+ _mode_options["copy_on_write"] is True
+ and _mode_options["data_manager"] == "block"
+ )
+
+
+def warn_copy_on_write() -> bool:
+ _mode_options = _global_config["mode"]
+ return (
+ _mode_options["copy_on_write"] == "warn"
+ and _mode_options["data_manager"] == "block"
+ )
def using_nullable_dtypes() -> bool:
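A hedged illustration (not part of the patch) of how the two mode predicates above interact, assuming ``mode.copy_on_write`` accepts ``True``, ``False``, or ``"warn"`` as the hunk implies and the default block manager is in use.

    import pandas as pd
    from pandas._config import using_copy_on_write, warn_copy_on_write

    # "warn" is an intermediate mode: Copy-on-Write stays off, but operations
    # whose behaviour will change under CoW emit warnings.
    pd.set_option("mode.copy_on_write", "warn")
    assert not using_copy_on_write()
    assert warn_copy_on_write()

    # True enables Copy-on-Write outright; the warning mode is then irrelevant.
    pd.set_option("mode.copy_on_write", True)
    assert using_copy_on_write()
    assert not warn_copy_on_write()
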
diff --git a/pandas/_config/config.py b/pandas/_config/config.py
index a776825732090..c391939d22491 100644
--- a/pandas/_config/config.py
+++ b/pandas/_config/config.py
@@ -220,6 +220,8 @@ def get_default_val(pat: str):
class DictWrapper:
"""provide attribute-style access to a nested dict"""
+ d: dict[str, Any]
+
def __init__(self, d: dict[str, Any], prefix: str = "") -> None:
object.__setattr__(self, "d", d)
object.__setattr__(self, "prefix", prefix)
@@ -250,7 +252,7 @@ def __getattr__(self, key: str):
else:
return _get_option(prefix)
- def __dir__(self) -> Iterable[str]:
+ def __dir__(self) -> list[str]:
return list(self.d.keys())
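For context, a hypothetical usage sketch (not part of the patch): ``DictWrapper`` backs attribute-style access to the options registry, so the annotated ``d`` mapping and the ``__dir__`` return type above surface roughly like this.

    import pandas as pd

    # pd.options is a DictWrapper; __dir__ (now typed as list[str]) feeds tab completion
    print(dir(pd.options.mode))
    # attribute-style read resolves through __getattr__ on the wrapped dict
    print(pd.options.display.max_rows)
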
diff --git a/pandas/_libs/__init__.py b/pandas/_libs/__init__.py
index b084a25917163..26a872a90e493 100644
--- a/pandas/_libs/__init__.py
+++ b/pandas/_libs/__init__.py
@@ -13,8 +13,8 @@
# Below imports needs to happen first to ensure pandas top level
# module gets monkeypatched with the pandas_datetime_CAPI
# see pandas_datetime_exec in pd_datetime.c
-import pandas._libs.pandas_parser # noqa: E501 # isort: skip # type: ignore[reportUnusedImport]
-import pandas._libs.pandas_datetime # noqa: F401,E501 # isort: skip # type: ignore[reportUnusedImport]
+import pandas._libs.pandas_parser # isort: skip # type: ignore[reportUnusedImport]
+import pandas._libs.pandas_datetime # noqa: F401 # isort: skip # type: ignore[reportUnusedImport]
from pandas._libs.interval import Interval
from pandas._libs.tslibs import (
NaT,
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 0b6ea58f987d4..e70ac26a2c28e 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -59,7 +59,7 @@ from pandas._libs.util cimport get_nat
cdef:
float64_t FP_ERR = 1e-13
- float64_t NaN = np.NaN
+ float64_t NaN = np.nan
int64_t NPY_NAT = get_nat()
@@ -814,7 +814,7 @@ def is_monotonic(ndarray[numeric_object_t, ndim=1] arr, bint timelike):
return True, True, True
if timelike and arr[0] == NPY_NAT:
- return False, False, True
+ return False, False, False
if numeric_object_t is not object:
with nogil:
@@ -998,8 +998,7 @@ def rank_1d(
N = len(values)
if labels is not None:
- # TODO(cython3): cast won't be necessary (#2992)
- assert len(labels) == N
+ assert len(labels) == N
out = np.empty(N)
grp_sizes = np.ones(N, dtype=np.int64)
@@ -1145,107 +1144,7 @@ cdef void rank_sorted_1d(
# that sorted value for retrieval back from the original
# values / masked_vals arrays
# TODO(cython3): de-duplicate once cython supports conditional nogil
- if numeric_object_t is object:
- with gil:
- for i in range(N):
- at_end = i == N - 1
-
- # dups and sum_ranks will be incremented each loop where
- # the value / group remains the same, and should be reset
- # when either of those change. Used to calculate tiebreakers
- dups += 1
- sum_ranks += i - grp_start + 1
-
- next_val_diff = at_end or are_diff(masked_vals[sort_indexer[i]],
- masked_vals[sort_indexer[i+1]])
-
- # We'll need this check later anyway to determine group size, so just
- # compute it here since shortcircuiting won't help
- group_changed = at_end or (check_labels and
- (labels[sort_indexer[i]]
- != labels[sort_indexer[i+1]]))
-
- # Update out only when there is a transition of values or labels.
- # When a new value or group is encountered, go back #dups steps(
- # the number of occurrence of current value) and assign the ranks
- # based on the starting index of the current group (grp_start)
- # and the current index
- if (next_val_diff or group_changed or (check_mask and
- (mask[sort_indexer[i]]
- ^ mask[sort_indexer[i+1]]))):
-
- # If keep_na, check for missing values and assign back
- # to the result where appropriate
- if keep_na and check_mask and mask[sort_indexer[i]]:
- grp_na_count = dups
- for j in range(i - dups + 1, i + 1):
- out[sort_indexer[j]] = NaN
- elif tiebreak == TIEBREAK_AVERAGE:
- for j in range(i - dups + 1, i + 1):
- out[sort_indexer[j]] = sum_ranks / dups
- elif tiebreak == TIEBREAK_MIN:
- for j in range(i - dups + 1, i + 1):
- out[sort_indexer[j]] = i - grp_start - dups + 2
- elif tiebreak == TIEBREAK_MAX:
- for j in range(i - dups + 1, i + 1):
- out[sort_indexer[j]] = i - grp_start + 1
-
- # With n as the previous rank in the group and m as the number
- # of duplicates in this stretch, if TIEBREAK_FIRST and ascending,
- # then rankings should be n + 1, n + 2 ... n + m
- elif tiebreak == TIEBREAK_FIRST:
- for j in range(i - dups + 1, i + 1):
- out[sort_indexer[j]] = j + 1 - grp_start
-
- # If TIEBREAK_FIRST and descending, the ranking should be
- # n + m, n + (m - 1) ... n + 1. This is equivalent to
- # (i - dups + 1) + (i - j + 1) - grp_start
- elif tiebreak == TIEBREAK_FIRST_DESCENDING:
- for j in range(i - dups + 1, i + 1):
- out[sort_indexer[j]] = 2 * i - j - dups + 2 - grp_start
- elif tiebreak == TIEBREAK_DENSE:
- for j in range(i - dups + 1, i + 1):
- out[sort_indexer[j]] = grp_vals_seen
-
- # Look forward to the next value (using the sorting in
- # lexsort_indexer). If the value does not equal the current
- # value then we need to reset the dups and sum_ranks, knowing
- # that a new value is coming up. The conditional also needs
- # to handle nan equality and the end of iteration. If group
- # changes we do not record seeing a new value in the group
- if not group_changed and (next_val_diff or (check_mask and
- (mask[sort_indexer[i]]
- ^ mask[sort_indexer[i+1]]))):
- dups = sum_ranks = 0
- grp_vals_seen += 1
-
- # Similar to the previous conditional, check now if we are
- # moving to a new group. If so, keep track of the index where
- # the new group occurs, so the tiebreaker calculations can
- # decrement that from their position. Fill in the size of each
- # group encountered (used by pct calculations later). Also be
- # sure to reset any of the items helping to calculate dups
- if group_changed:
-
- # If not dense tiebreak, group size used to compute
- # percentile will be # of non-null elements in group
- if tiebreak != TIEBREAK_DENSE:
- grp_size = i - grp_start + 1 - grp_na_count
-
- # Otherwise, it will be the number of distinct values
- # in the group, subtracting 1 if NaNs are present
- # since that is a distinct value we shouldn't count
- else:
- grp_size = grp_vals_seen - (grp_na_count > 0)
-
- for j in range(grp_start, i + 1):
- grp_sizes[sort_indexer[j]] = grp_size
-
- dups = sum_ranks = 0
- grp_na_count = 0
- grp_start = i + 1
- grp_vals_seen = 1
- else:
+ with gil(numeric_object_t is object):
for i in range(N):
at_end = i == N - 1
@@ -1255,8 +1154,12 @@ cdef void rank_sorted_1d(
dups += 1
sum_ranks += i - grp_start + 1
- next_val_diff = at_end or (masked_vals[sort_indexer[i]]
- != masked_vals[sort_indexer[i+1]])
+ if numeric_object_t is object:
+ next_val_diff = at_end or are_diff(masked_vals[sort_indexer[i]],
+ masked_vals[sort_indexer[i+1]])
+ else:
+ next_val_diff = at_end or (masked_vals[sort_indexer[i]]
+ != masked_vals[sort_indexer[i+1]])
# We'll need this check later anyway to determine group size, so just
# compute it here since shortcircuiting won't help
@@ -1269,10 +1172,9 @@ cdef void rank_sorted_1d(
# the number of occurrence of current value) and assign the ranks
# based on the starting index of the current group (grp_start)
# and the current index
- if (next_val_diff or group_changed
- or (check_mask and
- (mask[sort_indexer[i]] ^ mask[sort_indexer[i+1]]))):
-
+ if (next_val_diff or group_changed or (check_mask and
+ (mask[sort_indexer[i]]
+ ^ mask[sort_indexer[i+1]]))):
# If keep_na, check for missing values and assign back
# to the result where appropriate
if keep_na and check_mask and mask[sort_indexer[i]]:
@@ -1483,7 +1385,7 @@ def diff_2d(
cdef:
Py_ssize_t i, j, sx, sy, start, stop
bint f_contig = arr.flags.f_contiguous
- # bint f_contig = arr.is_f_contig() # TODO(cython3)
+ # bint f_contig = arr.is_f_contig() # TODO(cython3) once arr is memoryview
diff_t left, right
# Disable for unsupported dtype combinations,
diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi
index 78fee8f01319c..86f69c3cdfc75 100644
--- a/pandas/_libs/arrays.pyi
+++ b/pandas/_libs/arrays.pyi
@@ -26,7 +26,7 @@ class NDArrayBacked:
def size(self) -> int: ...
@property
def nbytes(self) -> int: ...
- def copy(self): ...
+ def copy(self, order=...): ...
def delete(self, loc, axis=...): ...
def swapaxes(self, axis1, axis2): ...
def repeat(self, repeats: int | Sequence[int], axis: int | None = ...): ...
diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx
index 718fb358e26bc..9889436a542c1 100644
--- a/pandas/_libs/arrays.pyx
+++ b/pandas/_libs/arrays.pyx
@@ -126,8 +126,7 @@ cdef class NDArrayBacked:
@property
def size(self) -> int:
- # TODO(cython3): use self._ndarray.size
- return cnp.PyArray_SIZE(self._ndarray)
+ return self._ndarray.size
@property
def nbytes(self) -> int:
diff --git a/pandas/_libs/dtypes.pxd b/pandas/_libs/dtypes.pxd
index ccfb2d2ef4a23..de4b70d387b5f 100644
--- a/pandas/_libs/dtypes.pxd
+++ b/pandas/_libs/dtypes.pxd
@@ -34,3 +34,8 @@ ctypedef fused numeric_t:
ctypedef fused numeric_object_t:
numeric_t
object
+
+ctypedef fused uint8_int64_object_t:
+ uint8_t
+ int64_t
+ object
diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi
index d165ddd6c8afa..135828a23648a 100644
--- a/pandas/_libs/groupby.pyi
+++ b/pandas/_libs/groupby.pyi
@@ -44,7 +44,6 @@ def group_fillna_indexer(
labels: np.ndarray, # ndarray[int64_t]
sorted_labels: npt.NDArray[np.intp],
mask: npt.NDArray[np.uint8],
- direction: Literal["ffill", "bfill"],
limit: int, # int64_t
dropna: bool,
) -> None: ...
@@ -55,7 +54,7 @@ def group_any_all(
mask: np.ndarray, # const uint8_t[::1]
val_test: Literal["any", "all"],
skipna: bool,
- nullable: bool,
+ result_mask: np.ndarray | None,
) -> None: ...
def group_sum(
out: np.ndarray, # complexfloatingintuint_t[:, ::1]
@@ -181,6 +180,18 @@ def group_min(
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
) -> None: ...
+def group_idxmin_idxmax(
+ out: npt.NDArray[np.intp],
+ counts: npt.NDArray[np.int64],
+ values: np.ndarray, # ndarray[groupby_t, ndim=2]
+ labels: npt.NDArray[np.intp],
+ min_count: int = ...,
+ is_datetimelike: bool = ...,
+ mask: np.ndarray | None = ...,
+ name: str = ...,
+ skipna: bool = ...,
+ result_mask: np.ndarray | None = ...,
+) -> None: ...
def group_cummin(
out: np.ndarray, # groupby_t[:, ::1]
values: np.ndarray, # ndarray[groupby_t, ndim=2]
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 20499016f951e..19d71b0a6fde3 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -52,7 +52,7 @@ from pandas._libs.missing cimport checknull
cdef int64_t NPY_NAT = util.get_nat()
-cdef float64_t NaN = np.NaN
+cdef float64_t NaN = np.nan
cdef enum InterpolationEnumType:
INTERPOLATION_LINEAR,
@@ -695,54 +695,37 @@ def group_sum(
N, K = (<object>values).shape
- if sum_t is object:
- # NB: this does not use 'compensation' like the non-object track does.
+ with nogil(sum_t is not object):
for i in range(N):
lab = labels[i]
if lab < 0:
continue
counts[lab] += 1
- for j in range(K):
- val = values[i, j]
-
- # not nan
- if not checknull(val):
- nobs[lab, j] += 1
- if nobs[lab, j] == 1:
- # i.e. we haven't added anything yet; avoid TypeError
- # if e.g. val is a str and sumx[lab, j] is 0
- t = val
- else:
- t = sumx[lab, j] + val
- sumx[lab, j] = t
-
- for i in range(ncounts):
for j in range(K):
- if nobs[i, j] < min_count:
- out[i, j] = None
+ val = values[i, j]
+ if uses_mask:
+ isna_entry = mask[i, j]
else:
- out[i, j] = sumx[i, j]
- else:
- with nogil:
- for i in range(N):
- lab = labels[i]
- if lab < 0:
- continue
+ isna_entry = _treat_as_na(val, is_datetimelike)
- counts[lab] += 1
- for j in range(K):
- val = values[i, j]
+ if not isna_entry:
+ nobs[lab, j] += 1
- if uses_mask:
- isna_entry = mask[i, j]
- else:
- isna_entry = _treat_as_na(val, is_datetimelike)
+ if sum_t is object:
+ # NB: this does not use 'compensation' like the non-object
+ # track does.
+ if nobs[lab, j] == 1:
+ # i.e. we haven't added anything yet; avoid TypeError
+ # if e.g. val is a str and sumx[lab, j] is 0
+ t = val
+ else:
+ t = sumx[lab, j] + val
+ sumx[lab, j] = t
- if not isna_entry:
- nobs[lab, j] += 1
+ else:
y = val - compensation[lab, j]
t = sumx[lab, j] + y
compensation[lab, j] = t - sumx[lab, j] - y
@@ -755,9 +738,9 @@ def group_sum(
compensation[lab, j] = 0
sumx[lab, j] = t
- _check_below_mincount(
- out, uses_mask, result_mask, ncounts, K, nobs, min_count, sumx
- )
+ _check_below_mincount(
+ out, uses_mask, result_mask, ncounts, K, nobs, min_count, sumx
+ )
@cython.wraparound(False)
@@ -809,9 +792,9 @@ def group_prod(
nobs[lab, j] += 1
prodx[lab, j] *= val
- _check_below_mincount(
- out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx
- )
+ _check_below_mincount(
+ out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx
+ )
@cython.wraparound(False)
@@ -1320,9 +1303,8 @@ ctypedef fused numeric_object_complex_t:
cdef bint _treat_as_na(numeric_object_complex_t val,
bint is_datetimelike) noexcept nogil:
if numeric_object_complex_t is object:
- # Should never be used, but we need to avoid the `val != val` below
- # or else cython will raise about gil acquisition.
- raise NotImplementedError
+ with gil:
+ return checknull(val)
elif numeric_object_complex_t is int64_t:
return is_datetimelike and val == NPY_NAT
@@ -1369,7 +1351,7 @@ cdef numeric_t _get_na_val(numeric_t val, bint is_datetimelike):
ctypedef fused mincount_t:
- numeric_t
+ numeric_object_t
complex64_t
complex128_t
@@ -1385,7 +1367,7 @@ cdef inline void _check_below_mincount(
int64_t[:, ::1] nobs,
int64_t min_count,
mincount_t[:, ::1] resx,
-) noexcept nogil:
+) noexcept:
"""
Check if the number of observations for a group is below min_count,
and if so set the result for that group to the appropriate NA-like value.
@@ -1393,38 +1375,40 @@ cdef inline void _check_below_mincount(
cdef:
Py_ssize_t i, j
- for i in range(ncounts):
- for j in range(K):
-
- if nobs[i, j] < min_count:
- # if we are integer dtype, not is_datetimelike, and
- # not uses_mask, then getting here implies that
- # counts[i] < min_count, which means we will
- # be cast to float64 and masked at the end
- # of WrappedCythonOp._call_cython_op. So we can safely
- # set a placeholder value in out[i, j].
- if uses_mask:
- result_mask[i, j] = True
- # set out[i, j] to 0 to be deterministic, as
- # it was initialized with np.empty. Also ensures
- # we can downcast out if appropriate.
- out[i, j] = 0
- elif (
- mincount_t is float32_t
- or mincount_t is float64_t
- or mincount_t is complex64_t
- or mincount_t is complex128_t
- ):
- out[i, j] = NAN
- elif mincount_t is int64_t:
- # Per above, this is a placeholder in
- # non-is_datetimelike cases.
- out[i, j] = NPY_NAT
+ with nogil(mincount_t is not object):
+ for i in range(ncounts):
+ for j in range(K):
+ if nobs[i, j] >= min_count:
+ out[i, j] = resx[i, j]
else:
- # placeholder, see above
- out[i, j] = 0
- else:
- out[i, j] = resx[i, j]
+ # if we are integer dtype, not is_datetimelike, and
+ # not uses_mask, then getting here implies that
+ # counts[i] < min_count, which means we will
+ # be cast to float64 and masked at the end
+ # of WrappedCythonOp._call_cython_op. So we can safely
+ # set a placeholder value in out[i, j].
+ if uses_mask:
+ result_mask[i, j] = True
+ # set out[i, j] to 0 to be deterministic, as
+ # it was initialized with np.empty. Also ensures
+ # we can downcast out if appropriate.
+ out[i, j] = 0
+ elif (
+ mincount_t is float32_t
+ or mincount_t is float64_t
+ or mincount_t is complex64_t
+ or mincount_t is complex128_t
+ ):
+ out[i, j] = NAN
+ elif mincount_t is int64_t:
+ # Per above, this is a placeholder in
+ # non-is_datetimelike cases.
+ out[i, j] = NPY_NAT
+ elif mincount_t is object:
+ out[i, j] = None
+ else:
+ # placeholder, see above
+ out[i, j] = 0
# TODO(cython3): GH#31710 use memorviews once cython 0.30 is released so we can
@@ -1452,9 +1436,7 @@ def group_last(
bint uses_mask = mask is not None
bint isna_entry
- # TODO(cython3):
- # Instead of `labels.shape[0]` use `len(labels)`
- if not len(values) == labels.shape[0]:
+ if not len(values) == len(labels):
raise AssertionError("len(index) != len(labels)")
min_count = max(min_count, 1)
@@ -1466,8 +1448,7 @@ def group_last(
N, K = (<object>values).shape
- if numeric_object_t is object:
- # TODO(cython3): De-duplicate once conditional-nogil is available
+ with nogil(numeric_object_t is not object):
for i in range(N):
lab = labels[i]
if lab < 0:
@@ -1480,43 +1461,15 @@ def group_last(
if uses_mask:
isna_entry = mask[i, j]
else:
- isna_entry = checknull(val)
+ isna_entry = _treat_as_na(val, is_datetimelike)
if not isna_entry:
- # TODO(cython3): use _treat_as_na here once
- # conditional-nogil is available.
nobs[lab, j] += 1
resx[lab, j] = val
- for i in range(ncounts):
- for j in range(K):
- if nobs[i, j] < min_count:
- out[i, j] = None
- else:
- out[i, j] = resx[i, j]
- else:
- with nogil:
- for i in range(N):
- lab = labels[i]
- if lab < 0:
- continue
-
- counts[lab] += 1
- for j in range(K):
- val = values[i, j]
-
- if uses_mask:
- isna_entry = mask[i, j]
- else:
- isna_entry = _treat_as_na(val, is_datetimelike)
-
- if not isna_entry:
- nobs[lab, j] += 1
- resx[lab, j] = val
-
- _check_below_mincount(
- out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx
- )
+ _check_below_mincount(
+ out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx
+ )
# TODO(cython3): GH#31710 use memorviews once cython 0.30 is released so we can
@@ -1545,9 +1498,7 @@ def group_nth(
bint uses_mask = mask is not None
bint isna_entry
- # TODO(cython3):
- # Instead of `labels.shape[0]` use `len(labels)`
- if not len(values) == labels.shape[0]:
+ if not len(values) == len(labels):
raise AssertionError("len(index) != len(labels)")
min_count = max(min_count, 1)
@@ -1559,8 +1510,7 @@ def group_nth(
N, K = (<object>values).shape
- if numeric_object_t is object:
- # TODO(cython3): De-duplicate once conditional-nogil is available
+ with nogil(numeric_object_t is not object):
for i in range(N):
lab = labels[i]
if lab < 0:
@@ -1573,46 +1523,16 @@ def group_nth(
if uses_mask:
isna_entry = mask[i, j]
else:
- isna_entry = checknull(val)
+ isna_entry = _treat_as_na(val, is_datetimelike)
if not isna_entry:
- # TODO(cython3): use _treat_as_na here once
- # conditional-nogil is available.
nobs[lab, j] += 1
if nobs[lab, j] == rank:
resx[lab, j] = val
- for i in range(ncounts):
- for j in range(K):
- if nobs[i, j] < min_count:
- out[i, j] = None
- else:
- out[i, j] = resx[i, j]
-
- else:
- with nogil:
- for i in range(N):
- lab = labels[i]
- if lab < 0:
- continue
-
- counts[lab] += 1
- for j in range(K):
- val = values[i, j]
-
- if uses_mask:
- isna_entry = mask[i, j]
- else:
- isna_entry = _treat_as_na(val, is_datetimelike)
-
- if not isna_entry:
- nobs[lab, j] += 1
- if nobs[lab, j] == rank:
- resx[lab, j] = val
-
- _check_below_mincount(
- out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx
- )
+ _check_below_mincount(
+ out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx
+ )
@cython.boundscheck(False)
@@ -1752,9 +1672,7 @@ cdef group_min_max(
bint uses_mask = mask is not None
bint isna_entry
- # TODO(cython3):
- # Instead of `labels.shape[0]` use `len(labels)`
- if not len(values) == labels.shape[0]:
+ if not len(values) == len(labels):
raise AssertionError("len(index) != len(labels)")
min_count = max(min_count, 1)
@@ -1789,10 +1707,121 @@ cdef group_min_max(
if val < group_min_or_max[lab, j]:
group_min_or_max[lab, j] = val
- _check_below_mincount(
- out, uses_mask, result_mask, ngroups, K, nobs, min_count, group_min_or_max
+ _check_below_mincount(
+ out, uses_mask, result_mask, ngroups, K, nobs, min_count, group_min_or_max
+ )
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_idxmin_idxmax(
+ intp_t[:, ::1] out,
+ int64_t[::1] counts,
+ ndarray[numeric_object_t, ndim=2] values,
+ const intp_t[::1] labels,
+ Py_ssize_t min_count=-1,
+ bint is_datetimelike=False,
+ const uint8_t[:, ::1] mask=None,
+ str name="idxmin",
+ bint skipna=True,
+ uint8_t[:, ::1] result_mask=None,
+):
+ """
+ Compute index of minimum/maximum of columns of `values`, in row groups `labels`.
+
+ This function only computes the row number where the minimum/maximum occurs, we'll
+ take the corresponding index value after this function.
+
+ Parameters
+ ----------
+ out : np.ndarray[intp, ndim=2]
+ Array to store result in.
+ counts : np.ndarray[int64]
+ Input as a zeroed array, populated by group sizes during algorithm
+ values : np.ndarray[numeric_object_t, ndim=2]
+ Values to find column-wise min/max of.
+ labels : np.ndarray[np.intp]
+ Labels to group by.
+ min_count : Py_ssize_t, default -1
+ The minimum number of non-NA group elements, NA result if threshold
+ is not met.
+ is_datetimelike : bool
+ True if `values` contains datetime-like entries.
+ name : {"idxmin", "idxmax"}, default "idxmin"
+ Whether to compute idxmin or idxmax.
+ mask : ndarray[bool, ndim=2], optional
+ If not None, indices represent missing values,
+ otherwise the mask will not be used
+ skipna : bool, default True
+ Flag to ignore nan values during truth testing
+ result_mask : ndarray[bool, ndim=2], optional
+ If not None, these specify locations in the output that are NA.
+ Modified in-place.
+
+ Notes
+ -----
+ This method modifies the `out` parameter, rather than returning an object.
+ `counts` is modified to hold group sizes
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, lab
+ numeric_object_t val
+ numeric_object_t[:, ::1] group_min_or_max
+ bint uses_mask = mask is not None
+ bint isna_entry
+ bint compute_max = name == "idxmax"
+
+ assert name == "idxmin" or name == "idxmax"
+
+ if not len(values) == len(labels):
+ raise AssertionError("len(index) != len(labels)")
+
+ N, K = (<object>values).shape
+
+ if numeric_object_t is object:
+ group_min_or_max = np.empty((<object>out).shape, dtype=object)
+ else:
+ group_min_or_max = np.empty_like(out, dtype=values.dtype)
+ if N > 0 and K > 0:
+ # When N or K is zero, we never use group_min_or_max
+ group_min_or_max[:] = _get_min_or_max(
+ values[0, 0], compute_max, is_datetimelike
)
+ # When using transform, we need a valid value for take in the case
+ # a category is not observed; these values will be dropped
+ out[:] = 0
+
+ with nogil(numeric_object_t is not object):
+ for i in range(N):
+ lab = labels[i]
+ if lab < 0:
+ continue
+
+ for j in range(K):
+ if not skipna and out[lab, j] == -1:
+ # Once we've hit NA there is no going back
+ continue
+ val = values[i, j]
+
+ if uses_mask:
+ isna_entry = mask[i, j]
+ else:
+ isna_entry = _treat_as_na(val, is_datetimelike)
+
+ if isna_entry:
+ if not skipna:
+ out[lab, j] = -1
+ else:
+ if compute_max:
+ if val > group_min_or_max[lab, j]:
+ group_min_or_max[lab, j] = val
+ out[lab, j] = i
+ else:
+ if val < group_min_or_max[lab, j]:
+ group_min_or_max[lab, j] = val
+ out[lab, j] = i
+
@cython.wraparound(False)
@cython.boundscheck(False)
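A sketch of the user-facing behaviour the new ``group_idxmin_idxmax`` kernel backs (hypothetical example, not part of the patch): groupby ``idxmin``/``idxmax`` on a nullable (masked) dtype, which exercises the ``mask``/``result_mask`` arguments documented above.

    import pandas as pd

    df = pd.DataFrame(
        {
            "key": ["a", "a", "b", "b"],
            # masked (nullable) dtype -> the mask-based path in the kernel
            "val": pd.array([1, None, 3, 2], dtype="Int64"),
        }
    )
    gb = df.groupby("key")["val"]
    # Row labels of each group's extremum; NA entries are skipped by default,
    # mirroring the kernel's skipna handling.
    print(gb.idxmax())
    print(gb.idxmin())
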
diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi
index 2bc6d74fe6aee..555ec73acd9b2 100644
--- a/pandas/_libs/hashtable.pyi
+++ b/pandas/_libs/hashtable.pyi
@@ -20,7 +20,6 @@ class Factorizer:
def factorize(
self,
values: np.ndarray,
- sort: bool = ...,
na_sentinel=...,
na_value=...,
mask=...,
@@ -157,9 +156,9 @@ class HashTable:
def __contains__(self, key: Hashable) -> bool: ...
def sizeof(self, deep: bool = ...) -> int: ...
def get_state(self) -> dict[str, int]: ...
- # TODO: `item` type is subclass-specific
- def get_item(self, item): ... # TODO: return type?
- def set_item(self, item, val) -> None: ...
+ # TODO: `val/key` type is subclass-specific
+ def get_item(self, val): ... # TODO: return type?
+ def set_item(self, key, val) -> None: ...
def get_na(self): ... # TODO: return type?
def set_na(self, val) -> None: ...
def map_locations(
@@ -185,6 +184,7 @@ class HashTable:
self,
values: np.ndarray, # np.ndarray[subclass-specific]
return_inverse: bool = ...,
+ mask=...,
) -> (
tuple[
np.ndarray, # np.ndarray[subclass-specific]
@@ -198,6 +198,7 @@ class HashTable:
na_sentinel: int = ...,
na_value: object = ...,
mask=...,
+ ignore_na: bool = True,
) -> tuple[np.ndarray, npt.NDArray[np.intp]]: ... # np.ndarray[subclass-specific]
class Complex128HashTable(HashTable): ...
@@ -240,7 +241,7 @@ def value_count(
values: np.ndarray,
dropna: bool,
mask: npt.NDArray[np.bool_] | None = ...,
-) -> tuple[np.ndarray, npt.NDArray[np.int64]]: ... # np.ndarray[same-as-values]
+) -> tuple[np.ndarray, npt.NDArray[np.int64], int]: ... # np.ndarray[same-as-values]
# arr and values should have same dtype
def ismember(
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index 1cf5d734705af..c0723392496c1 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -1239,9 +1239,10 @@ cdef class StringHashTable(HashTable):
na_value=na_value, ignore_na=ignore_na,
return_inverse=True)
+ # Add unused mask parameter for compat with other signatures
def get_labels(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None):
+ object na_value=None, object mask=None):
# -> np.ndarray[np.intp]
_, labels = self._unique(values, uniques, count_prior=count_prior,
na_sentinel=na_sentinel, na_value=na_value,
@@ -1496,9 +1497,10 @@ cdef class PyObjectHashTable(HashTable):
na_value=na_value, ignore_na=ignore_na,
return_inverse=True)
+ # Add unused mask parameter for compat with other signatures
def get_labels(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None):
+ object na_value=None, object mask=None):
# -> np.ndarray[np.intp]
_, labels = self._unique(values, uniques, count_prior=count_prior,
na_sentinel=na_sentinel, na_value=na_value,
diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in
index b9cf6011481af..336af306d410f 100644
--- a/pandas/_libs/hashtable_func_helper.pxi.in
+++ b/pandas/_libs/hashtable_func_helper.pxi.in
@@ -36,7 +36,7 @@ cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, const uint8_t
cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8_t[:] mask=None):
{{endif}}
cdef:
- Py_ssize_t i = 0
+ Py_ssize_t i = 0, na_counter = 0, na_add = 0
Py_ssize_t n = len(values)
kh_{{ttype}}_t *table
@@ -49,9 +49,6 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
bint uses_mask = mask is not None
bint isna_entry = False
- if uses_mask and not dropna:
- raise NotImplementedError("uses_mask not implemented with dropna=False")
-
# we track the order in which keys are first seen (GH39009),
# khash-map isn't insertion-ordered, thus:
# table maps keys to counts
@@ -82,25 +79,31 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
for i in range(n):
val = {{to_c_type}}(values[i])
+ if uses_mask:
+ isna_entry = mask[i]
+
if dropna:
- if uses_mask:
- isna_entry = mask[i]
- else:
+ if not uses_mask:
isna_entry = is_nan_{{c_type}}(val)
if not dropna or not isna_entry:
- k = kh_get_{{ttype}}(table, val)
- if k != table.n_buckets:
- table.vals[k] += 1
+ if uses_mask and isna_entry:
+ na_counter += 1
else:
- k = kh_put_{{ttype}}(table, val, &ret)
- table.vals[k] = 1
- result_keys.append(val)
+ k = kh_get_{{ttype}}(table, val)
+ if k != table.n_buckets:
+ table.vals[k] += 1
+ else:
+ k = kh_put_{{ttype}}(table, val, &ret)
+ table.vals[k] = 1
+ result_keys.append(val)
{{endif}}
# collect counts in the order corresponding to result_keys:
+ if na_counter > 0:
+ na_add = 1
cdef:
- int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64)
+ int64_t[::1] result_counts = np.empty(table.size + na_add, dtype=np.int64)
for i in range(table.size):
{{if dtype == 'object'}}
@@ -110,9 +113,13 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
{{endif}}
result_counts[i] = table.vals[k]
+ if na_counter > 0:
+ result_counts[table.size] = na_counter
+ result_keys.append(val)
+
kh_destroy_{{ttype}}(table)
- return result_keys.to_array(), result_counts.base
+ return result_keys.to_array(), result_counts.base, na_counter
@cython.wraparound(False)
@@ -397,12 +404,13 @@ def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):
cdef:
ndarray[htfunc_t] keys
ndarray[htfunc_t] modes
+ ndarray[uint8_t] res_mask = None
int64_t[::1] counts
- int64_t count, max_count = -1
- Py_ssize_t nkeys, k, j = 0
+ int64_t count, _, max_count = -1
+ Py_ssize_t nkeys, k, na_counter, j = 0
- keys, counts = value_count(values, dropna, mask=mask)
+ keys, counts, na_counter = value_count(values, dropna, mask=mask)
nkeys = len(keys)
modes = np.empty(nkeys, dtype=values.dtype)
@@ -433,7 +441,10 @@ def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):
modes[j] = keys[k]
- return modes[:j + 1]
+ if na_counter > 0:
+ res_mask = np.zeros(j+1, dtype=np.bool_)
+ res_mask[j] = True
+ return modes[:j + 1], res_mask
{{py:
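A hypothetical, user-level view of what the ``na_counter``/``res_mask`` plumbing above enables (not part of the patch): NA values on masked dtypes are counted directly instead of being rejected with ``NotImplementedError`` when ``dropna=False``.

    import pandas as pd

    s = pd.Series([1, 1, None, None, 3], dtype="Int64")  # nullable/masked dtype
    # dropna=False now flows through the masked fast path; the NA count is
    # carried via na_counter and surfaces as an explicit <NA> row.
    print(s.value_counts(dropna=False))
    print(s.mode(dropna=False))
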
diff --git a/pandas/_libs/include/pandas/datetime/date_conversions.h b/pandas/_libs/include/pandas/datetime/date_conversions.h
index a5ad926924dc5..e039991847a62 100644
--- a/pandas/_libs/include/pandas/datetime/date_conversions.h
+++ b/pandas/_libs/include/pandas/datetime/date_conversions.h
@@ -12,19 +12,13 @@ The full license is in the LICENSE file, distributed with this software.
#include
// Scales value inplace from nanosecond resolution to unit resolution
-int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit);
+int scaleNanosecToUnit(int64_t *value, NPY_DATETIMEUNIT unit);
// Converts an int64 object representing a date to ISO format
// up to precision `base` e.g. base="s" yields 2020-01-03T00:00:00Z
// while base="ns" yields "2020-01-01T00:00:00.000000000Z"
// len is mutated to save the length of the returned string
-char *int64ToIso(int64_t value,
- NPY_DATETIMEUNIT valueUnit,
- NPY_DATETIMEUNIT base,
- size_t *len);
-
-// TODO(username): this function doesn't do a lot; should augment or
-// replace with scaleNanosecToUnit
-npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base);
+char *int64ToIso(int64_t value, NPY_DATETIMEUNIT valueUnit,
+ NPY_DATETIMEUNIT base, size_t *len);
char *int64ToIsoDuration(int64_t value, size_t *len);
diff --git a/pandas/_libs/include/pandas/datetime/pd_datetime.h b/pandas/_libs/include/pandas/datetime/pd_datetime.h
index 3e362deb87807..98e5521af2506 100644
--- a/pandas/_libs/include/pandas/datetime/pd_datetime.h
+++ b/pandas/_libs/include/pandas/datetime/pd_datetime.h
@@ -19,12 +19,12 @@ See NUMPY_LICENSE.txt for the license.
#ifndef NPY_NO_DEPRECATED_API
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
-#endif // NPY_NO_DEPRECATED_API
+#endif // NPY_NO_DEPRECATED_API
-#include
+#include "pandas/datetime/date_conversions.h"
#include "pandas/vendored/numpy/datetime/np_datetime.h"
#include "pandas/vendored/numpy/datetime/np_datetime_strings.h"
-#include "pandas/datetime/date_conversions.h"
+#include
#ifdef __cplusplus
extern "C" {
@@ -33,9 +33,8 @@ extern "C" {
typedef struct {
npy_datetime (*npy_datetimestruct_to_datetime)(NPY_DATETIMEUNIT,
const npy_datetimestruct *);
- int (*scaleNanosecToUnit)(npy_int64 *, NPY_DATETIMEUNIT);
+ int (*scaleNanosecToUnit)(int64_t *, NPY_DATETIMEUNIT);
char *(*int64ToIso)(int64_t, NPY_DATETIMEUNIT, NPY_DATETIMEUNIT, size_t *);
- npy_datetime (*NpyDateTimeToEpoch)(npy_datetime, NPY_DATETIMEUNIT);
char *(*PyDateTimeToIso)(PyObject *, NPY_DATETIMEUNIT, size_t *);
npy_datetime (*PyDateTimeToEpoch)(PyObject *, NPY_DATETIMEUNIT);
char *(*int64ToIsoDuration)(int64_t, size_t *);
@@ -51,7 +50,7 @@ typedef struct {
NPY_DATETIMEUNIT *, int *, int *, const char *,
int, FormatRequirement);
int (*get_datetime_iso_8601_strlen)(int, NPY_DATETIMEUNIT);
- int (*make_iso_8601_datetime)(npy_datetimestruct *, char *, int, int,
+ int (*make_iso_8601_datetime)(npy_datetimestruct *, char *, size_t, int,
NPY_DATETIMEUNIT);
int (*make_iso_8601_timedelta)(pandas_timedeltastruct *, char *, size_t *);
} PandasDateTime_CAPI;
diff --git a/pandas/_libs/include/pandas/inline_helper.h b/pandas/_libs/include/pandas/inline_helper.h
deleted file mode 100644
index c77da0e52b9d3..0000000000000
--- a/pandas/_libs/include/pandas/inline_helper.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
-Copyright (c) 2016, PyData Development Team
-All rights reserved.
-
-Distributed under the terms of the BSD Simplified License.
-
-The full license is in the LICENSE file, distributed with this software.
-*/
-
-#pragma once
-
-#ifndef PANDAS_INLINE
- #if defined(__clang__)
- #define PANDAS_INLINE static __inline__ __attribute__ ((__unused__))
- #elif defined(__GNUC__)
- #define PANDAS_INLINE static __inline__
- #elif defined(_MSC_VER)
- #define PANDAS_INLINE static __inline
- #elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
- #define PANDAS_INLINE static inline
- #else
- #define PANDAS_INLINE
- #endif // __GNUC__
-#endif // PANDAS_INLINE
diff --git a/pandas/_libs/include/pandas/parser/io.h b/pandas/_libs/include/pandas/parser/io.h
index 9032eb6759358..c707c23b567d2 100644
--- a/pandas/_libs/include/pandas/parser/io.h
+++ b/pandas/_libs/include/pandas/parser/io.h
@@ -10,22 +10,22 @@ The full license is in the LICENSE file, distributed with this software.
#pragma once
#define PY_SSIZE_T_CLEAN
-#include
#include "tokenizer.h"
+#include
#define FS(source) ((file_source *)source)
typedef struct _rd_source {
- PyObject *obj;
- PyObject *buffer;
- size_t position;
+ PyObject *obj;
+ PyObject *buffer;
+ size_t position;
} rd_source;
#define RDS(source) ((rd_source *)source)
void *new_rd_source(PyObject *obj);
-int del_rd_source(void *src);
+void del_rd_source(void *src);
-void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
+char *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
int *status, const char *encoding_errors);
diff --git a/pandas/_libs/include/pandas/parser/pd_parser.h b/pandas/_libs/include/pandas/parser/pd_parser.h
index 1ea94fde593ef..58a09ae1bba39 100644
--- a/pandas/_libs/include/pandas/parser/pd_parser.h
+++ b/pandas/_libs/include/pandas/parser/pd_parser.h
@@ -13,15 +13,15 @@ extern "C" {
#endif
#define PY_SSIZE_T_CLEAN
-#include
#include "pandas/parser/tokenizer.h"
+#include
typedef struct {
int (*to_double)(char *, double *, char, char, int *);
int (*floatify)(PyObject *, double *, int *);
void *(*new_rd_source)(PyObject *);
- int (*del_rd_source)(void *);
- void *(*buffer_rd_bytes)(void *, size_t, size_t *, int *, const char *);
+ void (*del_rd_source)(void *);
+ char *(*buffer_rd_bytes)(void *, size_t, size_t *, int *, const char *);
void (*uint_state_init)(uint_state *);
int (*uint64_conflict)(uint_state *);
void (*coliter_setup)(coliter_t *, parser_t *, int64_t, int64_t);
@@ -30,7 +30,7 @@ typedef struct {
void (*parser_free)(parser_t *);
void (*parser_del)(parser_t *);
int (*parser_add_skiprow)(parser_t *, int64_t);
- int (*parser_set_skipfirstnrows)(parser_t *, int64_t);
+ void (*parser_set_skipfirstnrows)(parser_t *, int64_t);
void (*parser_set_default_options)(parser_t *);
int (*parser_consume_rows)(parser_t *, size_t);
int (*parser_trim_buffers)(parser_t *);
@@ -81,11 +81,10 @@ static PandasParser_CAPI *PandasParserAPI = NULL;
PandasParserAPI->parser_set_default_options((self))
#define parser_consume_rows(self, nrows) \
PandasParserAPI->parser_consume_rows((self), (nrows))
-#define parser_trim_buffers(self) \
- PandasParserAPI->parser_trim_buffers((self))
-#define tokenize_all_rows(self, encoding_errors) \
+#define parser_trim_buffers(self) PandasParserAPI->parser_trim_buffers((self))
+#define tokenize_all_rows(self, encoding_errors) \
PandasParserAPI->tokenize_all_rows((self), (encoding_errors))
-#define tokenize_nrows(self, nrows, encoding_errors) \
+#define tokenize_nrows(self, nrows, encoding_errors) \
PandasParserAPI->tokenize_nrows((self), (nrows), (encoding_errors))
#define str_to_int64(p_item, int_min, int_max, error, t_sep) \
PandasParserAPI->str_to_int64((p_item), (int_min), (int_max), (error), \
@@ -104,7 +103,7 @@ static PandasParser_CAPI *PandasParserAPI = NULL;
PandasParserAPI->round_trip((p), (q), (decimal), (sci), (tsep), \
(skip_trailing), (error), (maybe_int))
#define to_boolean(item, val) PandasParserAPI->to_boolean((item), (val))
-#endif /* !defined(_PANDAS_PARSER_IMPL) */
+#endif /* !defined(_PANDAS_PARSER_IMPL) */
#ifdef __cplusplus
}
diff --git a/pandas/_libs/include/pandas/parser/tokenizer.h b/pandas/_libs/include/pandas/parser/tokenizer.h
index a53d09012116d..209f375a5bf6c 100644
--- a/pandas/_libs/include/pandas/parser/tokenizer.h
+++ b/pandas/_libs/include/pandas/parser/tokenizer.h
@@ -19,17 +19,12 @@ See LICENSE for the license
#define ERROR_INVALID_CHARS 3
#include
-#include "pandas/inline_helper.h"
-#include "pandas/portable.h"
-
-#include "pandas/vendored/klib/khash.h"
#define STREAM_INIT_SIZE 32
#define REACHED_EOF 1
#define CALLING_READ_FAILED 2
-
/*
C flat file parsing low level code for pandas / NumPy
@@ -46,7 +41,7 @@ See LICENSE for the license
#define TRACE(X) printf X;
#else
#define TRACE(X)
-#endif // VERBOSE
+#endif // VERBOSE
#define PARSER_OUT_OF_MEMORY -1
@@ -56,131 +51,127 @@ See LICENSE for the license
*/
typedef enum {
- START_RECORD,
- START_FIELD,
- ESCAPED_CHAR,
- IN_FIELD,
- IN_QUOTED_FIELD,
- ESCAPE_IN_QUOTED_FIELD,
- QUOTE_IN_QUOTED_FIELD,
- EAT_CRNL,
- EAT_CRNL_NOP,
- EAT_WHITESPACE,
- EAT_COMMENT,
- EAT_LINE_COMMENT,
- WHITESPACE_LINE,
- START_FIELD_IN_SKIP_LINE,
- IN_FIELD_IN_SKIP_LINE,
- IN_QUOTED_FIELD_IN_SKIP_LINE,
- QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE,
- FINISHED
+ START_RECORD,
+ START_FIELD,
+ ESCAPED_CHAR,
+ IN_FIELD,
+ IN_QUOTED_FIELD,
+ ESCAPE_IN_QUOTED_FIELD,
+ QUOTE_IN_QUOTED_FIELD,
+ EAT_CRNL,
+ EAT_CRNL_NOP,
+ EAT_WHITESPACE,
+ EAT_COMMENT,
+ EAT_LINE_COMMENT,
+ WHITESPACE_LINE,
+ START_FIELD_IN_SKIP_LINE,
+ IN_FIELD_IN_SKIP_LINE,
+ IN_QUOTED_FIELD_IN_SKIP_LINE,
+ QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE,
+ FINISHED
} ParserState;
typedef enum {
- QUOTE_MINIMAL,
- QUOTE_ALL,
- QUOTE_NONNUMERIC,
- QUOTE_NONE
+ QUOTE_MINIMAL,
+ QUOTE_ALL,
+ QUOTE_NONNUMERIC,
+ QUOTE_NONE
} QuoteStyle;
-typedef enum {
- ERROR,
- WARN,
- SKIP
-} BadLineHandleMethod;
+typedef enum { ERROR, WARN, SKIP } BadLineHandleMethod;
-typedef void *(*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
+typedef char *(*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
int *status, const char *encoding_errors);
-typedef int (*io_cleanup)(void *src);
+typedef void (*io_cleanup)(void *src);
typedef struct parser_t {
- void *source;
- io_callback cb_io;
- io_cleanup cb_cleanup;
-
- int64_t chunksize; // Number of bytes to prepare for each chunk
- char *data; // pointer to data to be processed
- int64_t datalen; // amount of data available
- int64_t datapos;
-
- // where to write out tokenized data
- char *stream;
- uint64_t stream_len;
- uint64_t stream_cap;
-
- // Store words in (potentially ragged) matrix for now, hmm
- char **words;
- int64_t *word_starts; // where we are in the stream
- uint64_t words_len;
- uint64_t words_cap;
- uint64_t max_words_cap; // maximum word cap encountered
-
- char *pword_start; // pointer to stream start of current field
- int64_t word_start; // position start of current field
-
- int64_t *line_start; // position in words for start of line
- int64_t *line_fields; // Number of fields in each line
- uint64_t lines; // Number of (good) lines observed
- uint64_t file_lines; // Number of lines (including bad or skipped)
- uint64_t lines_cap; // Vector capacity
-
- // Tokenizing stuff
- ParserState state;
- int doublequote; /* is " represented by ""? */
- char delimiter; /* field separator */
- int delim_whitespace; /* delimit by consuming space/tabs instead */
- char quotechar; /* quote character */
- char escapechar; /* escape character */
- char lineterminator;
- int skipinitialspace; /* ignore spaces following delimiter? */
- int quoting; /* style of quoting to write */
-
- char commentchar;
- int allow_embedded_newline;
-
- int usecols; // Boolean: 1: usecols provided, 0: none provided
-
- Py_ssize_t expected_fields;
- BadLineHandleMethod on_bad_lines;
-
- // floating point options
- char decimal;
- char sci;
-
- // thousands separator (comma, period)
- char thousands;
-
- int header; // Boolean: 1: has header, 0: no header
- int64_t header_start; // header row start
- uint64_t header_end; // header row end
-
- void *skipset;
- PyObject *skipfunc;
- int64_t skip_first_N_rows;
- int64_t skip_footer;
- double (*double_converter)(const char *, char **,
- char, char, char, int, int *, int *);
-
- // error handling
- char *warn_msg;
- char *error_msg;
-
- int skip_empty_lines;
+ void *source;
+ io_callback cb_io;
+ io_cleanup cb_cleanup;
+
+ int64_t chunksize; // Number of bytes to prepare for each chunk
+ char *data; // pointer to data to be processed
+ int64_t datalen; // amount of data available
+ int64_t datapos;
+
+ // where to write out tokenized data
+ char *stream;
+ uint64_t stream_len;
+ uint64_t stream_cap;
+
+ // Store words in (potentially ragged) matrix for now, hmm
+ char **words;
+ int64_t *word_starts; // where we are in the stream
+ uint64_t words_len;
+ uint64_t words_cap;
+ uint64_t max_words_cap; // maximum word cap encountered
+
+ char *pword_start; // pointer to stream start of current field
+ int64_t word_start; // position start of current field
+
+ int64_t *line_start; // position in words for start of line
+ int64_t *line_fields; // Number of fields in each line
+ uint64_t lines; // Number of (good) lines observed
+ uint64_t file_lines; // Number of lines (including bad or skipped)
+ uint64_t lines_cap; // Vector capacity
+
+ // Tokenizing stuff
+ ParserState state;
+ int doublequote; /* is " represented by ""? */
+ char delimiter; /* field separator */
+ int delim_whitespace; /* delimit by consuming space/tabs instead */
+ char quotechar; /* quote character */
+ char escapechar; /* escape character */
+ char lineterminator;
+ int skipinitialspace; /* ignore spaces following delimiter? */
+ int quoting; /* style of quoting to write */
+
+ char commentchar;
+ int allow_embedded_newline;
+
+ int usecols; // Boolean: 1: usecols provided, 0: none provided
+
+ Py_ssize_t expected_fields;
+ BadLineHandleMethod on_bad_lines;
+
+ // floating point options
+ char decimal;
+ char sci;
+
+ // thousands separator (comma, period)
+ char thousands;
+
+ int header; // Boolean: 1: has header, 0: no header
+ int64_t header_start; // header row start
+ uint64_t header_end; // header row end
+
+ void *skipset;
+ PyObject *skipfunc;
+ int64_t skip_first_N_rows;
+ int64_t skip_footer;
+ double (*double_converter)(const char *, char **, char, char, char, int,
+ int *, int *);
+
+ // error handling
+ char *warn_msg;
+ char *error_msg;
+
+ int skip_empty_lines;
} parser_t;
typedef struct coliter_t {
- char **words;
- int64_t *line_start;
- int64_t col;
+ char **words;
+ int64_t *line_start;
+ int64_t col;
} coliter_t;
void coliter_setup(coliter_t *self, parser_t *parser, int64_t i, int64_t start);
-#define COLITER_NEXT(iter, word) \
- do { \
- const int64_t i = *iter.line_start++ + iter.col; \
- word = i >= *iter.line_start ? "" : iter.words[i]; \
- } while (0)
+#define COLITER_NEXT(iter, word) \
+ do { \
+ const int64_t i = *iter.line_start++ + iter.col; \
+ word = i >= *iter.line_start ? "" : iter.words[i]; \
+ } while (0)
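The reformatted COLITER_NEXT above walks the per-row offsets and yields the field belonging to this iterator's column, falling back to "" when a row has fewer fields than the requested column. A function-form sketch of the same logic (not part of the header; the include path mirrors how other pandas C sources reference it):

```c
#include "pandas/parser/tokenizer.h" // path assumed; provides coliter_t

// Sketch: line_start[k] is the offset into `words` where row k begins, so
// line_start[k] + col indexes this column's field for row k; a row shorter
// than `col` yields the empty string, exactly as in COLITER_NEXT.
static const char *coliter_next_sketch(coliter_t *iter) {
  const int64_t i = *iter->line_start++ + iter->col;
  return i >= *iter->line_start ? "" : iter->words[i];
}
```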
parser_t *parser_new(void);
@@ -192,7 +183,7 @@ int parser_trim_buffers(parser_t *self);
int parser_add_skiprow(parser_t *self, int64_t row);
-int parser_set_skipfirstnrows(parser_t *self, int64_t nrows);
+void parser_set_skipfirstnrows(parser_t *self, int64_t nrows);
void parser_free(parser_t *self);
@@ -208,9 +199,9 @@ int tokenize_all_rows(parser_t *self, const char *encoding_errors);
// and want to free memory from the token stream
typedef struct uint_state {
- int seen_sint;
- int seen_uint;
- int seen_null;
+ int seen_sint;
+ int seen_uint;
+ int seen_null;
} uint_state;
void uint_state_init(uint_state *self);
@@ -223,9 +214,9 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
int *error, char tsep);
double xstrtod(const char *p, char **q, char decimal, char sci, char tsep,
int skip_trailing, int *error, int *maybe_int);
-double precise_xstrtod(const char *p, char **q, char decimal,
- char sci, char tsep, int skip_trailing,
- int *error, int *maybe_int);
+double precise_xstrtod(const char *p, char **q, char decimal, char sci,
+ char tsep, int skip_trailing, int *error,
+ int *maybe_int);
// GH-15140 - round_trip requires and acquires the GIL on its own
double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
diff --git a/pandas/_libs/include/pandas/portable.h b/pandas/_libs/include/pandas/portable.h
index 954b5c3cce082..1d0509d9e9724 100644
--- a/pandas/_libs/include/pandas/portable.h
+++ b/pandas/_libs/include/pandas/portable.h
@@ -16,9 +16,22 @@ The full license is in the LICENSE file, distributed with this software.
#endif
// GH-23516 - works around locale perf issues
-// from MUSL libc, MIT Licensed - see LICENSES
+// from MUSL libc, licence at LICENSES/MUSL_LICENSE
#define isdigit_ascii(c) (((unsigned)(c) - '0') < 10u)
-#define getdigit_ascii(c, default) (isdigit_ascii(c) ? ((int)((c) - '0')) : default)
+#define getdigit_ascii(c, default) \
+ (isdigit_ascii(c) ? ((int)((c) - '0')) : default)
#define isspace_ascii(c) (((c) == ' ') || (((unsigned)(c) - '\t') < 5))
#define toupper_ascii(c) ((((unsigned)(c) - 'a') < 26) ? ((c) & 0x5f) : (c))
#define tolower_ascii(c) ((((unsigned)(c) - 'A') < 26) ? ((c) | 0x20) : (c))
+
+#if defined(_WIN32)
+#define PD_FALLTHROUGH \
+ do { \
+ } while (0) /* fallthrough */
+#elif __has_attribute(__fallthrough__)
+#define PD_FALLTHROUGH __attribute__((__fallthrough__))
+#else
+#define PD_FALLTHROUGH \
+ do { \
+ } while (0) /* fallthrough */
+#endif
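PD_FALLTHROUGH is new in this patch: it lets intentional switch fall-through be annotated so implicit-fallthrough warnings stay quiet where __attribute__((__fallthrough__)) is available, and collapses to a harmless empty statement elsewhere. A minimal usage sketch (not from the patch; the include path is assumed):

```c
#include "pandas/portable.h" // path assumed

// Sketch: '-' records the sign and then deliberately falls through to the
// code shared with '+'; PD_FALLTHROUGH documents that intent.
static int parse_sign(char c, int *negative) {
  switch (c) {
  case '-':
    *negative = 1;
    PD_FALLTHROUGH; // fall through on purpose
  case '+':
    return 1; // a sign character was consumed
  default:
    return 0;
  }
}
```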
diff --git a/pandas/_libs/include/pandas/skiplist.h b/pandas/_libs/include/pandas/skiplist.h
index 3be9e51f42e09..57e304b3a66c0 100644
--- a/pandas/_libs/include/pandas/skiplist.h
+++ b/pandas/_libs/include/pandas/skiplist.h
@@ -19,279 +19,278 @@ Python recipe (https://rhettinger.wordpress.com/2010/02/06/lost-knowledge/)
#include
#include
#include
-#include "pandas/inline_helper.h"
-
-PANDAS_INLINE float __skiplist_nanf(void) {
- const union {
- int __i;
- float __f;
- } __bint = {0x7fc00000UL};
- return __bint.__f;
+
+static inline float __skiplist_nanf(void) {
+ const union {
+ int __i;
+ float __f;
+ } __bint = {0x7fc00000UL};
+ return __bint.__f;
}
#define PANDAS_NAN ((double)__skiplist_nanf())
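__skiplist_nanf builds the PANDAS_NAN constant by type-punning the canonical IEEE 754 binary32 quiet-NaN bit pattern through a union rather than relying on a NAN macro. A self-contained sketch of the same trick (not pandas code):

```c
#include <stdio.h>

// Sketch: 0x7fc00000 is the canonical quiet-NaN pattern for binary32
// (exponent all ones, top mantissa bit set), the same value used above.
static float quiet_nanf_sketch(void) {
  const union {
    unsigned int i;
    float f;
  } bits = {0x7fc00000U};
  return bits.f;
}

int main(void) {
  const float x = quiet_nanf_sketch();
  printf("%d\n", x != x); // prints 1: NaN never compares equal to itself
  return 0;
}
```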
-PANDAS_INLINE double Log2(double val) { return log(val) / log(2.); }
+static inline double Log2(double val) { return log(val) / log(2.); }
typedef struct node_t node_t;
struct node_t {
- node_t **next;
- int *width;
- double value;
- int is_nil;
- int levels;
- int ref_count;
+ node_t **next;
+ int *width;
+ double value;
+ int is_nil;
+ int levels;
+ int ref_count;
};
typedef struct {
- node_t *head;
- node_t **tmp_chain;
- int *tmp_steps;
- int size;
- int maxlevels;
+ node_t *head;
+ node_t **tmp_chain;
+ int *tmp_steps;
+ int size;
+ int maxlevels;
} skiplist_t;
-PANDAS_INLINE double urand(void) {
- return ((double)rand() + 1) / ((double)RAND_MAX + 2);
+static inline double urand(void) {
+ return ((double)rand() + 1) / ((double)RAND_MAX + 2);
}
-PANDAS_INLINE int int_min(int a, int b) { return a < b ? a : b; }
-
-PANDAS_INLINE node_t *node_init(double value, int levels) {
- node_t *result;
- result = (node_t *)malloc(sizeof(node_t));
- if (result) {
- result->value = value;
- result->levels = levels;
- result->is_nil = 0;
- result->ref_count = 0;
- result->next = (node_t **)malloc(levels * sizeof(node_t *));
- result->width = (int *)malloc(levels * sizeof(int));
- if (!(result->next && result->width) && (levels != 0)) {
- free(result->next);
- free(result->width);
- free(result);
- return NULL;
- }
+static inline int int_min(int a, int b) { return a < b ? a : b; }
+
+static inline node_t *node_init(double value, int levels) {
+ node_t *result;
+ result = (node_t *)malloc(sizeof(node_t));
+ if (result) {
+ result->value = value;
+ result->levels = levels;
+ result->is_nil = 0;
+ result->ref_count = 0;
+ result->next = (node_t **)malloc(levels * sizeof(node_t *));
+ result->width = (int *)malloc(levels * sizeof(int));
+ if (!(result->next && result->width) && (levels != 0)) {
+ free(result->next);
+ free(result->width);
+ free(result);
+ return NULL;
}
- return result;
+ }
+ return result;
}
// do this ourselves
-PANDAS_INLINE void node_incref(node_t *node) { ++(node->ref_count); }
+static inline void node_incref(node_t *node) { ++(node->ref_count); }
-PANDAS_INLINE void node_decref(node_t *node) { --(node->ref_count); }
+static inline void node_decref(node_t *node) { --(node->ref_count); }
static void node_destroy(node_t *node) {
- int i;
- if (node) {
- if (node->ref_count <= 1) {
- for (i = 0; i < node->levels; ++i) {
- node_destroy(node->next[i]);
- }
- free(node->next);
- free(node->width);
- // printf("Reference count was 1, freeing\n");
- free(node);
- } else {
- node_decref(node);
- }
- // pretty sure that freeing the struct above will be enough
+ int i;
+ if (node) {
+ if (node->ref_count <= 1) {
+ for (i = 0; i < node->levels; ++i) {
+ node_destroy(node->next[i]);
+ }
+ free(node->next);
+ free(node->width);
+ // printf("Reference count was 1, freeing\n");
+ free(node);
+ } else {
+ node_decref(node);
}
+ // pretty sure that freeing the struct above will be enough
+ }
}
-PANDAS_INLINE void skiplist_destroy(skiplist_t *skp) {
- if (skp) {
- node_destroy(skp->head);
- free(skp->tmp_steps);
- free(skp->tmp_chain);
- free(skp);
- }
+static inline void skiplist_destroy(skiplist_t *skp) {
+ if (skp) {
+ node_destroy(skp->head);
+ free(skp->tmp_steps);
+ free(skp->tmp_chain);
+ free(skp);
+ }
}
-PANDAS_INLINE skiplist_t *skiplist_init(int expected_size) {
- skiplist_t *result;
- node_t *NIL, *head;
- int maxlevels, i;
-
- maxlevels = 1 + Log2((double)expected_size);
- result = (skiplist_t *)malloc(sizeof(skiplist_t));
- if (!result) {
- return NULL;
- }
- result->tmp_chain = (node_t **)malloc(maxlevels * sizeof(node_t *));
- result->tmp_steps = (int *)malloc(maxlevels * sizeof(int));
- result->maxlevels = maxlevels;
- result->size = 0;
-
- head = result->head = node_init(PANDAS_NAN, maxlevels);
- NIL = node_init(0.0, 0);
-
- if (!(result->tmp_chain && result->tmp_steps && result->head && NIL)) {
- skiplist_destroy(result);
- node_destroy(NIL);
- return NULL;
- }
-
- node_incref(head);
-
- NIL->is_nil = 1;
-
- for (i = 0; i < maxlevels; ++i) {
- head->next[i] = NIL;
- head->width[i] = 1;
- node_incref(NIL);
- }
-
- return result;
+static inline skiplist_t *skiplist_init(int expected_size) {
+ skiplist_t *result;
+ node_t *NIL, *head;
+ int maxlevels, i;
+
+ maxlevels = 1 + Log2((double)expected_size);
+ result = (skiplist_t *)malloc(sizeof(skiplist_t));
+ if (!result) {
+ return NULL;
+ }
+ result->tmp_chain = (node_t **)malloc(maxlevels * sizeof(node_t *));
+ result->tmp_steps = (int *)malloc(maxlevels * sizeof(int));
+ result->maxlevels = maxlevels;
+ result->size = 0;
+
+ head = result->head = node_init(PANDAS_NAN, maxlevels);
+ NIL = node_init(0.0, 0);
+
+ if (!(result->tmp_chain && result->tmp_steps && result->head && NIL)) {
+ skiplist_destroy(result);
+ node_destroy(NIL);
+ return NULL;
+ }
+
+ node_incref(head);
+
+ NIL->is_nil = 1;
+
+ for (i = 0; i < maxlevels; ++i) {
+ head->next[i] = NIL;
+ head->width[i] = 1;
+ node_incref(NIL);
+ }
+
+ return result;
}
// 1 if left < right, 0 if left == right, -1 if left > right
-PANDAS_INLINE int _node_cmp(node_t *node, double value) {
- if (node->is_nil || node->value > value) {
- return -1;
- } else if (node->value < value) {
- return 1;
- } else {
- return 0;
- }
+static inline int _node_cmp(node_t *node, double value) {
+ if (node->is_nil || node->value > value) {
+ return -1;
+ } else if (node->value < value) {
+ return 1;
+ } else {
+ return 0;
+ }
}
-PANDAS_INLINE double skiplist_get(skiplist_t *skp, int i, int *ret) {
- node_t *node;
- int level;
-
- if (i < 0 || i >= skp->size) {
- *ret = 0;
- return 0;
- }
-
- node = skp->head;
- ++i;
- for (level = skp->maxlevels - 1; level >= 0; --level) {
- while (node->width[level] <= i) {
- i -= node->width[level];
- node = node->next[level];
- }
+static inline double skiplist_get(skiplist_t *skp, int i, int *ret) {
+ node_t *node;
+ int level;
+
+ if (i < 0 || i >= skp->size) {
+ *ret = 0;
+ return 0;
+ }
+
+ node = skp->head;
+ ++i;
+ for (level = skp->maxlevels - 1; level >= 0; --level) {
+ while (node->width[level] <= i) {
+ i -= node->width[level];
+ node = node->next[level];
}
+ }
- *ret = 1;
- return node->value;
+ *ret = 1;
+ return node->value;
}
// Returns the lowest rank of all elements with value `value`, as opposed to the
// highest rank returned by `skiplist_insert`.
-PANDAS_INLINE int skiplist_min_rank(skiplist_t *skp, double value) {
- node_t *node;
- int level, rank = 0;
-
- node = skp->head;
- for (level = skp->maxlevels - 1; level >= 0; --level) {
- while (_node_cmp(node->next[level], value) > 0) {
- rank += node->width[level];
- node = node->next[level];
- }
+static inline int skiplist_min_rank(skiplist_t *skp, double value) {
+ node_t *node;
+ int level, rank = 0;
+
+ node = skp->head;
+ for (level = skp->maxlevels - 1; level >= 0; --level) {
+ while (_node_cmp(node->next[level], value) > 0) {
+ rank += node->width[level];
+ node = node->next[level];
}
+ }
- return rank + 1;
+ return rank + 1;
}
// Returns the rank of the inserted element. When there are duplicates,
// `rank` is the highest of the group, i.e. the 'max' method of
// https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rank.html
-PANDAS_INLINE int skiplist_insert(skiplist_t *skp, double value) {
- node_t *node, *prevnode, *newnode, *next_at_level;
- int *steps_at_level;
- int size, steps, level, rank = 0;
- node_t **chain;
-
- chain = skp->tmp_chain;
-
- steps_at_level = skp->tmp_steps;
- memset(steps_at_level, 0, skp->maxlevels * sizeof(int));
-
- node = skp->head;
-
- for (level = skp->maxlevels - 1; level >= 0; --level) {
- next_at_level = node->next[level];
- while (_node_cmp(next_at_level, value) >= 0) {
- steps_at_level[level] += node->width[level];
- rank += node->width[level];
- node = next_at_level;
- next_at_level = node->next[level];
- }
- chain[level] = node;
+static inline int skiplist_insert(skiplist_t *skp, double value) {
+ node_t *node, *prevnode, *newnode, *next_at_level;
+ int *steps_at_level;
+ int size, steps, level, rank = 0;
+ node_t **chain;
+
+ chain = skp->tmp_chain;
+
+ steps_at_level = skp->tmp_steps;
+ memset(steps_at_level, 0, skp->maxlevels * sizeof(int));
+
+ node = skp->head;
+
+ for (level = skp->maxlevels - 1; level >= 0; --level) {
+ next_at_level = node->next[level];
+ while (_node_cmp(next_at_level, value) >= 0) {
+ steps_at_level[level] += node->width[level];
+ rank += node->width[level];
+ node = next_at_level;
+ next_at_level = node->next[level];
}
+ chain[level] = node;
+ }
- size = int_min(skp->maxlevels, 1 - ((int)Log2(urand())));
+ size = int_min(skp->maxlevels, 1 - ((int)Log2(urand())));
- newnode = node_init(value, size);
- if (!newnode) {
- return -1;
- }
- steps = 0;
+ newnode = node_init(value, size);
+ if (!newnode) {
+ return -1;
+ }
+ steps = 0;
- for (level = 0; level < size; ++level) {
- prevnode = chain[level];
- newnode->next[level] = prevnode->next[level];
+ for (level = 0; level < size; ++level) {
+ prevnode = chain[level];
+ newnode->next[level] = prevnode->next[level];
- prevnode->next[level] = newnode;
- node_incref(newnode); // increment the reference count
+ prevnode->next[level] = newnode;
+ node_incref(newnode); // increment the reference count
- newnode->width[level] = prevnode->width[level] - steps;
- prevnode->width[level] = steps + 1;
+ newnode->width[level] = prevnode->width[level] - steps;
+ prevnode->width[level] = steps + 1;
- steps += steps_at_level[level];
- }
+ steps += steps_at_level[level];
+ }
- for (level = size; level < skp->maxlevels; ++level) {
- chain[level]->width[level] += 1;
- }
+ for (level = size; level < skp->maxlevels; ++level) {
+ chain[level]->width[level] += 1;
+ }
- ++(skp->size);
+ ++(skp->size);
- return rank + 1;
+ return rank + 1;
}
-PANDAS_INLINE int skiplist_remove(skiplist_t *skp, double value) {
- int level, size;
- node_t *node, *prevnode, *tmpnode, *next_at_level;
- node_t **chain;
-
- chain = skp->tmp_chain;
- node = skp->head;
-
- for (level = skp->maxlevels - 1; level >= 0; --level) {
- next_at_level = node->next[level];
- while (_node_cmp(next_at_level, value) > 0) {
- node = next_at_level;
- next_at_level = node->next[level];
- }
- chain[level] = node;
- }
+static inline int skiplist_remove(skiplist_t *skp, double value) {
+ int level, size;
+ node_t *node, *prevnode, *tmpnode, *next_at_level;
+ node_t **chain;
+
+ chain = skp->tmp_chain;
+ node = skp->head;
- if (value != chain[0]->next[0]->value) {
- return 0;
+ for (level = skp->maxlevels - 1; level >= 0; --level) {
+ next_at_level = node->next[level];
+ while (_node_cmp(next_at_level, value) > 0) {
+ node = next_at_level;
+ next_at_level = node->next[level];
}
+ chain[level] = node;
+ }
- size = chain[0]->next[0]->levels;
+ if (value != chain[0]->next[0]->value) {
+ return 0;
+ }
- for (level = 0; level < size; ++level) {
- prevnode = chain[level];
+ size = chain[0]->next[0]->levels;
- tmpnode = prevnode->next[level];
+ for (level = 0; level < size; ++level) {
+ prevnode = chain[level];
- prevnode->width[level] += tmpnode->width[level] - 1;
- prevnode->next[level] = tmpnode->next[level];
+ tmpnode = prevnode->next[level];
- tmpnode->next[level] = NULL;
- node_destroy(tmpnode); // decrement refcount or free
- }
+ prevnode->width[level] += tmpnode->width[level] - 1;
+ prevnode->next[level] = tmpnode->next[level];
- for (level = size; level < skp->maxlevels; ++level) {
- --(chain[level]->width[level]);
- }
+ tmpnode->next[level] = NULL;
+ node_destroy(tmpnode); // decrement refcount or free
+ }
- --(skp->size);
- return 1;
+ for (level = size; level < skp->maxlevels; ++level) {
+ --(chain[level]->width[level]);
+ }
+
+ --(skp->size);
+ return 1;
}
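Taken as a whole, the header above is an indexable skiplist: insertions keep values sorted and maintain per-level widths, so skiplist_get can fetch the i-th smallest element by rank, which is what pandas' rolling median/quantile kernels use it for. A usage sketch (assuming the pandas include directory is on the include path):

```c
#include <stdio.h>
#include "pandas/skiplist.h" // path assumed

int main(void) {
  int ok = 0;
  skiplist_t *sl = skiplist_init(8); // hint for the expected element count
  if (sl == NULL)
    return 1;
  skiplist_insert(sl, 3.0);
  skiplist_insert(sl, 1.0);
  skiplist_insert(sl, 2.0);
  // Values are kept sorted, so rank 1 (0-based) is the median of three.
  const double median = skiplist_get(sl, sl->size / 2, &ok);
  printf("median=%g ok=%d\n", median, ok); // median=2 ok=1
  skiplist_remove(sl, 1.0);
  skiplist_destroy(sl);
  return 0;
}
```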
diff --git a/pandas/_libs/include/pandas/vendored/klib/khash.h b/pandas/_libs/include/pandas/vendored/klib/khash.h
index 95b25d053a9df..f072106e09596 100644
--- a/pandas/_libs/include/pandas/vendored/klib/khash.h
+++ b/pandas/_libs/include/pandas/vendored/klib/khash.h
@@ -1,27 +1,4 @@
-/* The MIT License
-
- Copyright (c) 2008, 2009, 2011 by Attractive Chaos
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
+// Licence at LICENSES/KLIB_LICENSE
/*
An example:
@@ -29,38 +6,38 @@
#include "khash.h"
KHASH_MAP_INIT_INT(32, char)
int main() {
- int ret, is_missing;
- khiter_t k;
- khash_t(32) *h = kh_init(32);
- k = kh_put(32, h, 5, &ret);
- if (!ret) kh_del(32, h, k);
- kh_value(h, k) = 10;
- k = kh_get(32, h, 10);
- is_missing = (k == kh_end(h));
- k = kh_get(32, h, 5);
- kh_del(32, h, k);
- for (k = kh_begin(h); k != kh_end(h); ++k)
- if (kh_exist(h, k)) kh_value(h, k) = 1;
- kh_destroy(32, h);
- return 0;
+ int ret, is_missing;
+ khiter_t k;
+ khash_t(32) *h = kh_init(32);
+ k = kh_put(32, h, 5, &ret);
+ if (!ret) kh_del(32, h, k);
+ kh_value(h, k) = 10;
+ k = kh_get(32, h, 10);
+ is_missing = (k == kh_end(h));
+ k = kh_get(32, h, 5);
+ kh_del(32, h, k);
+ for (k = kh_begin(h); k != kh_end(h); ++k)
+ if (kh_exist(h, k)) kh_value(h, k) = 1;
+ kh_destroy(32, h);
+ return 0;
}
*/
/*
2011-09-16 (0.2.6):
- * The capacity is a power of 2. This seems to dramatically improve the
- speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
+ * The capacity is a power of 2. This seems to dramatically improve the
+ speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
- - https://github.com/stefanocasazza/ULib
- - https://nothings.org/computer/judy/
+ - https://github.com/stefanocasazza/ULib
+ - https://nothings.org/computer/judy/
- * Allow to optionally use linear probing which usually has better
- performance for random input. Double hashing is still the default as it
- is more robust to certain non-random input.
+ * Allow to optionally use linear probing which usually has better
+ performance for random input. Double hashing is still the default as
+ it is more robust to certain non-random input.
- * Added Wang's integer hash function (not used by default). This hash
- function is more robust to certain non-random input.
+ * Added Wang's integer hash function (not used by default). This hash
+ function is more robust to certain non-random input.
2011-02-14 (0.2.5):
@@ -72,32 +49,31 @@ int main() {
2008-09-19 (0.2.3):
- * Corrected the example
- * Improved interfaces
+ * Corrected the example
+ * Improved interfaces
2008-09-11 (0.2.2):
- * Improved speed a little in kh_put()
+ * Improved speed a little in kh_put()
2008-09-10 (0.2.1):
- * Added kh_clear()
- * Fixed a compiling error
+ * Added kh_clear()
+ * Fixed a compiling error
2008-09-02 (0.2.0):
- * Changed to token concatenation which increases flexibility.
+ * Changed to token concatenation which increases flexibility.
2008-08-31 (0.1.2):
- * Fixed a bug in kh_get(), which has not been tested previously.
+ * Fixed a bug in kh_get(), which has not been tested previously.
2008-08-31 (0.1.1):
- * Added destructor
+ * Added destructor
*/
-
#ifndef __AC_KHASH_H
#define __AC_KHASH_H
@@ -109,11 +85,9 @@ int main() {
#define AC_VERSION_KHASH_H "0.2.6"
+#include
#include
#include
-#include
-#include "pandas/inline_helper.h"
-
// hooks for memory allocator, C-runtime allocator used per default
#ifndef KHASH_MALLOC
@@ -132,7 +106,6 @@ int main() {
#define KHASH_FREE free
#endif
-
#if UINT_MAX == 0xffffffffu
typedef unsigned int khuint32_t;
typedef signed int khint32_t;
@@ -168,262 +141,311 @@ typedef float khfloat32_t;
typedef khuint32_t khuint_t;
typedef khuint_t khiter_t;
-#define __ac_isempty(flag, i) ((flag[i>>5]>>(i&0x1fU))&1)
+#define __ac_isempty(flag, i) ((flag[i >> 5] >> (i & 0x1fU)) & 1)
#define __ac_isdel(flag, i) (0)
#define __ac_iseither(flag, i) __ac_isempty(flag, i)
#define __ac_set_isdel_false(flag, i) (0)
-#define __ac_set_isempty_false(flag, i) (flag[i>>5]&=~(1ul<<(i&0x1fU)))
-#define __ac_set_isempty_true(flag, i) (flag[i>>5]|=(1ul<<(i&0x1fU)))
+#define __ac_set_isempty_false(flag, i) (flag[i >> 5] &= ~(1ul << (i & 0x1fU)))
+#define __ac_set_isempty_true(flag, i) (flag[i >> 5] |= (1ul << (i & 0x1fU)))
#define __ac_set_isboth_false(flag, i) __ac_set_isempty_false(flag, i)
#define __ac_set_isdel_true(flag, i) ((void)0)
-
-// specializations of https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp
-khuint32_t PANDAS_INLINE murmur2_32to32(khuint32_t k){
- const khuint32_t SEED = 0xc70f6907UL;
- // 'm' and 'r' are mixing constants generated offline.
- // They're not really 'magic', they just happen to work well.
- const khuint32_t M_32 = 0x5bd1e995;
- const int R_32 = 24;
-
- // Initialize the hash to a 'random' value
- khuint32_t h = SEED ^ 4;
-
- //handle 4 bytes:
- k *= M_32;
- k ^= k >> R_32;
- k *= M_32;
-
- h *= M_32;
- h ^= k;
-
- // Do a few final mixes of the hash to ensure the "last few
- // bytes" are well-incorporated. (Really needed here?)
- h ^= h >> 13;
- h *= M_32;
- h ^= h >> 15;
- return h;
+// specializations of
+// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp
+static inline khuint32_t murmur2_32to32(khuint32_t k) {
+ const khuint32_t SEED = 0xc70f6907UL;
+ // 'm' and 'r' are mixing constants generated offline.
+ // They're not really 'magic', they just happen to work well.
+ const khuint32_t M_32 = 0x5bd1e995;
+ const int R_32 = 24;
+
+ // Initialize the hash to a 'random' value
+ khuint32_t h = SEED ^ 4;
+
+ // handle 4 bytes:
+ k *= M_32;
+ k ^= k >> R_32;
+ k *= M_32;
+
+ h *= M_32;
+ h ^= k;
+
+ // Do a few final mixes of the hash to ensure the "last few
+ // bytes" are well-incorporated. (Really needed here?)
+ h ^= h >> 13;
+ h *= M_32;
+ h ^= h >> 15;
+ return h;
}
-// it is possible to have a special x64-version, which would need less operations, but
-// using 32bit version always has also some benefits:
+// it is possible to have a special x64-version, which would need less
+// operations, but using 32bit version always has also some benefits:
// - one code for 32bit and 64bit builds
// - the same case for 32bit and 64bit builds
-// - no performance difference could be measured compared to a possible x64-version
-
-khuint32_t PANDAS_INLINE murmur2_32_32to32(khuint32_t k1, khuint32_t k2){
- const khuint32_t SEED = 0xc70f6907UL;
- // 'm' and 'r' are mixing constants generated offline.
- // They're not really 'magic', they just happen to work well.
- const khuint32_t M_32 = 0x5bd1e995;
- const int R_32 = 24;
-
- // Initialize the hash to a 'random' value
- khuint32_t h = SEED ^ 4;
-
- //handle first 4 bytes:
- k1 *= M_32;
- k1 ^= k1 >> R_32;
- k1 *= M_32;
-
- h *= M_32;
- h ^= k1;
-
- //handle second 4 bytes:
- k2 *= M_32;
- k2 ^= k2 >> R_32;
- k2 *= M_32;
-
- h *= M_32;
- h ^= k2;
-
- // Do a few final mixes of the hash to ensure the "last few
- // bytes" are well-incorporated.
- h ^= h >> 13;
- h *= M_32;
- h ^= h >> 15;
- return h;
+// - no performance difference could be measured compared to a possible
+// x64-version
+
+static inline khuint32_t murmur2_32_32to32(khuint32_t k1, khuint32_t k2) {
+ const khuint32_t SEED = 0xc70f6907UL;
+ // 'm' and 'r' are mixing constants generated offline.
+ // They're not really 'magic', they just happen to work well.
+ const khuint32_t M_32 = 0x5bd1e995;
+ const int R_32 = 24;
+
+ // Initialize the hash to a 'random' value
+ khuint32_t h = SEED ^ 4;
+
+ // handle first 4 bytes:
+ k1 *= M_32;
+ k1 ^= k1 >> R_32;
+ k1 *= M_32;
+
+ h *= M_32;
+ h ^= k1;
+
+ // handle second 4 bytes:
+ k2 *= M_32;
+ k2 ^= k2 >> R_32;
+ k2 *= M_32;
+
+ h *= M_32;
+ h ^= k2;
+
+ // Do a few final mixes of the hash to ensure the "last few
+ // bytes" are well-incorporated.
+ h ^= h >> 13;
+ h *= M_32;
+ h ^= h >> 15;
+ return h;
}
-khuint32_t PANDAS_INLINE murmur2_64to32(khuint64_t k){
- khuint32_t k1 = (khuint32_t)k;
- khuint32_t k2 = (khuint32_t)(k >> 32);
+static inline khuint32_t murmur2_64to32(khuint64_t k) {
+ khuint32_t k1 = (khuint32_t)k;
+ khuint32_t k2 = (khuint32_t)(k >> 32);
- return murmur2_32_32to32(k1, k2);
+ return murmur2_32_32to32(k1, k2);
}
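The three murmur2 helpers above are the integer mixers the table uses; the 64-bit variant simply feeds the two 32-bit halves of the key through the same 32-bit core, which is the "one code path for 32-bit and 64-bit builds" point made in the comment. A small sketch calling them directly (include path assumed):

```c
#include <stdio.h>
#include "pandas/vendored/klib/khash.h" // path assumed

int main(void) {
  // The same input always produces the same 32-bit hash; the 64-bit key is
  // mixed as two 32-bit halves by murmur2_64to32.
  const khuint64_t key = 0x123456789abcdef0ULL;
  printf("%lu\n", (unsigned long)murmur2_64to32(key));
  printf("%lu\n", (unsigned long)murmur2_32to32(42u));
  return 0;
}
```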
-
#ifdef KHASH_LINEAR
#define __ac_inc(k, m) 1
#else
#define __ac_inc(k, m) (murmur2_32to32(k) | 1) & (m)
#endif
-#define __ac_fsize(m) ((m) < 32? 1 : (m)>>5)
+#define __ac_fsize(m) ((m) < 32 ? 1 : (m) >> 5)
#ifndef kroundup32
-#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#define kroundup32(x) \
+ (--(x), (x) |= (x) >> 1, (x) |= (x) >> 2, (x) |= (x) >> 4, (x) |= (x) >> 8, \
+ (x) |= (x) >> 16, ++(x))
#endif
static const double __ac_HASH_UPPER = 0.77;
-#define KHASH_DECLARE(name, khkey_t, khval_t) \
- typedef struct { \
- khuint_t n_buckets, size, n_occupied, upper_bound; \
- khuint32_t *flags; \
- khkey_t *keys; \
- khval_t *vals; \
- } kh_##name##_t; \
- extern kh_##name##_t *kh_init_##name(); \
- extern void kh_destroy_##name(kh_##name##_t *h); \
- extern void kh_clear_##name(kh_##name##_t *h); \
- extern khuint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
- extern void kh_resize_##name(kh_##name##_t *h, khuint_t new_n_buckets); \
- extern khuint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
- extern void kh_del_##name(kh_##name##_t *h, khuint_t x);
-
-#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
- typedef struct { \
- khuint_t n_buckets, size, n_occupied, upper_bound; \
- khuint32_t *flags; \
- khkey_t *keys; \
- khval_t *vals; \
- } kh_##name##_t; \
- SCOPE kh_##name##_t *kh_init_##name(void) { \
- return (kh_##name##_t*)KHASH_CALLOC(1, sizeof(kh_##name##_t)); \
- } \
- SCOPE void kh_destroy_##name(kh_##name##_t *h) \
- { \
- if (h) { \
- KHASH_FREE(h->keys); KHASH_FREE(h->flags); \
- KHASH_FREE(h->vals); \
- KHASH_FREE(h); \
- } \
- } \
- SCOPE void kh_clear_##name(kh_##name##_t *h) \
- { \
- if (h && h->flags) { \
- memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khuint32_t)); \
- h->size = h->n_occupied = 0; \
- } \
- } \
- SCOPE khuint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
- { \
- if (h->n_buckets) { \
- khuint_t inc, k, i, last, mask; \
- mask = h->n_buckets - 1; \
- k = __hash_func(key); i = k & mask; \
- inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \
- while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
- i = (i + inc) & mask; \
- if (i == last) return h->n_buckets; \
- } \
- return __ac_iseither(h->flags, i)? h->n_buckets : i; \
- } else return 0; \
- } \
- SCOPE void kh_resize_##name(kh_##name##_t *h, khuint_t new_n_buckets) \
- { /* This function uses 0.25*n_bucktes bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
- khuint32_t *new_flags = 0; \
- khuint_t j = 1; \
- { \
- kroundup32(new_n_buckets); \
- if (new_n_buckets < 4) new_n_buckets = 4; \
- if (h->size >= (khuint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
- else { /* hash table size to be changed (shrink or expand); rehash */ \
- new_flags = (khuint32_t*)KHASH_MALLOC(__ac_fsize(new_n_buckets) * sizeof(khuint32_t)); \
- memset(new_flags, 0xff, __ac_fsize(new_n_buckets) * sizeof(khuint32_t)); \
- if (h->n_buckets < new_n_buckets) { /* expand */ \
- h->keys = (khkey_t*)KHASH_REALLOC(h->keys, new_n_buckets * sizeof(khkey_t)); \
- if (kh_is_map) h->vals = (khval_t*)KHASH_REALLOC(h->vals, new_n_buckets * sizeof(khval_t)); \
- } /* otherwise shrink */ \
- } \
- } \
- if (j) { /* rehashing is needed */ \
- for (j = 0; j != h->n_buckets; ++j) { \
- if (__ac_iseither(h->flags, j) == 0) { \
- khkey_t key = h->keys[j]; \
- khval_t val; \
- khuint_t new_mask; \
- new_mask = new_n_buckets - 1; \
- if (kh_is_map) val = h->vals[j]; \
- __ac_set_isempty_true(h->flags, j); \
- while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
- khuint_t inc, k, i; \
- k = __hash_func(key); \
- i = k & new_mask; \
- inc = __ac_inc(k, new_mask); \
- while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \
- __ac_set_isempty_false(new_flags, i); \
- if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
- { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
- if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
- __ac_set_isempty_true(h->flags, i); /* mark it as deleted in the old hash table */ \
- } else { /* write the element and jump out of the loop */ \
- h->keys[i] = key; \
- if (kh_is_map) h->vals[i] = val; \
- break; \
- } \
- } \
- } \
- } \
- if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
- h->keys = (khkey_t*)KHASH_REALLOC(h->keys, new_n_buckets * sizeof(khkey_t)); \
- if (kh_is_map) h->vals = (khval_t*)KHASH_REALLOC(h->vals, new_n_buckets * sizeof(khval_t)); \
- } \
- KHASH_FREE(h->flags); /* free the working space */ \
- h->flags = new_flags; \
- h->n_buckets = new_n_buckets; \
- h->n_occupied = h->size; \
- h->upper_bound = (khuint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
- } \
- } \
- SCOPE khuint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
- { \
- khuint_t x; \
- if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
- if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); /* clear "deleted" elements */ \
- else kh_resize_##name(h, h->n_buckets + 1); /* expand the hash table */ \
- } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
- { \
- khuint_t inc, k, i, site, last, mask = h->n_buckets - 1; \
- x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
- if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
- else { \
- inc = __ac_inc(k, mask); last = i; \
- while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
- if (__ac_isdel(h->flags, i)) site = i; \
- i = (i + inc) & mask; \
- if (i == last) { x = site; break; } \
- } \
- if (x == h->n_buckets) { \
- if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
- else x = i; \
- } \
- } \
- } \
- if (__ac_isempty(h->flags, x)) { /* not present at all */ \
- h->keys[x] = key; \
- __ac_set_isboth_false(h->flags, x); \
- ++h->size; ++h->n_occupied; \
- *ret = 1; \
- } else if (__ac_isdel(h->flags, x)) { /* deleted */ \
- h->keys[x] = key; \
- __ac_set_isboth_false(h->flags, x); \
- ++h->size; \
- *ret = 2; \
- } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
- return x; \
- } \
- SCOPE void kh_del_##name(kh_##name##_t *h, khuint_t x) \
- { \
- if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
- __ac_set_isdel_true(h->flags, x); \
- --h->size; \
- } \
- }
-
-#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
- KHASH_INIT2(name, PANDAS_INLINE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+#define KHASH_DECLARE(name, khkey_t, khval_t) \
+ typedef struct { \
+ khuint_t n_buckets, size, n_occupied, upper_bound; \
+ khuint32_t *flags; \
+ khkey_t *keys; \
+ khval_t *vals; \
+ } kh_##name##_t; \
+ extern kh_##name##_t *kh_init_##name(); \
+ extern void kh_destroy_##name(kh_##name##_t *h); \
+ extern void kh_clear_##name(kh_##name##_t *h); \
+ extern khuint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
+ extern void kh_resize_##name(kh_##name##_t *h, khuint_t new_n_buckets); \
+ extern khuint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
+ extern void kh_del_##name(kh_##name##_t *h, khuint_t x);
+
+#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, \
+ __hash_equal) \
+ typedef struct { \
+ khuint_t n_buckets, size, n_occupied, upper_bound; \
+ khuint32_t *flags; \
+ khkey_t *keys; \
+ khval_t *vals; \
+ } kh_##name##_t; \
+ SCOPE kh_##name##_t *kh_init_##name(void) { \
+ return (kh_##name##_t *)KHASH_CALLOC(1, sizeof(kh_##name##_t)); \
+ } \
+ SCOPE void kh_destroy_##name(kh_##name##_t *h) { \
+ if (h) { \
+ KHASH_FREE(h->keys); \
+ KHASH_FREE(h->flags); \
+ KHASH_FREE(h->vals); \
+ KHASH_FREE(h); \
+ } \
+ } \
+ SCOPE void kh_clear_##name(kh_##name##_t *h) { \
+ if (h && h->flags) { \
+ memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khuint32_t)); \
+ h->size = h->n_occupied = 0; \
+ } \
+ } \
+ SCOPE khuint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) { \
+ if (h->n_buckets) { \
+ khuint_t inc, k, i, last, mask; \
+ mask = h->n_buckets - 1; \
+ k = __hash_func(key); \
+ i = k & mask; \
+ inc = __ac_inc(k, mask); \
+ last = i; /* inc==1 for linear probing */ \
+ while (!__ac_isempty(h->flags, i) && \
+ (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+ i = (i + inc) & mask; \
+ if (i == last) \
+ return h->n_buckets; \
+ } \
+ return __ac_iseither(h->flags, i) ? h->n_buckets : i; \
+ } else \
+ return 0; \
+ } \
+ SCOPE void kh_resize_##name( \
+ kh_##name##_t *h, \
+ khuint_t new_n_buckets) { /* This function uses 0.25*n_bucktes bytes of \
+ working space instead of \
+ [sizeof(key_t+val_t)+.25]*n_buckets. */ \
+ khuint32_t *new_flags = 0; \
+ khuint_t j = 1; \
+ { \
+ kroundup32(new_n_buckets); \
+ if (new_n_buckets < 4) \
+ new_n_buckets = 4; \
+ if (h->size >= (khuint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) \
+ j = 0; /* requested size is too small */ \
+ else { /* hash table size to be changed (shrink or expand); rehash */ \
+ new_flags = (khuint32_t *)KHASH_MALLOC(__ac_fsize(new_n_buckets) * \
+ sizeof(khuint32_t)); \
+ memset(new_flags, 0xff, \
+ __ac_fsize(new_n_buckets) * sizeof(khuint32_t)); \
+ if (h->n_buckets < new_n_buckets) { /* expand */ \
+ h->keys = (khkey_t *)KHASH_REALLOC(h->keys, \
+ new_n_buckets * sizeof(khkey_t)); \
+ if (kh_is_map) \
+ h->vals = (khval_t *)KHASH_REALLOC(h->vals, new_n_buckets * \
+ sizeof(khval_t)); \
+ } /* otherwise shrink */ \
+ } \
+ } \
+ if (j) { /* rehashing is needed */ \
+ for (j = 0; j != h->n_buckets; ++j) { \
+ if (__ac_iseither(h->flags, j) == 0) { \
+ khkey_t key = h->keys[j]; \
+ khval_t val; \
+ khuint_t new_mask; \
+ new_mask = new_n_buckets - 1; \
+ if (kh_is_map) \
+ val = h->vals[j]; \
+ __ac_set_isempty_true(h->flags, j); \
+ while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
+ khuint_t inc, k, i; \
+ k = __hash_func(key); \
+ i = k & new_mask; \
+ inc = __ac_inc(k, new_mask); \
+ while (!__ac_isempty(new_flags, i)) \
+ i = (i + inc) & new_mask; \
+ __ac_set_isempty_false(new_flags, i); \
+ if (i < h->n_buckets && \
+ __ac_iseither(h->flags, i) == \
+ 0) { /* kick out the existing element */ \
+ { \
+ khkey_t tmp = h->keys[i]; \
+ h->keys[i] = key; \
+ key = tmp; \
+ } \
+ if (kh_is_map) { \
+ khval_t tmp = h->vals[i]; \
+ h->vals[i] = val; \
+ val = tmp; \
+ } \
+ __ac_set_isempty_true( \
+ h->flags, i); /* mark it as deleted in the old hash table */ \
+ } else { /* write the element and jump out of the loop */ \
+ h->keys[i] = key; \
+ if (kh_is_map) \
+ h->vals[i] = val; \
+ break; \
+ } \
+ } \
+ } \
+ } \
+ if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
+ h->keys = (khkey_t *)KHASH_REALLOC(h->keys, \
+ new_n_buckets * sizeof(khkey_t)); \
+ if (kh_is_map) \
+ h->vals = (khval_t *)KHASH_REALLOC(h->vals, \
+ new_n_buckets * sizeof(khval_t)); \
+ } \
+ KHASH_FREE(h->flags); /* free the working space */ \
+ h->flags = new_flags; \
+ h->n_buckets = new_n_buckets; \
+ h->n_occupied = h->size; \
+ h->upper_bound = (khuint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
+ } \
+ } \
+ SCOPE khuint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) { \
+ khuint_t x; \
+ if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
+ if (h->n_buckets > (h->size << 1)) \
+ kh_resize_##name(h, h->n_buckets - 1); /* clear "deleted" elements */ \
+ else \
+ kh_resize_##name(h, h->n_buckets + 1); /* expand the hash table */ \
+ } /* TODO: to implement automatically shrinking; resize() already support \
+ shrinking */ \
+ { \
+ khuint_t inc, k, i, site, last, mask = h->n_buckets - 1; \
+ x = site = h->n_buckets; \
+ k = __hash_func(key); \
+ i = k & mask; \
+ if (__ac_isempty(h->flags, i)) \
+ x = i; /* for speed up */ \
+ else { \
+ inc = __ac_inc(k, mask); \
+ last = i; \
+ while (!__ac_isempty(h->flags, i) && \
+ (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+ if (__ac_isdel(h->flags, i)) \
+ site = i; \
+ i = (i + inc) & mask; \
+ if (i == last) { \
+ x = site; \
+ break; \
+ } \
+ } \
+ if (x == h->n_buckets) { \
+ if (__ac_isempty(h->flags, i) && site != h->n_buckets) \
+ x = site; \
+ else \
+ x = i; \
+ } \
+ } \
+ } \
+ if (__ac_isempty(h->flags, x)) { /* not present at all */ \
+ h->keys[x] = key; \
+ __ac_set_isboth_false(h->flags, x); \
+ ++h->size; \
+ ++h->n_occupied; \
+ *ret = 1; \
+ } else if (__ac_isdel(h->flags, x)) { /* deleted */ \
+ h->keys[x] = key; \
+ __ac_set_isboth_false(h->flags, x); \
+ ++h->size; \
+ *ret = 2; \
+ } else \
+ *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
+ return x; \
+ } \
+ SCOPE void kh_del_##name(kh_##name##_t *h, khuint_t x) { \
+ if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
+ __ac_set_isdel_true(h->flags, x); \
+ --h->size; \
+ } \
+ }
+
+#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, \
+ __hash_equal) \
+ KHASH_INIT2(name, static inline, khkey_t, khval_t, kh_is_map, __hash_func, \
+ __hash_equal)
/* --- BEGIN OF HASH FUNCTIONS --- */
@@ -442,9 +464,8 @@ static const double __ac_HASH_UPPER = 0.77;
@param key The integer [khuint64_t]
@return The hash value [khuint_t]
*/
-PANDAS_INLINE khuint_t kh_int64_hash_func(khuint64_t key)
-{
- return (khuint_t)((key)>>33^(key)^(key)<<11);
+static inline khuint_t kh_int64_hash_func(khuint64_t key) {
+ return (khuint_t)((key) >> 33 ^ (key) ^ (key) << 11);
}
/*! @function
@abstract 64-bit integer comparison function
@@ -456,11 +477,12 @@ PANDAS_INLINE khuint_t kh_int64_hash_func(khuint64_t key)
@param s Pointer to a null terminated string
@return The hash value
*/
-PANDAS_INLINE khuint_t __ac_X31_hash_string(const char *s)
-{
- khuint_t h = *s;
- if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s;
- return h;
+static inline khuint_t __ac_X31_hash_string(const char *s) {
+ khuint_t h = *s;
+ if (h)
+ for (++s; *s; ++s)
+ h = (h << 5) - h + *s;
+ return h;
}
/*! @function
@abstract Another interface to const char* hash function
@@ -473,15 +495,14 @@ PANDAS_INLINE khuint_t __ac_X31_hash_string(const char *s)
*/
#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
-PANDAS_INLINE khuint_t __ac_Wang_hash(khuint_t key)
-{
- key += ~(key << 15);
- key ^= (key >> 10);
- key += (key << 3);
- key ^= (key >> 6);
- key += ~(key << 11);
- key ^= (key >> 16);
- return key;
+static inline khuint_t __ac_Wang_hash(khuint_t key) {
+ key += ~(key << 15);
+ key ^= (key >> 10);
+ key += (key << 3);
+ key ^= (key >> 6);
+ key += ~(key << 11);
+ key ^= (key >> 16);
+ return key;
}
#define kh_int_hash_func2(k) __ac_Wang_hash((khuint_t)key)
@@ -531,7 +552,7 @@ PANDAS_INLINE khuint_t __ac_Wang_hash(khuint_t key)
@param k Key [type of keys]
@param r Extra return code: 0 if the key is present in the hash table;
1 if the bucket is empty (never used); 2 if the element in
- the bucket has been deleted [int*]
+ the bucket has been deleted [int*]
@return Iterator to the inserted element [khuint_t]
*/
#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
@@ -541,7 +562,8 @@ PANDAS_INLINE khuint_t __ac_Wang_hash(khuint_t key)
@param name Name of the hash table [symbol]
@param h Pointer to the hash table [khash_t(name)*]
@param k Key [type of keys]
- @return Iterator to the found element, or kh_end(h) is the element is absent [khuint_t]
+   @return Iterator to the found element, or kh_end(h) if the element is
+           absent [khuint_t]
*/
#define kh_get(name, h, k) kh_get_##name(h, k)
@@ -617,81 +639,80 @@ PANDAS_INLINE khuint_t __ac_Wang_hash(khuint_t key)
@abstract Instantiate a hash set containing integer keys
@param name Name of the hash table [symbol]
*/
-#define KHASH_SET_INIT_INT(name) \
- KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+#define KHASH_SET_INIT_INT(name) \
+ KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
/*! @function
@abstract Instantiate a hash map containing integer keys
@param name Name of the hash table [symbol]
@param khval_t Type of values [type]
*/
-#define KHASH_MAP_INIT_INT(name, khval_t) \
- KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+#define KHASH_MAP_INIT_INT(name, khval_t) \
+ KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
-#define KHASH_MAP_INIT_UINT(name, khval_t) \
- KHASH_INIT(name, khuint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+#define KHASH_MAP_INIT_UINT(name, khval_t) \
+ KHASH_INIT(name, khuint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
/*! @function
@abstract Instantiate a hash map containing 64-bit integer keys
@param name Name of the hash table [symbol]
*/
-#define KHASH_SET_INIT_UINT64(name) \
- KHASH_INIT(name, khuint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
+#define KHASH_SET_INIT_UINT64(name) \
+ KHASH_INIT(name, khuint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
-#define KHASH_SET_INIT_INT64(name) \
- KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
+#define KHASH_SET_INIT_INT64(name) \
+ KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
/*! @function
@abstract Instantiate a hash map containing 64-bit integer keys
@param name Name of the hash table [symbol]
@param khval_t Type of values [type]
*/
-#define KHASH_MAP_INIT_UINT64(name, khval_t) \
- KHASH_INIT(name, khuint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
+#define KHASH_MAP_INIT_UINT64(name, khval_t) \
+ KHASH_INIT(name, khuint64_t, khval_t, 1, kh_int64_hash_func, \
+ kh_int64_hash_equal)
-#define KHASH_MAP_INIT_INT64(name, khval_t) \
- KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
+#define KHASH_MAP_INIT_INT64(name, khval_t) \
+ KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, \
+ kh_int64_hash_equal)
/*! @function
@abstract Instantiate a hash map containing 16bit-integer keys
@param name Name of the hash table [symbol]
@param khval_t Type of values [type]
*/
-#define KHASH_MAP_INIT_INT16(name, khval_t) \
- KHASH_INIT(name, khint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+#define KHASH_MAP_INIT_INT16(name, khval_t) \
+ KHASH_INIT(name, khint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
-#define KHASH_MAP_INIT_UINT16(name, khval_t) \
- KHASH_INIT(name, khuint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+#define KHASH_MAP_INIT_UINT16(name, khval_t) \
+ KHASH_INIT(name, khuint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
/*! @function
@abstract Instantiate a hash map containing 8bit-integer keys
@param name Name of the hash table [symbol]
@param khval_t Type of values [type]
*/
-#define KHASH_MAP_INIT_INT8(name, khval_t) \
- KHASH_INIT(name, khint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
-
-#define KHASH_MAP_INIT_UINT8(name, khval_t) \
- KHASH_INIT(name, khuint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
-
+#define KHASH_MAP_INIT_INT8(name, khval_t) \
+ KHASH_INIT(name, khint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+#define KHASH_MAP_INIT_UINT8(name, khval_t) \
+ KHASH_INIT(name, khuint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
typedef const char *kh_cstr_t;
/*! @function
@abstract Instantiate a hash map containing const char* keys
@param name Name of the hash table [symbol]
*/
-#define KHASH_SET_INIT_STR(name) \
- KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
+#define KHASH_SET_INIT_STR(name) \
+ KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
/*! @function
@abstract Instantiate a hash map containing const char* keys
@param name Name of the hash table [symbol]
@param khval_t Type of values [type]
*/
-#define KHASH_MAP_INIT_STR(name, khval_t) \
- KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
-
+#define KHASH_MAP_INIT_STR(name, khval_t) \
+ KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
#define kh_exist_str(h, k) (kh_exist(h, k))
#define kh_exist_float64(h, k) (kh_exist(h, k))
@@ -715,5 +736,4 @@ KHASH_MAP_INIT_UINT16(uint16, size_t)
KHASH_MAP_INIT_INT8(int8, size_t)
KHASH_MAP_INIT_UINT8(uint8, size_t)
-
#endif /* __AC_KHASH_H */
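For orientation, the KHASH_* macros above stamp out a complete open-addressing hash table per key/value type; the calls below follow the same pattern as the example in the header's own comment block. A minimal sketch with a throwaway instantiation (the `demo` name is invented here; include path assumed):

```c
#include <stdio.h>
#include "pandas/vendored/klib/khash.h" // path assumed

KHASH_MAP_INIT_INT64(demo, double) // hypothetical int64 -> double map

int main(void) {
  int ret;
  khiter_t k;
  kh_demo_t *h = kh_init(demo);
  k = kh_put(demo, h, 42, &ret); // ret is 1 when the key is new
  kh_value(h, k) = 3.5;
  k = kh_get(demo, h, 42);
  if (k != kh_end(h))
    printf("42 -> %g\n", kh_value(h, k));
  kh_del(demo, h, k);
  kh_destroy(demo, h);
  return 0;
}
```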
diff --git a/pandas/_libs/include/pandas/vendored/klib/khash_python.h b/pandas/_libs/include/pandas/vendored/klib/khash_python.h
index 76b212a4b4660..5a933b45d9e21 100644
--- a/pandas/_libs/include/pandas/vendored/klib/khash_python.h
+++ b/pandas/_libs/include/pandas/vendored/klib/khash_python.h
@@ -1,18 +1,17 @@
-#include
-#include
+// Licence at LICENSES/KLIB_LICENSE
+#include
+#include
typedef struct {
- float real;
- float imag;
+ float real;
+ float imag;
} khcomplex64_t;
typedef struct {
- double real;
- double imag;
+ double real;
+ double imag;
} khcomplex128_t;
-
-
// khash should report usage to tracemalloc
#if PY_VERSION_HEX >= 0x03060000
#include
@@ -25,43 +24,41 @@ typedef struct {
#define PyTraceMalloc_Untrack(...)
#endif
-
static const int KHASH_TRACE_DOMAIN = 424242;
-void *traced_malloc(size_t size){
- void * ptr = malloc(size);
- if(ptr!=NULL){
- PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size);
- }
- return ptr;
+void *traced_malloc(size_t size) {
+ void *ptr = malloc(size);
+ if (ptr != NULL) {
+ PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size);
+ }
+ return ptr;
}
-void *traced_calloc(size_t num, size_t size){
- void * ptr = calloc(num, size);
- if(ptr!=NULL){
- PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, num*size);
- }
- return ptr;
+void *traced_calloc(size_t num, size_t size) {
+ void *ptr = calloc(num, size);
+ if (ptr != NULL) {
+ PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, num * size);
+ }
+ return ptr;
}
-void *traced_realloc(void* old_ptr, size_t size){
- void * ptr = realloc(old_ptr, size);
- if(ptr!=NULL){
- if(old_ptr != ptr){
- PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)old_ptr);
- }
- PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size);
+void *traced_realloc(void *old_ptr, size_t size) {
+ void *ptr = realloc(old_ptr, size);
+ if (ptr != NULL) {
+ if (old_ptr != ptr) {
+ PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)old_ptr);
}
- return ptr;
+ PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size);
+ }
+ return ptr;
}
-void traced_free(void* ptr){
- if(ptr!=NULL){
- PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)ptr);
- }
- free(ptr);
+void traced_free(void *ptr) {
+ if (ptr != NULL) {
+ PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)ptr);
+ }
+ free(ptr);
}
-
#define KHASH_MALLOC traced_malloc
#define KHASH_REALLOC traced_realloc
#define KHASH_CALLOC traced_calloc
@@ -72,327 +69,295 @@ void traced_free(void* ptr){
// python 2.7 https://github.com/python/cpython/blob/2.7/Objects/object.c#L1021
// python 3.5 https://github.com/python/cpython/blob/3.5/Python/pyhash.c#L85
-// The python 3 hash function has the invariant hash(x) == hash(int(x)) == hash(decimal(x))
-// and the size of hash may be different by platform / version (long in py2, Py_ssize_t in py3).
-// We don't need those invariants because types will be cast before hashing, and if Py_ssize_t
-// is 64 bits the truncation causes collision issues. Given all that, we use our own
-// simple hash, viewing the double bytes as an int64 and using khash's default
-// hash for 64 bit integers.
-// GH 13436 showed that _Py_HashDouble doesn't work well with khash
-// GH 28303 showed, that the simple xoring-version isn't good enough
-// See GH 36729 for evaluation of the currently used murmur2-hash version
-// An interesting alternative to expensive murmur2-hash would be to change
-// the probing strategy and use e.g. the probing strategy from CPython's
+// The python 3 hash function has the invariant hash(x) == hash(int(x)) ==
+// hash(decimal(x)) and the size of hash may be different by platform / version
+// (long in py2, Py_ssize_t in py3). We don't need those invariants because
+// types will be cast before hashing, and if Py_ssize_t is 64 bits the
+// truncation causes collision issues. Given all that, we use our own simple
+// hash, viewing the double bytes as an int64 and using khash's default hash for
+// 64 bit integers. GH 13436 showed that _Py_HashDouble doesn't work well with
+// khash GH 28303 showed, that the simple xoring-version isn't good enough See
+// GH 36729 for evaluation of the currently used murmur2-hash version An
+// interesting alternative to expensive murmur2-hash would be to change the
+// probing strategy and use e.g. the probing strategy from CPython's
// implementation of dicts, which shines for smaller sizes but is more
// predisposed to superlinear running times (see GH 36729 for comparison)
-
-khuint64_t PANDAS_INLINE asuint64(double key) {
- khuint64_t val;
- memcpy(&val, &key, sizeof(double));
- return val;
+static inline khuint64_t asuint64(double key) {
+ khuint64_t val;
+ memcpy(&val, &key, sizeof(double));
+ return val;
}
-khuint32_t PANDAS_INLINE asuint32(float key) {
- khuint32_t val;
- memcpy(&val, &key, sizeof(float));
- return val;
+static inline khuint32_t asuint32(float key) {
+ khuint32_t val;
+ memcpy(&val, &key, sizeof(float));
+ return val;
}
#define ZERO_HASH 0
-#define NAN_HASH 0
-
-khuint32_t PANDAS_INLINE kh_float64_hash_func(double val){
- // 0.0 and -0.0 should have the same hash:
- if (val == 0.0){
- return ZERO_HASH;
- }
- // all nans should have the same hash:
- if ( val!=val ){
- return NAN_HASH;
- }
- khuint64_t as_int = asuint64(val);
- return murmur2_64to32(as_int);
+#define NAN_HASH 0
+
+static inline khuint32_t kh_float64_hash_func(double val) {
+ // 0.0 and -0.0 should have the same hash:
+ if (val == 0.0) {
+ return ZERO_HASH;
+ }
+ // all nans should have the same hash:
+ if (val != val) {
+ return NAN_HASH;
+ }
+ khuint64_t as_int = asuint64(val);
+ return murmur2_64to32(as_int);
}
-khuint32_t PANDAS_INLINE kh_float32_hash_func(float val){
- // 0.0 and -0.0 should have the same hash:
- if (val == 0.0f){
- return ZERO_HASH;
- }
- // all nans should have the same hash:
- if ( val!=val ){
- return NAN_HASH;
- }
- khuint32_t as_int = asuint32(val);
- return murmur2_32to32(as_int);
+static inline khuint32_t kh_float32_hash_func(float val) {
+ // 0.0 and -0.0 should have the same hash:
+ if (val == 0.0f) {
+ return ZERO_HASH;
+ }
+ // all nans should have the same hash:
+ if (val != val) {
+ return NAN_HASH;
+ }
+ khuint32_t as_int = asuint32(val);
+ return murmur2_32to32(as_int);
}
#define kh_floats_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a)))
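kh_float64_hash_func/kh_float32_hash_func plus kh_floats_hash_equal encode the float-key semantics pandas wants: +0.0 and -0.0 collapse to one key, and every NaN payload collapses to one key, so values that compare equal under this definition always land in the same bucket. A self-contained restatement of the invariant (not the pandas functions themselves; the final mix is a placeholder rather than murmur2):

```c
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Sketch: same guards as kh_float64_hash_func above, with a toy mixer.
static uint32_t float64_hash_sketch(double val) {
  if (val == 0.0)
    return 0; // +0.0 == -0.0, so they must share a hash
  if (val != val)
    return 0; // every NaN shares a hash
  uint64_t bits;
  memcpy(&bits, &val, sizeof(double)); // the asuint64 trick from above
  return (uint32_t)(bits ^ (bits >> 32)); // placeholder, not murmur2
}

int main(void) {
  printf("%u %u\n", (unsigned)float64_hash_sketch(0.0),
         (unsigned)float64_hash_sketch(-0.0));
  printf("%u %u\n", (unsigned)float64_hash_sketch(nan("")),
         (unsigned)float64_hash_sketch(nan("1")));
  return 0;
}
```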
-#define KHASH_MAP_INIT_FLOAT64(name, khval_t) \
- KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, kh_floats_hash_equal)
+#define KHASH_MAP_INIT_FLOAT64(name, khval_t) \
+ KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, \
+ kh_floats_hash_equal)
KHASH_MAP_INIT_FLOAT64(float64, size_t)
-#define KHASH_MAP_INIT_FLOAT32(name, khval_t) \
- KHASH_INIT(name, khfloat32_t, khval_t, 1, kh_float32_hash_func, kh_floats_hash_equal)
+#define KHASH_MAP_INIT_FLOAT32(name, khval_t) \
+ KHASH_INIT(name, khfloat32_t, khval_t, 1, kh_float32_hash_func, \
+ kh_floats_hash_equal)
KHASH_MAP_INIT_FLOAT32(float32, size_t)
-khint32_t PANDAS_INLINE kh_complex128_hash_func(khcomplex128_t val){
- return kh_float64_hash_func(val.real)^kh_float64_hash_func(val.imag);
+static inline khint32_t kh_complex128_hash_func(khcomplex128_t val) {
+ return kh_float64_hash_func(val.real) ^ kh_float64_hash_func(val.imag);
}
-khint32_t PANDAS_INLINE kh_complex64_hash_func(khcomplex64_t val){
- return kh_float32_hash_func(val.real)^kh_float32_hash_func(val.imag);
+static inline khint32_t kh_complex64_hash_func(khcomplex64_t val) {
+ return kh_float32_hash_func(val.real) ^ kh_float32_hash_func(val.imag);
}
-#define kh_complex_hash_equal(a, b) \
+#define kh_complex_hash_equal(a, b) \
(kh_floats_hash_equal(a.real, b.real) && kh_floats_hash_equal(a.imag, b.imag))
-
-#define KHASH_MAP_INIT_COMPLEX64(name, khval_t) \
- KHASH_INIT(name, khcomplex64_t, khval_t, 1, kh_complex64_hash_func, kh_complex_hash_equal)
+#define KHASH_MAP_INIT_COMPLEX64(name, khval_t) \
+ KHASH_INIT(name, khcomplex64_t, khval_t, 1, kh_complex64_hash_func, \
+ kh_complex_hash_equal)
KHASH_MAP_INIT_COMPLEX64(complex64, size_t)
-
-#define KHASH_MAP_INIT_COMPLEX128(name, khval_t) \
- KHASH_INIT(name, khcomplex128_t, khval_t, 1, kh_complex128_hash_func, kh_complex_hash_equal)
+#define KHASH_MAP_INIT_COMPLEX128(name, khval_t) \
+ KHASH_INIT(name, khcomplex128_t, khval_t, 1, kh_complex128_hash_func, \
+ kh_complex_hash_equal)
KHASH_MAP_INIT_COMPLEX128(complex128, size_t)
-
#define kh_exist_complex64(h, k) (kh_exist(h, k))
#define kh_exist_complex128(h, k) (kh_exist(h, k))
-
// NaN-floats should be in the same equivalency class, see GH 22119
-int PANDAS_INLINE floatobject_cmp(PyFloatObject* a, PyFloatObject* b){
- return (
- Py_IS_NAN(PyFloat_AS_DOUBLE(a)) &&
- Py_IS_NAN(PyFloat_AS_DOUBLE(b))
- )
- ||
- ( PyFloat_AS_DOUBLE(a) == PyFloat_AS_DOUBLE(b) );
+static inline int floatobject_cmp(PyFloatObject *a, PyFloatObject *b) {
+ return (Py_IS_NAN(PyFloat_AS_DOUBLE(a)) && Py_IS_NAN(PyFloat_AS_DOUBLE(b))) ||
+ (PyFloat_AS_DOUBLE(a) == PyFloat_AS_DOUBLE(b));
}
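floatobject_cmp (and the complex/tuple variants below) exists because the khash equality check must treat NaN keys as identical, whereas CPython's rich comparison says NaN != NaN. A sketch showing the difference using only the CPython C API (requires Python development headers and linking against libpython):

```c
#include <Python.h>

// Sketch: restates the NaN-aware float comparison used above.
static int float_cmp_sketch(PyObject *a, PyObject *b) {
  return (Py_IS_NAN(PyFloat_AS_DOUBLE(a)) && Py_IS_NAN(PyFloat_AS_DOUBLE(b))) ||
         (PyFloat_AS_DOUBLE(a) == PyFloat_AS_DOUBLE(b));
}

int main(void) {
  Py_Initialize();
  PyObject *a = PyFloat_FromDouble(Py_NAN);
  PyObject *b = PyFloat_FromDouble(Py_NAN);
  printf("nan-aware: %d\n", float_cmp_sketch(a, b));                   // 1
  printf("rich compare: %d\n", PyObject_RichCompareBool(a, b, Py_EQ)); // 0
  Py_DECREF(a);
  Py_DECREF(b);
  Py_Finalize();
  return 0;
}
```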
-
// NaNs should be in the same equivalency class, see GH 41836
// PyObject_RichCompareBool for complexobjects has a different behavior
// needs to be replaced
-int PANDAS_INLINE complexobject_cmp(PyComplexObject* a, PyComplexObject* b){
- return (
- Py_IS_NAN(a->cval.real) &&
- Py_IS_NAN(b->cval.real) &&
- Py_IS_NAN(a->cval.imag) &&
- Py_IS_NAN(b->cval.imag)
- )
- ||
- (
- Py_IS_NAN(a->cval.real) &&
- Py_IS_NAN(b->cval.real) &&
- a->cval.imag == b->cval.imag
- )
- ||
- (
- a->cval.real == b->cval.real &&
- Py_IS_NAN(a->cval.imag) &&
- Py_IS_NAN(b->cval.imag)
- )
- ||
- (
- a->cval.real == b->cval.real &&
- a->cval.imag == b->cval.imag
- );
+static inline int complexobject_cmp(PyComplexObject *a, PyComplexObject *b) {
+ return (Py_IS_NAN(a->cval.real) && Py_IS_NAN(b->cval.real) &&
+ Py_IS_NAN(a->cval.imag) && Py_IS_NAN(b->cval.imag)) ||
+ (Py_IS_NAN(a->cval.real) && Py_IS_NAN(b->cval.real) &&
+ a->cval.imag == b->cval.imag) ||
+ (a->cval.real == b->cval.real && Py_IS_NAN(a->cval.imag) &&
+ Py_IS_NAN(b->cval.imag)) ||
+ (a->cval.real == b->cval.real && a->cval.imag == b->cval.imag);
}
-int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b);
-
+static inline int pyobject_cmp(PyObject *a, PyObject *b);
// replacing PyObject_RichCompareBool (NaN!=NaN) with pyobject_cmp (NaN==NaN),
// which treats NaNs as equivalent
// see GH 41836
-int PANDAS_INLINE tupleobject_cmp(PyTupleObject* a, PyTupleObject* b){
- Py_ssize_t i;
+static inline int tupleobject_cmp(PyTupleObject *a, PyTupleObject *b) {
+ Py_ssize_t i;
- if (Py_SIZE(a) != Py_SIZE(b)) {
- return 0;
- }
+ if (Py_SIZE(a) != Py_SIZE(b)) {
+ return 0;
+ }
- for (i = 0; i < Py_SIZE(a); ++i) {
- if (!pyobject_cmp(PyTuple_GET_ITEM(a, i), PyTuple_GET_ITEM(b, i))) {
- return 0;
- }
+ for (i = 0; i < Py_SIZE(a); ++i) {
+ if (!pyobject_cmp(PyTuple_GET_ITEM(a, i), PyTuple_GET_ITEM(b, i))) {
+ return 0;
}
- return 1;
+ }
+ return 1;
}
-
-int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) {
- if (a == b) {
- return 1;
+static inline int pyobject_cmp(PyObject *a, PyObject *b) {
+ if (a == b) {
+ return 1;
+ }
+ if (Py_TYPE(a) == Py_TYPE(b)) {
+ // special handling for some built-in types which could have NaNs
+ // as we would like to have them equivalent, but the usual
+ // PyObject_RichCompareBool would return False
+ if (PyFloat_CheckExact(a)) {
+ return floatobject_cmp((PyFloatObject *)a, (PyFloatObject *)b);
+ }
+ if (PyComplex_CheckExact(a)) {
+ return complexobject_cmp((PyComplexObject *)a, (PyComplexObject *)b);
}
- if (Py_TYPE(a) == Py_TYPE(b)) {
- // special handling for some built-in types which could have NaNs
- // as we would like to have them equivalent, but the usual
- // PyObject_RichCompareBool would return False
- if (PyFloat_CheckExact(a)) {
- return floatobject_cmp((PyFloatObject*)a, (PyFloatObject*)b);
- }
- if (PyComplex_CheckExact(a)) {
- return complexobject_cmp((PyComplexObject*)a, (PyComplexObject*)b);
- }
- if (PyTuple_CheckExact(a)) {
- return tupleobject_cmp((PyTupleObject*)a, (PyTupleObject*)b);
- }
- // frozenset isn't yet supported
+ if (PyTuple_CheckExact(a)) {
+ return tupleobject_cmp((PyTupleObject *)a, (PyTupleObject *)b);
}
+ // frozenset isn't yet supported
+ }
- int result = PyObject_RichCompareBool(a, b, Py_EQ);
- if (result < 0) {
- PyErr_Clear();
- return 0;
- }
- return result;
+ int result = PyObject_RichCompareBool(a, b, Py_EQ);
+ if (result < 0) {
+ PyErr_Clear();
+ return 0;
+ }
+ return result;
}
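The comparators above define the equivalence relation used by pandas' hashtables: all NaNs, including NaNs buried inside complex numbers or tuples, compare equal, which plain `==` would not give (GH 22119, GH 41836). A rough pure-Python sketch of the same semantics, for illustration only and not part of the patch:

    import math

    def pyobject_cmp(a, b):
        # NaN floats, NaN-containing complexes and tuples compare equal here,
        # unlike Python's ==, mirroring the C comparators above
        if a is b:
            return True
        if type(a) is type(b):
            if isinstance(a, float):
                return (math.isnan(a) and math.isnan(b)) or a == b
            if isinstance(a, complex):
                return all(
                    (math.isnan(x) and math.isnan(y)) or x == y
                    for x, y in ((a.real, b.real), (a.imag, b.imag))
                )
            if isinstance(a, tuple):
                return len(a) == len(b) and all(map(pyobject_cmp, a, b))
        return bool(a == b)

    assert pyobject_cmp(float("nan"), float("nan"))            # == would give False
    assert pyobject_cmp((1, float("nan")), (1, float("nan")))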
-
-Py_hash_t PANDAS_INLINE _Pandas_HashDouble(double val) {
- //Since Python3.10, nan is no longer has hash 0
- if (Py_IS_NAN(val)) {
- return 0;
- }
+static inline Py_hash_t _Pandas_HashDouble(double val) {
+  // Since Python 3.10, NaN no longer has hash 0
+ if (Py_IS_NAN(val)) {
+ return 0;
+ }
#if PY_VERSION_HEX < 0x030A0000
- return _Py_HashDouble(val);
+ return _Py_HashDouble(val);
#else
- return _Py_HashDouble(NULL, val);
+ return _Py_HashDouble(NULL, val);
#endif
}
-
-Py_hash_t PANDAS_INLINE floatobject_hash(PyFloatObject* key) {
- return _Pandas_HashDouble(PyFloat_AS_DOUBLE(key));
+static inline Py_hash_t floatobject_hash(PyFloatObject *key) {
+ return _Pandas_HashDouble(PyFloat_AS_DOUBLE(key));
}
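_Pandas_HashDouble pins every NaN to hash 0 because, from Python 3.10 on, _Py_HashDouble also takes the object pointer and distinct NaN objects generally get distinct hashes; equivalent NaN keys would otherwise scatter across buckets. An illustrative Python rendering (exact hash values are interpreter-dependent):

    import math

    def pandas_hash_double(val):
        # mirrors _Pandas_HashDouble: every NaN hashes to 0
        return 0 if math.isnan(val) else hash(val)

    a, b = float("nan"), float("nan")
    # on CPython >= 3.10, hash(a) and hash(b) usually differ (id-based),
    # while the NaN-pinned hash keeps them in one bucket:
    assert pandas_hash_double(a) == pandas_hash_double(b) == 0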
-
#define _PandasHASH_IMAG 1000003UL
// replaces _Py_HashDouble with _Pandas_HashDouble
-Py_hash_t PANDAS_INLINE complexobject_hash(PyComplexObject* key) {
- Py_uhash_t realhash = (Py_uhash_t)_Pandas_HashDouble(key->cval.real);
- Py_uhash_t imaghash = (Py_uhash_t)_Pandas_HashDouble(key->cval.imag);
- if (realhash == (Py_uhash_t)-1 || imaghash == (Py_uhash_t)-1) {
- return -1;
- }
- Py_uhash_t combined = realhash + _PandasHASH_IMAG * imaghash;
- if (combined == (Py_uhash_t)-1) {
- return -2;
- }
- return (Py_hash_t)combined;
+static inline Py_hash_t complexobject_hash(PyComplexObject *key) {
+ Py_uhash_t realhash = (Py_uhash_t)_Pandas_HashDouble(key->cval.real);
+ Py_uhash_t imaghash = (Py_uhash_t)_Pandas_HashDouble(key->cval.imag);
+ if (realhash == (Py_uhash_t)-1 || imaghash == (Py_uhash_t)-1) {
+ return -1;
+ }
+ Py_uhash_t combined = realhash + _PandasHASH_IMAG * imaghash;
+ if (combined == (Py_uhash_t)-1) {
+ return -2;
+ }
+ return (Py_hash_t)combined;
}
+static inline khuint32_t kh_python_hash_func(PyObject *key);
-khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key);
-
-//we could use any hashing algorithm, this is the original CPython's for tuples
+// we could use any hashing algorithm; this one is CPython's original tuple hash
#if SIZEOF_PY_UHASH_T > 4
#define _PandasHASH_XXPRIME_1 ((Py_uhash_t)11400714785074694791ULL)
#define _PandasHASH_XXPRIME_2 ((Py_uhash_t)14029467366897019727ULL)
#define _PandasHASH_XXPRIME_5 ((Py_uhash_t)2870177450012600261ULL)
-#define _PandasHASH_XXROTATE(x) ((x << 31) | (x >> 33)) /* Rotate left 31 bits */
+#define _PandasHASH_XXROTATE(x) \
+ ((x << 31) | (x >> 33)) /* Rotate left 31 bits */
#else
#define _PandasHASH_XXPRIME_1 ((Py_uhash_t)2654435761UL)
#define _PandasHASH_XXPRIME_2 ((Py_uhash_t)2246822519UL)
#define _PandasHASH_XXPRIME_5 ((Py_uhash_t)374761393UL)
-#define _PandasHASH_XXROTATE(x) ((x << 13) | (x >> 19)) /* Rotate left 13 bits */
+#define _PandasHASH_XXROTATE(x) \
+ ((x << 13) | (x >> 19)) /* Rotate left 13 bits */
#endif
-Py_hash_t PANDAS_INLINE tupleobject_hash(PyTupleObject* key) {
- Py_ssize_t i, len = Py_SIZE(key);
- PyObject **item = key->ob_item;
-
- Py_uhash_t acc = _PandasHASH_XXPRIME_5;
- for (i = 0; i < len; i++) {
- Py_uhash_t lane = kh_python_hash_func(item[i]);
- if (lane == (Py_uhash_t)-1) {
- return -1;
- }
- acc += lane * _PandasHASH_XXPRIME_2;
- acc = _PandasHASH_XXROTATE(acc);
- acc *= _PandasHASH_XXPRIME_1;
- }
-
- /* Add input length, mangled to keep the historical value of hash(()). */
- acc += len ^ (_PandasHASH_XXPRIME_5 ^ 3527539UL);
+static inline Py_hash_t tupleobject_hash(PyTupleObject *key) {
+ Py_ssize_t i, len = Py_SIZE(key);
+ PyObject **item = key->ob_item;
- if (acc == (Py_uhash_t)-1) {
- return 1546275796;
+ Py_uhash_t acc = _PandasHASH_XXPRIME_5;
+ for (i = 0; i < len; i++) {
+ Py_uhash_t lane = kh_python_hash_func(item[i]);
+ if (lane == (Py_uhash_t)-1) {
+ return -1;
}
- return acc;
+ acc += lane * _PandasHASH_XXPRIME_2;
+ acc = _PandasHASH_XXROTATE(acc);
+ acc *= _PandasHASH_XXPRIME_1;
+ }
+
+ /* Add input length, mangled to keep the historical value of hash(()). */
+ acc += len ^ (_PandasHASH_XXPRIME_5 ^ 3527539UL);
+
+ if (acc == (Py_uhash_t)-1) {
+ return 1546275796;
+ }
+ return acc;
}
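tupleobject_hash keeps CPython's xxhash-style tuple combiner but routes each element through kh_python_hash_func, so NaN-containing elements hash consistently with pyobject_cmp. A rough pure-Python rendering of the 64-bit branch, with the constants copied from the macros above and element_hash standing in for kh_python_hash_func:

    XXPRIME_1 = 11400714785074694791
    XXPRIME_2 = 14029467366897019727
    XXPRIME_5 = 2870177450012600261
    MASK = (1 << 64) - 1

    def rotl31(x):
        return ((x << 31) | (x >> 33)) & MASK

    def tuple_hash(items, element_hash=hash):
        acc = XXPRIME_5
        for item in items:
            lane = element_hash(item) & MASK            # unsigned 64-bit lane
            acc = (acc + lane * XXPRIME_2) & MASK
            acc = (rotl31(acc) * XXPRIME_1) & MASK
        acc = (acc + (len(items) ^ (XXPRIME_5 ^ 3527539))) & MASK
        return 1546275796 if acc == MASK else acc       # (Py_uhash_t)-1 sentinel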
-
-khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key) {
- Py_hash_t hash;
- // For PyObject_Hash holds:
- // hash(0.0) == 0 == hash(-0.0)
- // yet for different nan-objects different hash-values
- // are possible
- if (PyFloat_CheckExact(key)) {
- // we cannot use kh_float64_hash_func
- // because float(k) == k holds for any int-object k
- // and kh_float64_hash_func doesn't respect it
- hash = floatobject_hash((PyFloatObject*)key);
- }
- else if (PyComplex_CheckExact(key)) {
- // we cannot use kh_complex128_hash_func
- // because complex(k,0) == k holds for any int-object k
- // and kh_complex128_hash_func doesn't respect it
- hash = complexobject_hash((PyComplexObject*)key);
- }
- else if (PyTuple_CheckExact(key)) {
- hash = tupleobject_hash((PyTupleObject*)key);
- }
- else {
- hash = PyObject_Hash(key);
- }
-
- if (hash == -1) {
- PyErr_Clear();
- return 0;
- }
- #if SIZEOF_PY_HASH_T == 4
- // it is already 32bit value
- return hash;
- #else
- // for 64bit builds,
- // we need information of the upper 32bits as well
- // see GH 37615
- khuint64_t as_uint = (khuint64_t) hash;
- // uints avoid undefined behavior of signed ints
- return (as_uint>>32)^as_uint;
- #endif
+static inline khuint32_t kh_python_hash_func(PyObject *key) {
+ Py_hash_t hash;
+  // For PyObject_Hash the following holds:
+  //    hash(0.0) == 0 == hash(-0.0)
+  //    yet different nan-objects can have different hash values
+ if (PyFloat_CheckExact(key)) {
+ // we cannot use kh_float64_hash_func
+ // because float(k) == k holds for any int-object k
+ // and kh_float64_hash_func doesn't respect it
+ hash = floatobject_hash((PyFloatObject *)key);
+ } else if (PyComplex_CheckExact(key)) {
+ // we cannot use kh_complex128_hash_func
+ // because complex(k,0) == k holds for any int-object k
+ // and kh_complex128_hash_func doesn't respect it
+ hash = complexobject_hash((PyComplexObject *)key);
+ } else if (PyTuple_CheckExact(key)) {
+ hash = tupleobject_hash((PyTupleObject *)key);
+ } else {
+ hash = PyObject_Hash(key);
+ }
+
+ if (hash == -1) {
+ PyErr_Clear();
+ return 0;
+ }
+#if SIZEOF_PY_HASH_T == 4
+ // it is already 32bit value
+ return hash;
+#else
+ // for 64bit builds,
+ // we need information of the upper 32bits as well
+ // see GH 37615
+ khuint64_t as_uint = (khuint64_t)hash;
+ // uints avoid undefined behavior of signed ints
+ return (as_uint >> 32) ^ as_uint;
+#endif
}
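kh_python_hash_func dispatches to the NaN-aware hashes above and then, on 64-bit builds, folds the Py_hash_t down to khash's 32-bit width so the upper half still influences the bucket (GH 37615). The fold, in Python terms:

    def fold_to_32bit(h):
        as_uint = h & 0xFFFFFFFFFFFFFFFF       # treat Py_hash_t as unsigned 64-bit
        return ((as_uint >> 32) ^ as_uint) & 0xFFFFFFFF

    print(fold_to_32bit(hash("pandas")))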
-
#define kh_python_hash_equal(a, b) (pyobject_cmp(a, b))
-
// Python object
-typedef PyObject* kh_pyobject_t;
+typedef PyObject *kh_pyobject_t;
-#define KHASH_MAP_INIT_PYOBJECT(name, khval_t) \
- KHASH_INIT(name, kh_pyobject_t, khval_t, 1, \
- kh_python_hash_func, kh_python_hash_equal)
+#define KHASH_MAP_INIT_PYOBJECT(name, khval_t) \
+ KHASH_INIT(name, kh_pyobject_t, khval_t, 1, kh_python_hash_func, \
+ kh_python_hash_equal)
KHASH_MAP_INIT_PYOBJECT(pymap, Py_ssize_t)
-#define KHASH_SET_INIT_PYOBJECT(name) \
- KHASH_INIT(name, kh_pyobject_t, char, 0, \
- kh_python_hash_func, kh_python_hash_equal)
+#define KHASH_SET_INIT_PYOBJECT(name) \
+ KHASH_INIT(name, kh_pyobject_t, char, 0, kh_python_hash_func, \
+ kh_python_hash_equal)
KHASH_SET_INIT_PYOBJECT(pyset)
@@ -402,49 +367,52 @@ KHASH_SET_INIT_PYOBJECT(pyset)
KHASH_MAP_INIT_STR(strbox, kh_pyobject_t)
typedef struct {
- kh_str_t *table;
- int starts[256];
+ kh_str_t *table;
+ int starts[256];
} kh_str_starts_t;
-typedef kh_str_starts_t* p_kh_str_starts_t;
+typedef kh_str_starts_t *p_kh_str_starts_t;
-p_kh_str_starts_t PANDAS_INLINE kh_init_str_starts(void) {
- kh_str_starts_t *result = (kh_str_starts_t*)KHASH_CALLOC(1, sizeof(kh_str_starts_t));
- result->table = kh_init_str();
- return result;
+static inline p_kh_str_starts_t kh_init_str_starts(void) {
+ kh_str_starts_t *result =
+ (kh_str_starts_t *)KHASH_CALLOC(1, sizeof(kh_str_starts_t));
+ result->table = kh_init_str();
+ return result;
}
-khuint_t PANDAS_INLINE kh_put_str_starts_item(kh_str_starts_t* table, char* key, int* ret) {
- khuint_t result = kh_put_str(table->table, key, ret);
- if (*ret != 0) {
- table->starts[(unsigned char)key[0]] = 1;
- }
- return result;
+static inline khuint_t kh_put_str_starts_item(kh_str_starts_t *table, char *key,
+ int *ret) {
+ khuint_t result = kh_put_str(table->table, key, ret);
+ if (*ret != 0) {
+ table->starts[(unsigned char)key[0]] = 1;
+ }
+ return result;
}
-khuint_t PANDAS_INLINE kh_get_str_starts_item(const kh_str_starts_t* table, const char* key) {
- unsigned char ch = *key;
- if (table->starts[ch]) {
- if (ch == '\0' || kh_get_str(table->table, key) != table->table->n_buckets) return 1;
- }
- return 0;
+static inline khuint_t kh_get_str_starts_item(const kh_str_starts_t *table,
+ const char *key) {
+ unsigned char ch = *key;
+ if (table->starts[ch]) {
+ if (ch == '\0' || kh_get_str(table->table, key) != table->table->n_buckets)
+ return 1;
+ }
+ return 0;
}
-void PANDAS_INLINE kh_destroy_str_starts(kh_str_starts_t* table) {
- kh_destroy_str(table->table);
- KHASH_FREE(table);
+static inline void kh_destroy_str_starts(kh_str_starts_t *table) {
+ kh_destroy_str(table->table);
+ KHASH_FREE(table);
}
-void PANDAS_INLINE kh_resize_str_starts(kh_str_starts_t* table, khuint_t val) {
- kh_resize_str(table->table, val);
+static inline void kh_resize_str_starts(kh_str_starts_t *table, khuint_t val) {
+ kh_resize_str(table->table, val);
}
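kh_str_starts pairs a khash string table with a 256-slot array flagging which first bytes occur among the stored keys, letting kh_get_str_starts_item reject most misses on the first character before doing any hashing. A toy Python equivalent (class and method names here are illustrative only):

    class StrStartsTable:
        def __init__(self):
            self._table = set()
            self._starts = [False] * 256

        def put(self, key: str) -> None:
            if key not in self._table:
                self._table.add(key)
                self._starts[ord(key[0]) if key else 0] = True

        def get(self, key: str) -> bool:
            ch = ord(key[0]) if key else 0
            if not self._starts[ch]:          # cheap first-byte prefilter
                return False
            return key == "" or key in self._table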
// utility function: given the number of elements
// returns number of necessary buckets
-khuint_t PANDAS_INLINE kh_needed_n_buckets(khuint_t n_elements){
- khuint_t candidate = n_elements;
- kroundup32(candidate);
- khuint_t upper_bound = (khuint_t)(candidate * __ac_HASH_UPPER + 0.5);
- return (upper_bound < n_elements) ? 2*candidate : candidate;
-
+static inline khuint_t kh_needed_n_buckets(khuint_t n_elements) {
+ khuint_t candidate = n_elements;
+ kroundup32(candidate);
+ khuint_t upper_bound = (khuint_t)(candidate * __ac_HASH_UPPER + 0.5);
+ return (upper_bound < n_elements) ? 2 * candidate : candidate;
}
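kh_needed_n_buckets rounds the element count up to a power of two (kroundup32) and doubles once more if that many buckets would exceed khash's load factor. A quick Python check of the same arithmetic, assuming khash's usual 0.77 __ac_HASH_UPPER:

    def needed_n_buckets(n_elements, hash_upper=0.77):
        candidate = 1 << max(n_elements - 1, 0).bit_length()   # kroundup32
        upper_bound = int(candidate * hash_upper + 0.5)
        return 2 * candidate if upper_bound < n_elements else candidate

    assert needed_n_buckets(700) == 1024
    assert needed_n_buckets(1000) == 2048   # 1024 * 0.77 ~= 788 < 1000, so double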
diff --git a/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime.h b/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime.h
index 6b5135f559482..e4e90a7ea24cf 100644
--- a/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime.h
+++ b/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime.h
@@ -18,44 +18,44 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt
#ifndef NPY_NO_DEPRECATED_API
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
-#endif // NPY_NO_DEPRECATED_API
+#endif // NPY_NO_DEPRECATED_API
#include <numpy/ndarraytypes.h>
typedef struct {
- npy_int64 days;
- npy_int32 hrs, min, sec, ms, us, ns, seconds, microseconds, nanoseconds;
+ npy_int64 days;
+ npy_int32 hrs, min, sec, ms, us, ns, seconds, microseconds, nanoseconds;
} pandas_timedeltastruct;
-static const npy_datetimestruct _AS_MIN_DTS = {
- 1969, 12, 31, 23, 59, 50, 776627, 963145, 224193};
-static const npy_datetimestruct _FS_MIN_DTS = {
- 1969, 12, 31, 21, 26, 16, 627963, 145224, 193000};
-static const npy_datetimestruct _PS_MIN_DTS = {
- 1969, 9, 16, 5, 57, 7, 963145, 224193, 0};
-static const npy_datetimestruct _NS_MIN_DTS = {
- 1677, 9, 21, 0, 12, 43, 145224, 193000, 0};
-static const npy_datetimestruct _US_MIN_DTS = {
- -290308, 12, 21, 19, 59, 05, 224193, 0, 0};
-static const npy_datetimestruct _MS_MIN_DTS = {
- -292275055, 5, 16, 16, 47, 4, 193000, 0, 0};
+static const npy_datetimestruct _AS_MIN_DTS = {1969, 12, 31, 23, 59,
+ 50, 776627, 963145, 224193};
+static const npy_datetimestruct _FS_MIN_DTS = {1969, 12, 31, 21, 26,
+ 16, 627963, 145224, 193000};
+static const npy_datetimestruct _PS_MIN_DTS = {1969, 9, 16, 5, 57,
+ 7, 963145, 224193, 0};
+static const npy_datetimestruct _NS_MIN_DTS = {1677, 9, 21, 0, 12,
+ 43, 145224, 193000, 0};
+static const npy_datetimestruct _US_MIN_DTS = {-290308, 12, 21, 19, 59,
+ 05, 224193, 0, 0};
+static const npy_datetimestruct _MS_MIN_DTS = {-292275055, 5, 16, 16, 47,
+ 4, 193000, 0, 0};
static const npy_datetimestruct _S_MIN_DTS = {
-292277022657, 1, 27, 8, 29, 53, 0, 0, 0};
static const npy_datetimestruct _M_MIN_DTS = {
-17536621475646, 5, 4, 5, 53, 0, 0, 0, 0};
-static const npy_datetimestruct _AS_MAX_DTS = {
- 1970, 1, 1, 0, 0, 9, 223372, 36854, 775807};
-static const npy_datetimestruct _FS_MAX_DTS = {
- 1970, 1, 1, 2, 33, 43, 372036, 854775, 807000};
-static const npy_datetimestruct _PS_MAX_DTS = {
- 1970, 4, 17, 18, 2, 52, 36854, 775807, 0};
-static const npy_datetimestruct _NS_MAX_DTS = {
- 2262, 4, 11, 23, 47, 16, 854775, 807000, 0};
-static const npy_datetimestruct _US_MAX_DTS = {
- 294247, 1, 10, 4, 0, 54, 775807, 0, 0};
-static const npy_datetimestruct _MS_MAX_DTS = {
- 292278994, 8, 17, 7, 12, 55, 807000, 0, 0};
+static const npy_datetimestruct _AS_MAX_DTS = {1970, 1, 1, 0, 0,
+ 9, 223372, 36854, 775807};
+static const npy_datetimestruct _FS_MAX_DTS = {1970, 1, 1, 2, 33,
+ 43, 372036, 854775, 807000};
+static const npy_datetimestruct _PS_MAX_DTS = {1970, 4, 17, 18, 2,
+ 52, 36854, 775807, 0};
+static const npy_datetimestruct _NS_MAX_DTS = {2262, 4, 11, 23, 47,
+ 16, 854775, 807000, 0};
+static const npy_datetimestruct _US_MAX_DTS = {294247, 1, 10, 4, 0,
+ 54, 775807, 0, 0};
+static const npy_datetimestruct _MS_MAX_DTS = {292278994, 8, 17, 7, 12,
+ 55, 807000, 0, 0};
static const npy_datetimestruct _S_MAX_DTS = {
292277026596, 12, 4, 15, 30, 7, 0, 0, 0};
static const npy_datetimestruct _M_MAX_DTS = {
@@ -72,8 +72,7 @@ npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base,
void pandas_datetime_to_datetimestruct(npy_datetime val, NPY_DATETIMEUNIT fr,
npy_datetimestruct *result);
-void pandas_timedelta_to_timedeltastruct(npy_timedelta val,
- NPY_DATETIMEUNIT fr,
+void pandas_timedelta_to_timedeltastruct(npy_timedelta val, NPY_DATETIMEUNIT fr,
pandas_timedeltastruct *result);
extern const int days_per_month_table[2][12];
@@ -86,9 +85,7 @@ int is_leapyear(npy_int64 year);
/*
* Calculates the days offset from the 1970 epoch.
*/
-npy_int64
-get_datetimestruct_days(const npy_datetimestruct *dts);
-
+npy_int64 get_datetimestruct_days(const npy_datetimestruct *dts);
/*
* Compares two npy_datetimestruct objects chronologically
@@ -96,17 +93,14 @@ get_datetimestruct_days(const npy_datetimestruct *dts);
int cmp_npy_datetimestruct(const npy_datetimestruct *a,
const npy_datetimestruct *b);
-
/*
* Adjusts a datetimestruct based on a minutes offset. Assumes
* the current values are valid.
*/
-void
-add_minutes_to_datetimestruct(npy_datetimestruct *dts, int minutes);
+void add_minutes_to_datetimestruct(npy_datetimestruct *dts, int minutes);
/*
* This function returns the DateTimeMetaData
* contained within the provided datetime dtype.
*/
-PyArray_DatetimeMetaData get_datetime_metadata_from_dtype(
- PyArray_Descr *dtype);
+PyArray_DatetimeMetaData get_datetime_metadata_from_dtype(PyArray_Descr *dtype);
diff --git a/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime_strings.h b/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime_strings.h
index 1098637e798fe..75e69f30ada1e 100644
--- a/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime_strings.h
+++ b/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime_strings.h
@@ -23,7 +23,7 @@ This file implements string parsing and creation for NumPy datetime.
#ifndef NPY_NO_DEPRECATED_API
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
-#endif // NPY_NO_DEPRECATED_API
+#endif // NPY_NO_DEPRECATED_API
/* 'format_requirement' can be one of three values:
* * PARTIAL_MATCH : Only require a partial match with 'format'.
@@ -34,11 +34,7 @@ This file implements string parsing and creation for NumPy datetime.
* be able to parse it without error is '%Y-%m-%d';
* * INFER_FORMAT: parse without comparing 'format' (i.e. infer it).
*/
-typedef enum {
- PARTIAL_MATCH,
- EXACT_MATCH,
- INFER_FORMAT
-} FormatRequirement;
+typedef enum { PARTIAL_MATCH, EXACT_MATCH, INFER_FORMAT } FormatRequirement;
/*
* Parses (almost) standard ISO 8601 date strings. The differences are:
@@ -58,31 +54,26 @@ typedef enum {
* 'str' must be a NULL-terminated string, and 'len' must be its length.
*
* 'out' gets filled with the parsed date-time.
- * 'out_local' gets whether returned value contains timezone. 0 for UTC, 1 for local time.
- * 'out_tzoffset' gets set to timezone offset by minutes
- * if the parsed time was in local time,
- * to 0 otherwise. The values 'now' and 'today' don't get counted
- * as local, and neither do UTC +/-#### timezone offsets, because
- * they aren't using the computer's local timezone offset.
+ * 'out_local' gets whether returned value contains timezone. 0 for UTC, 1 for
+ * local time. 'out_tzoffset' gets set to timezone offset by minutes if the
+ * parsed time was in local time, to 0 otherwise. The values 'now' and 'today'
+ * don't get counted as local, and neither do UTC +/-#### timezone offsets,
+ * because they aren't using the computer's local timezone offset.
*
* Returns 0 on success, -1 on failure.
*/
-int
-parse_iso_8601_datetime(const char *str, int len, int want_exc,
- npy_datetimestruct *out,
- NPY_DATETIMEUNIT *out_bestunit,
- int *out_local,
- int *out_tzoffset,
- const char* format,
- int format_len,
- FormatRequirement format_requirement);
+int parse_iso_8601_datetime(const char *str, int len, int want_exc,
+ npy_datetimestruct *out,
+ NPY_DATETIMEUNIT *out_bestunit, int *out_local,
+ int *out_tzoffset, const char *format,
+ int format_len,
+ FormatRequirement format_requirement);
/*
* Provides a string length to use for converting datetime
* objects with the given local and unit settings.
*/
-int
-get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base);
+int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base);
/*
* Converts an npy_datetimestruct to an (almost) ISO 8601
@@ -94,9 +85,8 @@ get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base);
* Returns 0 on success, -1 on failure (for example if the output
* string was too short).
*/
-int
-make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
- int utc, NPY_DATETIMEUNIT base);
+int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, size_t outlen,
+ int utc, NPY_DATETIMEUNIT base);
/*
* Converts an pandas_timedeltastruct to an ISO 8601 string.
diff --git a/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h b/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h
index 54bcca9e4136c..0d62bb0ba915c 100644
--- a/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h
+++ b/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h
@@ -16,18 +16,19 @@ modification, are permitted provided that the following conditions are met:
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
https://github.com/client9/stringencoders
-Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved.
+Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights
+reserved.
Numeric decoder derived from TCL library
https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
@@ -51,9 +52,9 @@ tree doesn't have cyclic references.
#pragma once
+#include "pandas/portable.h"
#include <stdio.h>
#include <wchar.h>
-#include "pandas/portable.h"
// Don't output any extra whitespaces when encoding
#define JSON_NO_EXTRA_WHITESPACE
@@ -74,7 +75,8 @@ tree doesn't have cyclic references.
#endif
/*
-Dictates and limits how much stack space for buffers UltraJSON will use before resorting to provided heap functions */
+Dictates and limits how much stack space for buffers UltraJSON will use before
+resorting to provided heap functions */
#ifndef JSON_MAX_STACK_BUFFER_SIZE
#define JSON_MAX_STACK_BUFFER_SIZE 131072
#endif
@@ -138,23 +140,23 @@ typedef int64_t JSLONG;
#endif
enum JSTYPES {
- JT_NULL, // NULL
- JT_TRUE, // boolean true
- JT_FALSE, // boolean false
- JT_INT, // (JSINT32 (signed 32-bit))
- JT_LONG, // (JSINT64 (signed 64-bit))
- JT_DOUBLE, // (double)
- JT_BIGNUM, // integer larger than sys.maxsize
- JT_UTF8, // (char 8-bit)
- JT_ARRAY, // Array structure
- JT_OBJECT, // Key/Value structure
- JT_INVALID, // Internal, do not return nor expect
- JT_POS_INF, // Positive infinity
- JT_NEG_INF, // Negative infinity
+ JT_NULL, // NULL
+ JT_TRUE, // boolean true
+ JT_FALSE, // boolean false
+ JT_INT, // (JSINT32 (signed 32-bit))
+ JT_LONG, // (JSINT64 (signed 64-bit))
+ JT_DOUBLE, // (double)
+ JT_BIGNUM, // integer larger than sys.maxsize
+ JT_UTF8, // (char 8-bit)
+ JT_ARRAY, // Array structure
+ JT_OBJECT, // Key/Value structure
+ JT_INVALID, // Internal, do not return nor expect
+ JT_POS_INF, // Positive infinity
+ JT_NEG_INF, // Negative infinity
};
-typedef void * JSOBJ;
-typedef void * JSITER;
+typedef void *JSOBJ;
+typedef void *JSITER;
typedef struct __JSONTypeContext {
int type;
@@ -183,17 +185,18 @@ typedef struct __JSONObjectEncoder {
JSINT32 (*getIntValue)(JSOBJ obj, JSONTypeContext *tc);
double (*getDoubleValue)(JSOBJ obj, JSONTypeContext *tc);
const char *(*getBigNumStringValue)(JSOBJ obj, JSONTypeContext *tc,
- size_t *_outLen);
+ size_t *_outLen);
/*
Begin iteration of an iterable object (JS_ARRAY or JS_OBJECT)
- Implementor should setup iteration state in ti->prv
+ Implementer should setup iteration state in ti->prv
*/
JSPFN_ITERBEGIN iterBegin;
/*
- Retrieve next object in an iteration. Should return 0 to indicate iteration has reached end or 1 if there are more items.
- Implementor is responsible for keeping state of the iteration. Use ti->prv fields for this
+ Retrieve next object in an iteration. Should return 0 to indicate iteration
+ has reached end or 1 if there are more items. Implementer is responsible for
+ keeping state of the iteration. Use ti->prv fields for this
*/
JSPFN_ITERNEXT iterNext;
@@ -205,19 +208,22 @@ typedef struct __JSONObjectEncoder {
/*
Returns a reference to the value object of an iterator
- The is responsible for the life-cycle of the returned string. Use iterNext/iterEnd and ti->prv to keep track of current object
+ The is responsible for the life-cycle of the returned string. Use
+ iterNext/iterEnd and ti->prv to keep track of current object
*/
JSPFN_ITERGETVALUE iterGetValue;
/*
Return name of iterator.
- The is responsible for the life-cycle of the returned string. Use iterNext/iterEnd and ti->prv to keep track of current object
+ The is responsible for the life-cycle of the returned string. Use
+ iterNext/iterEnd and ti->prv to keep track of current object
*/
JSPFN_ITERGETNAME iterGetName;
/*
- Release a value as indicated by setting ti->release = 1 in the previous getValue call.
- The ti->prv array should contain the necessary context to release the value
+ Release a value as indicated by setting ti->release = 1 in the previous
+ getValue call. The ti->prv array should contain the necessary context to
+ release the value
*/
void (*releaseObject)(JSOBJ obj);
@@ -228,19 +234,23 @@ typedef struct __JSONObjectEncoder {
JSPFN_FREE free;
/*
- Configuration for max recursion, set to 0 to use default (see JSON_MAX_RECURSION_DEPTH)*/
+ Configuration for max recursion, set to 0 to use default (see
+ JSON_MAX_RECURSION_DEPTH)*/
int recursionMax;
/*
- Configuration for max decimals of double floating point numbers to encode (0-9) */
+ Configuration for max decimals of double floating point numbers to encode
+ (0-9) */
int doublePrecision;
/*
- If true output will be ASCII with all characters above 127 encoded as \uXXXX. If false output will be UTF-8 or what ever charset strings are brought as */
+ If true output will be ASCII with all characters above 127 encoded as \uXXXX.
+ If false output will be UTF-8 or what ever charset strings are brought as */
int forceASCII;
/*
- If true, '<', '>', and '&' characters will be encoded as \u003c, \u003e, and \u0026, respectively. If false, no special encoding will be used. */
+ If true, '<', '>', and '&' characters will be encoded as \u003c, \u003e, and
+ \u0026, respectively. If false, no special encoding will be used. */
int encodeHTMLChars;
/*
@@ -266,18 +276,20 @@ Encode an object structure into JSON.
Arguments:
obj - An anonymous type representing the object
enc - Function definitions for querying JSOBJ type
-buffer - Preallocated buffer to store result in. If NULL function allocates own buffer
-cbBuffer - Length of buffer (ignored if buffer is NULL)
+buffer - Preallocated buffer to store result in. If NULL function allocates own
+buffer cbBuffer - Length of buffer (ignored if buffer is NULL)
Returns:
Encoded JSON object as a null terminated char string.
NOTE:
-If the supplied buffer wasn't enough to hold the result the function will allocate a new buffer.
-Life cycle of the provided buffer must still be handled by caller.
+If the supplied buffer wasn't enough to hold the result the function will
+allocate a new buffer. Life cycle of the provided buffer must still be handled
+by caller.
-If the return value doesn't equal the specified buffer caller must release the memory using
-JSONObjectEncoder.free or free() as specified when calling this function.
+If the return value doesn't equal the specified buffer caller must release the
+memory using JSONObjectEncoder.free or free() as specified when calling this
+function.
*/
EXPORTFUNCTION char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc,
char *buffer, size_t cbBuffer);
diff --git a/pandas/_libs/include/pandas/vendored/ujson/python/version.h b/pandas/_libs/include/pandas/vendored/ujson/python/version.h
deleted file mode 100644
index 97232dd821387..0000000000000
--- a/pandas/_libs/include/pandas/vendored/ujson/python/version.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
-Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the ESN Social Software AB nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
-https://github.com/client9/stringencoders
-Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved.
-
-Numeric decoder derived from TCL library
-https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
- * Copyright (c) 1988-1993 The Regents of the University of California.
- * Copyright (c) 1994 Sun Microsystems, Inc.
-*/
-
-#pragma once
-
-#define UJSON_VERSION "1.33"
diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi
index 8321200a84b76..75db47bf3160e 100644
--- a/pandas/_libs/index.pyi
+++ b/pandas/_libs/index.pyi
@@ -80,13 +80,6 @@ class BaseMultiIndexCodesEngine:
) -> None: ...
def get_indexer(self, target: npt.NDArray[np.object_]) -> npt.NDArray[np.intp]: ...
def _extract_level_codes(self, target: MultiIndex) -> np.ndarray: ...
- def get_indexer_with_fill(
- self,
- target: np.ndarray, # np.ndarray[object] of tuples
- values: np.ndarray, # np.ndarray[object] of tuples
- method: str,
- limit: int | None,
- ) -> npt.NDArray[np.intp]: ...
class ExtensionEngine:
def __init__(self, values: ExtensionArray) -> None: ...
diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
index e974b5d0eec46..0dc139781f58d 100644
--- a/pandas/_libs/index.pyx
+++ b/pandas/_libs/index.pyx
@@ -1,4 +1,5 @@
cimport cython
+from cpython.sequence cimport PySequence_GetItem
import numpy as np
@@ -77,7 +78,7 @@ cdef ndarray _get_bool_indexer(ndarray values, object val, ndarray mask = None):
indexer = np.empty(len(values), dtype=np.uint8)
for i in range(len(values)):
- item = values[i]
+ item = PySequence_GetItem(values, i)
indexer[i] = is_matching_na(item, val)
else:
@@ -354,7 +355,7 @@ cdef class IndexEngine:
dict d = {}
object val
Py_ssize_t count = 0, count_missing = 0
- Py_ssize_t i, j, n, n_t, n_alloc, start, end
+ Py_ssize_t i, j, n, n_t, n_alloc, max_alloc, start, end
bint check_na_values = False
values = self.values
@@ -364,6 +365,7 @@ cdef class IndexEngine:
n = len(values)
n_t = len(targets)
+ max_alloc = n * n_t
if n > 10_000:
n_alloc = 10_000
else:
@@ -375,7 +377,7 @@ cdef class IndexEngine:
# map each starget to its position in the index
if (
stargets and
- len(stargets) < 5 and
+ len(stargets) < (n / (2 * n.bit_length())) and
not na_in_stargets and
self.is_monotonic_increasing
):
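The hard-coded cutoff of 5 unique targets becomes n / (2 * log2(n)), presumably because each target on the sorted/monotonic path costs two binary searches of about log2(n) steps, so that path only beats building the full value-to-position map while the number of distinct targets stays below roughly that ratio. For instance:

    n = 1_000_000
    threshold = n / (2 * n.bit_length())   # bit_length() ~ log2(n): 1_000_000 / 40 = 25_000.0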
@@ -404,7 +406,7 @@ cdef class IndexEngine:
found_nas = set()
for i in range(n):
- val = values[i]
+ val = PySequence_GetItem(values, i)
# GH#43870
# handle lookup for nas
@@ -436,7 +438,7 @@ cdef class IndexEngine:
d[val].append(i)
for i in range(n_t):
- val = targets[i]
+ val = PySequence_GetItem(targets, i)
# ensure there are nas in values before looking for a matching na
if check_na_values and checknull(val):
@@ -453,7 +455,9 @@ cdef class IndexEngine:
# realloc if needed
if count >= n_alloc:
- n_alloc += 10_000
+ n_alloc *= 2
+ if n_alloc > max_alloc:
+ n_alloc = max_alloc
result = np.resize(result, n_alloc)
result[count] = j
@@ -463,7 +467,9 @@ cdef class IndexEngine:
else:
if count >= n_alloc:
- n_alloc += 10_000
+ n_alloc *= 2
+ if n_alloc > max_alloc:
+ n_alloc = max_alloc
result = np.resize(result, n_alloc)
result[count] = -1
count += 1
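Buffer growth for the result indexer switches from fixed 10_000-element increments to doubling, capped at max_alloc = n * n_t (the worst case in which every target matches every value), so the number of np.resize calls stays logarithmic in the output size. A minimal sketch of the policy, reusing those names:

    import numpy as np

    def grow_if_full(result, count, n_alloc, max_alloc):
        # double the buffer when full, but never past the theoretical maximum
        if count >= n_alloc:
            n_alloc = min(n_alloc * 2, max_alloc)
            result = np.resize(result, n_alloc)
        return result, n_alloc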
@@ -483,22 +489,22 @@ cdef Py_ssize_t _bin_search(ndarray values, object val) except -1:
Py_ssize_t mid = 0, lo = 0, hi = len(values) - 1
object pval
- if hi == 0 or (hi > 0 and val > values[hi]):
+ if hi == 0 or (hi > 0 and val > PySequence_GetItem(values, hi)):
return len(values)
while lo < hi:
mid = (lo + hi) // 2
- pval = values[mid]
+ pval = PySequence_GetItem(values, mid)
if val < pval:
hi = mid
elif val > pval:
lo = mid + 1
else:
- while mid > 0 and val == values[mid - 1]:
+ while mid > 0 and val == PySequence_GetItem(values, mid - 1):
mid -= 1
return mid
- if val <= values[mid]:
+ if val <= PySequence_GetItem(values, mid):
return mid
else:
return mid + 1
@@ -586,7 +592,7 @@ cdef class DatetimeEngine(Int64Engine):
loc = values.searchsorted(conv, side="left")
- if loc == len(values) or values[loc] != conv:
+ if loc == len(values) or PySequence_GetItem(values, loc) != conv:
raise KeyError(val)
return loc
@@ -748,91 +754,6 @@ cdef class BaseMultiIndexCodesEngine:
"""
return self._base.get_indexer(self, target)
- def get_indexer_with_fill(self, ndarray target, ndarray values,
- str method, object limit) -> np.ndarray:
- """
- Returns an array giving the positions of each value of `target` in
- `values`, where -1 represents a value in `target` which does not
- appear in `values`
-
- If `method` is "backfill" then the position for a value in `target`
- which does not appear in `values` is that of the next greater value
- in `values` (if one exists), and -1 if there is no such value.
-
- Similarly, if the method is "pad" then the position for a value in
- `target` which does not appear in `values` is that of the next smaller
- value in `values` (if one exists), and -1 if there is no such value.
-
- Parameters
- ----------
- target: ndarray[object] of tuples
- need not be sorted, but all must have the same length, which must be
- the same as the length of all tuples in `values`
- values : ndarray[object] of tuples
- must be sorted and all have the same length. Should be the set of
- the MultiIndex's values.
- method: string
- "backfill" or "pad"
- limit: int or None
- if provided, limit the number of fills to this value
-
- Returns
- -------
- np.ndarray[intp_t, ndim=1] of the indexer of `target` into `values`,
- filled with the `method` (and optionally `limit`) specified
- """
- assert method in ("backfill", "pad")
- cdef:
- int64_t i, j, next_code
- int64_t num_values, num_target_values
- ndarray[int64_t, ndim=1] target_order
- ndarray[object, ndim=1] target_values
- ndarray[int64_t, ndim=1] new_codes, new_target_codes
- ndarray[intp_t, ndim=1] sorted_indexer
-
- target_order = np.argsort(target).astype("int64")
- target_values = target[target_order]
- num_values, num_target_values = len(values), len(target_values)
- new_codes, new_target_codes = (
- np.empty((num_values,)).astype("int64"),
- np.empty((num_target_values,)).astype("int64"),
- )
-
- # `values` and `target_values` are both sorted, so we walk through them
- # and memoize the (ordered) set of indices in the (implicit) merged-and
- # sorted list of the two which belong to each of them
- # the effect of this is to create a factorization for the (sorted)
- # merger of the index values, where `new_codes` and `new_target_codes`
- # are the subset of the factors which appear in `values` and `target`,
- # respectively
- i, j, next_code = 0, 0, 0
- while i < num_values and j < num_target_values:
- val, target_val = values[i], target_values[j]
- if val <= target_val:
- new_codes[i] = next_code
- i += 1
- if target_val <= val:
- new_target_codes[j] = next_code
- j += 1
- next_code += 1
-
- # at this point, at least one should have reached the end
- # the remaining values of the other should be added to the end
- assert i == num_values or j == num_target_values
- while i < num_values:
- new_codes[i] = next_code
- i += 1
- next_code += 1
- while j < num_target_values:
- new_target_codes[j] = next_code
- j += 1
- next_code += 1
-
- # get the indexer, and undo the sorting of `target.values`
- algo = algos.backfill if method == "backfill" else algos.pad
- sorted_indexer = algo(new_codes, new_target_codes, limit=limit)
- return sorted_indexer[np.argsort(target_order)]
-
def get_loc(self, object key):
if is_definitely_invalid_key(key):
raise TypeError(f"'{key}' is an invalid key")
@@ -1042,7 +963,7 @@ cdef class SharedEngine:
res = np.empty(N, dtype=np.intp)
for i in range(N):
- val = values[i]
+ val = PySequence_GetItem(values, i)
try:
loc = self.get_loc(val)
# Because we are unique, loc should always be an integer
@@ -1076,7 +997,7 @@ cdef class SharedEngine:
# See also IntervalIndex.get_indexer_pointwise
for i in range(N):
- val = targets[i]
+ val = PySequence_GetItem(targets, i)
try:
locs = self.get_loc(val)
@@ -1211,7 +1132,7 @@ cdef class MaskedIndexEngine(IndexEngine):
dict d = {}
object val
Py_ssize_t count = 0, count_missing = 0
- Py_ssize_t i, j, n, n_t, n_alloc, start, end, na_idx
+ Py_ssize_t i, j, n, n_t, n_alloc, max_alloc, start, end, na_idx
target_vals = self._get_data(targets)
target_mask = self._get_mask(targets)
@@ -1224,6 +1145,7 @@ cdef class MaskedIndexEngine(IndexEngine):
n = len(values)
n_t = len(target_vals)
+ max_alloc = n * n_t
if n > 10_000:
n_alloc = 10_000
else:
@@ -1255,9 +1177,9 @@ cdef class MaskedIndexEngine(IndexEngine):
na_pos = []
for i in range(n):
- val = values[i]
+ val = PySequence_GetItem(values, i)
- if mask[i]:
+ if PySequence_GetItem(mask, i):
na_pos.append(i)
else:
@@ -1267,15 +1189,16 @@ cdef class MaskedIndexEngine(IndexEngine):
d[val].append(i)
for i in range(n_t):
- val = target_vals[i]
+ val = PySequence_GetItem(target_vals, i)
- if target_mask[i]:
+ if PySequence_GetItem(target_mask, i):
if na_pos:
for na_idx in na_pos:
# realloc if needed
if count >= n_alloc:
- n_alloc += 10_000
- result = np.resize(result, n_alloc)
+ n_alloc *= 2
+ if n_alloc > max_alloc:
+ n_alloc = max_alloc
result[count] = na_idx
count += 1
@@ -1289,8 +1212,9 @@ cdef class MaskedIndexEngine(IndexEngine):
# realloc if needed
if count >= n_alloc:
- n_alloc += 10_000
- result = np.resize(result, n_alloc)
+ n_alloc *= 2
+ if n_alloc > max_alloc:
+ n_alloc = max_alloc
result[count] = j
count += 1
diff --git a/pandas/_libs/internals.pyi b/pandas/_libs/internals.pyi
index ce112413f8a64..ffe6c7730bcdc 100644
--- a/pandas/_libs/internals.pyi
+++ b/pandas/_libs/internals.pyi
@@ -15,7 +15,6 @@ from pandas._typing import (
)
from pandas import Index
-from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
from pandas.core.internals.blocks import Block as B
def slice_len(slc: slice, objlen: int = ...) -> int: ...
@@ -60,7 +59,7 @@ class BlockPlacement:
def append(self, others: list[BlockPlacement]) -> BlockPlacement: ...
def tile_for_unstack(self, factor: int) -> npt.NDArray[np.intp]: ...
-class SharedBlock:
+class Block:
_mgr_locs: BlockPlacement
ndim: int
values: ArrayLike
@@ -72,19 +71,8 @@ class SharedBlock:
ndim: int,
refs: BlockValuesRefs | None = ...,
) -> None: ...
-
-class NumpyBlock(SharedBlock):
- values: np.ndarray
- @final
- def slice_block_rows(self, slicer: slice) -> Self: ...
-
-class NDArrayBackedBlock(SharedBlock):
- values: NDArrayBackedExtensionArray
- @final
def slice_block_rows(self, slicer: slice) -> Self: ...
-class Block(SharedBlock): ...
-
class BlockManager:
blocks: tuple[B, ...]
axes: list[Index]
@@ -100,7 +88,7 @@ class BlockManager:
class BlockValuesRefs:
referenced_blocks: list[weakref.ref]
- def __init__(self, blk: SharedBlock | None = ...) -> None: ...
- def add_reference(self, blk: SharedBlock) -> None: ...
+ def __init__(self, blk: Block | None = ...) -> None: ...
+ def add_reference(self, blk: Block) -> None: ...
def add_index_reference(self, index: Index) -> None: ...
def has_reference(self) -> bool: ...
diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx
index adf4e8c926fa3..05c4e7bd5e9dc 100644
--- a/pandas/_libs/internals.pyx
+++ b/pandas/_libs/internals.pyx
@@ -2,14 +2,10 @@ from collections import defaultdict
import weakref
cimport cython
+from cpython.pyport cimport PY_SSIZE_T_MAX
from cpython.slice cimport PySlice_GetIndicesEx
from cython cimport Py_ssize_t
-
-cdef extern from "Python.h":
- # TODO(cython3): from cpython.pyport cimport PY_SSIZE_T_MAX
- Py_ssize_t PY_SSIZE_T_MAX
-
import numpy as np
cimport numpy as cnp
@@ -24,7 +20,6 @@ cnp.import_array()
from pandas._libs.algos import ensure_int64
-from pandas._libs.arrays cimport NDArrayBacked
from pandas._libs.util cimport (
is_array,
is_integer_object,
@@ -639,7 +634,7 @@ def _unpickle_block(values, placement, ndim):
@cython.freelist(64)
-cdef class SharedBlock:
+cdef class Block:
"""
Defining __init__ in a cython class significantly improves performance.
"""
@@ -647,6 +642,11 @@ cdef class SharedBlock:
public BlockPlacement _mgr_locs
public BlockValuesRefs refs
readonly int ndim
+ # 2023-08-15 no apparent performance improvement from declaring values
+ # as ndarray in a type-special subclass (similar for NDArrayBacked).
+ # This might change if slice_block_rows can be optimized with something
+ # like https://github.com/numpy/numpy/issues/23934
+ public object values
def __cinit__(
self,
@@ -666,6 +666,8 @@ cdef class SharedBlock:
refs: BlockValuesRefs, optional
Ref tracking object or None if block does not have any refs.
"""
+ self.values = values
+
self._mgr_locs = placement
self.ndim = ndim
if refs is None:
@@ -699,23 +701,7 @@ cdef class SharedBlock:
ndim = maybe_infer_ndim(self.values, self.mgr_locs)
self.ndim = ndim
-
-cdef class NumpyBlock(SharedBlock):
- cdef:
- public ndarray values
-
- def __cinit__(
- self,
- ndarray values,
- BlockPlacement placement,
- int ndim,
- refs: BlockValuesRefs | None = None,
- ):
- # set values here; the (implicit) call to SharedBlock.__cinit__ will
- # set placement, ndim and refs
- self.values = values
-
- cpdef NumpyBlock slice_block_rows(self, slice slicer):
+ cpdef Block slice_block_rows(self, slice slicer):
"""
Perform __getitem__-like specialized to slicing along index.
@@ -725,50 +711,6 @@ cdef class NumpyBlock(SharedBlock):
return type(self)(new_values, self._mgr_locs, ndim=self.ndim, refs=self.refs)
-cdef class NDArrayBackedBlock(SharedBlock):
- """
- Block backed by NDArrayBackedExtensionArray
- """
- cdef public:
- NDArrayBacked values
-
- def __cinit__(
- self,
- NDArrayBacked values,
- BlockPlacement placement,
- int ndim,
- refs: BlockValuesRefs | None = None,
- ):
- # set values here; the (implicit) call to SharedBlock.__cinit__ will
- # set placement, ndim and refs
- self.values = values
-
- cpdef NDArrayBackedBlock slice_block_rows(self, slice slicer):
- """
- Perform __getitem__-like specialized to slicing along index.
-
- Assumes self.ndim == 2
- """
- new_values = self.values[..., slicer]
- return type(self)(new_values, self._mgr_locs, ndim=self.ndim, refs=self.refs)
-
-
-cdef class Block(SharedBlock):
- cdef:
- public object values
-
- def __cinit__(
- self,
- object values,
- BlockPlacement placement,
- int ndim,
- refs: BlockValuesRefs | None = None,
- ):
- # set values here; the (implicit) call to SharedBlock.__cinit__ will
- # set placement, ndim and refs
- self.values = values
-
-
@cython.freelist(64)
cdef class BlockManager:
cdef:
@@ -811,7 +753,7 @@ cdef class BlockManager:
cdef:
intp_t blkno, i, j
cnp.npy_intp length = self.shape[0]
- SharedBlock blk
+ Block blk
BlockPlacement bp
ndarray[intp_t, ndim=1] new_blknos, new_blklocs
@@ -901,7 +843,7 @@ cdef class BlockManager:
cdef BlockManager _slice_mgr_rows(self, slice slobj):
cdef:
- SharedBlock blk, nb
+ Block blk, nb
BlockManager mgr
ndarray blknos, blklocs
@@ -944,21 +886,39 @@ cdef class BlockValuesRefs:
"""
cdef:
public list referenced_blocks
+ public int clear_counter
- def __cinit__(self, blk: SharedBlock | None = None) -> None:
+ def __cinit__(self, blk: Block | None = None) -> None:
if blk is not None:
self.referenced_blocks = [weakref.ref(blk)]
else:
self.referenced_blocks = []
+ self.clear_counter = 500 # set reasonably high
+
+ def _clear_dead_references(self, force=False) -> None:
+ # Use exponential backoff to decide when we want to clear references
+ # if force=False. Clearing for every insertion causes slowdowns if
+ # all these objects stay alive, e.g. df.items() for wide DataFrames
+ # see GH#55245 and GH#55008
+ if force or len(self.referenced_blocks) > self.clear_counter:
+ self.referenced_blocks = [
+ ref for ref in self.referenced_blocks if ref() is not None
+ ]
+ nr_of_refs = len(self.referenced_blocks)
+ if nr_of_refs < self.clear_counter // 2:
+ self.clear_counter = max(self.clear_counter // 2, 500)
+ elif nr_of_refs > self.clear_counter:
+ self.clear_counter = max(self.clear_counter * 2, nr_of_refs)
- def add_reference(self, blk: SharedBlock) -> None:
+ def add_reference(self, blk: Block) -> None:
"""Adds a new reference to our reference collection.
Parameters
----------
- blk: SharedBlock
+ blk : Block
The block that the new references should point to.
"""
+ self._clear_dead_references()
self.referenced_blocks.append(weakref.ref(blk))
def add_index_reference(self, index: object) -> None:
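_clear_dead_references prunes dead weakrefs only once the list outgrows clear_counter, then halves or doubles that threshold (never below 500) depending on how much was reclaimed, so wide DataFrames holding many live references do not pay a full scan on every insertion (GH 55245, GH 55008). A condensed Python sketch of the backoff, with the block/weakref plumbing stubbed out:

    import weakref

    class RefTracker:
        def __init__(self):
            self.refs = []              # stand-in for referenced_blocks
            self.clear_counter = 500

        def _clear_dead(self, force=False):
            if force or len(self.refs) > self.clear_counter:
                self.refs = [r for r in self.refs if r() is not None]
                nr = len(self.refs)
                if nr < self.clear_counter // 2:
                    self.clear_counter = max(self.clear_counter // 2, 500)
                elif nr > self.clear_counter:
                    self.clear_counter = max(self.clear_counter * 2, nr)

        def add(self, obj):
            self._clear_dead()
            self.refs.append(weakref.ref(obj))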
@@ -969,6 +929,7 @@ cdef class BlockValuesRefs:
index : Index
The index that the new reference should point to.
"""
+ self._clear_dead_references()
self.referenced_blocks.append(weakref.ref(index))
def has_reference(self) -> bool:
@@ -981,8 +942,6 @@ cdef class BlockValuesRefs:
-------
bool
"""
- self.referenced_blocks = [
- ref for ref in self.referenced_blocks if ref() is not None
- ]
+ self._clear_dead_references(force=True)
# Checking for more references than block pointing to itself
return len(self.referenced_blocks) > 1
diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx
index e07d80dd04b31..a37ab45dd57ed 100644
--- a/pandas/_libs/interval.pyx
+++ b/pandas/_libs/interval.pyx
@@ -39,7 +39,6 @@ from pandas._libs.tslibs.timezones cimport tz_compare
from pandas._libs.tslibs.util cimport (
is_float_object,
is_integer_object,
- is_timedelta64_object,
)
VALID_CLOSED = frozenset(["left", "right", "both", "neither"])
@@ -478,57 +477,31 @@ cdef class Interval(IntervalMixin):
args = (self.left, self.right, self.closed)
return (type(self), args)
- def _repr_base(self):
- left = self.left
- right = self.right
-
- # TODO: need more general formatting methodology here
- if isinstance(left, _Timestamp) and isinstance(right, _Timestamp):
- left = left._short_repr
- right = right._short_repr
-
- return left, right
-
def __repr__(self) -> str:
-
- left, right = self._repr_base()
- disp = str if isinstance(left, np.generic) else repr
+ disp = str if isinstance(self.left, (np.generic, _Timestamp)) else repr
name = type(self).__name__
- repr_str = f"{name}({disp(left)}, {disp(right)}, closed={repr(self.closed)})"
+ repr_str = f"{name}({disp(self.left)}, {disp(self.right)}, closed={repr(self.closed)})" # noqa: E501
return repr_str
def __str__(self) -> str:
-
- left, right = self._repr_base()
start_symbol = "[" if self.closed_left else "("
end_symbol = "]" if self.closed_right else ")"
- return f"{start_symbol}{left}, {right}{end_symbol}"
+ return f"{start_symbol}{self.left}, {self.right}{end_symbol}"
def __add__(self, y):
if (
isinstance(y, numbers.Number)
or PyDelta_Check(y)
- or is_timedelta64_object(y)
+ or cnp.is_timedelta64_object(y)
):
return Interval(self.left + y, self.right + y, closed=self.closed)
- elif (
- # __radd__ pattern
- # TODO(cython3): remove this
- isinstance(y, Interval)
- and (
- isinstance(self, numbers.Number)
- or PyDelta_Check(self)
- or is_timedelta64_object(self)
- )
- ):
- return Interval(y.left + self, y.right + self, closed=y.closed)
return NotImplemented
def __radd__(self, other):
if (
isinstance(other, numbers.Number)
or PyDelta_Check(other)
- or is_timedelta64_object(other)
+ or cnp.is_timedelta64_object(other)
):
return Interval(self.left + other, self.right + other, closed=self.closed)
return NotImplemented
@@ -537,7 +510,7 @@ cdef class Interval(IntervalMixin):
if (
isinstance(y, numbers.Number)
or PyDelta_Check(y)
- or is_timedelta64_object(y)
+ or cnp.is_timedelta64_object(y)
):
return Interval(self.left - y, self.right - y, closed=self.closed)
return NotImplemented
@@ -545,10 +518,6 @@ cdef class Interval(IntervalMixin):
def __mul__(self, y):
if isinstance(y, numbers.Number):
return Interval(self.left * y, self.right * y, closed=self.closed)
- elif isinstance(y, Interval) and isinstance(self, numbers.Number):
- # __radd__ semantics
- # TODO(cython3): remove this
- return Interval(y.left * self, y.right * self, closed=y.closed)
return NotImplemented
def __rmul__(self, other):
diff --git a/pandas/_libs/intervaltree.pxi.in b/pandas/_libs/intervaltree.pxi.in
index 0b99aebbd3816..a6cec0fb30ecc 100644
--- a/pandas/_libs/intervaltree.pxi.in
+++ b/pandas/_libs/intervaltree.pxi.in
@@ -391,6 +391,11 @@ cdef class {{dtype_title}}Closed{{closed_title}}IntervalNode(IntervalNode):
"""Recursively query this node and its sub-nodes for intervals that
overlap with the query point.
"""
+
+ # GH 51826: ensures nan is handled properly during reindexing
+ if np.isnan(point):
+ return
+
cdef:
int64_t[:] indices
{{dtype}}_t[:] values
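Bailing out early for a NaN query point means a NaN never reports any overlapping interval, so reindexing against an IntervalIndex maps NaN to -1 instead of misbehaving (GH 51826). A small example of the behaviour this guard targets, written against the public API, which routes through this query method:

    import numpy as np
    import pandas as pd

    idx = pd.IntervalIndex.from_breaks([0, 1, 2])
    print(idx.get_indexer([0.5, np.nan]))   # expected: [ 0 -1]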
diff --git a/pandas/_libs/join.pyi b/pandas/_libs/join.pyi
index 7ee649a55fd8f..1d4e8c90bc559 100644
--- a/pandas/_libs/join.pyi
+++ b/pandas/_libs/join.pyi
@@ -6,6 +6,7 @@ def inner_join(
left: np.ndarray, # const intp_t[:]
right: np.ndarray, # const intp_t[:]
max_groups: int,
+ sort: bool = ...,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
def left_outer_join(
left: np.ndarray, # const intp_t[:]
@@ -52,8 +53,8 @@ def outer_join_indexer(
def asof_join_backward_on_X_by_Y(
left_values: np.ndarray, # ndarray[numeric_t]
right_values: np.ndarray, # ndarray[numeric_t]
- left_by_values: np.ndarray, # ndarray[by_t]
- right_by_values: np.ndarray, # ndarray[by_t]
+ left_by_values: np.ndarray, # const int64_t[:]
+ right_by_values: np.ndarray, # const int64_t[:]
allow_exact_matches: bool = ...,
tolerance: np.number | float | None = ...,
use_hashtable: bool = ...,
@@ -61,8 +62,8 @@ def asof_join_backward_on_X_by_Y(
def asof_join_forward_on_X_by_Y(
left_values: np.ndarray, # ndarray[numeric_t]
right_values: np.ndarray, # ndarray[numeric_t]
- left_by_values: np.ndarray, # ndarray[by_t]
- right_by_values: np.ndarray, # ndarray[by_t]
+ left_by_values: np.ndarray, # const int64_t[:]
+ right_by_values: np.ndarray, # const int64_t[:]
allow_exact_matches: bool = ...,
tolerance: np.number | float | None = ...,
use_hashtable: bool = ...,
@@ -70,8 +71,8 @@ def asof_join_forward_on_X_by_Y(
def asof_join_nearest_on_X_by_Y(
left_values: np.ndarray, # ndarray[numeric_t]
right_values: np.ndarray, # ndarray[numeric_t]
- left_by_values: np.ndarray, # ndarray[by_t]
- right_by_values: np.ndarray, # ndarray[by_t]
+ left_by_values: np.ndarray, # const int64_t[:]
+ right_by_values: np.ndarray, # const int64_t[:]
allow_exact_matches: bool = ...,
tolerance: np.number | float | None = ...,
use_hashtable: bool = ...,
diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx
index 5929647468785..368abe60d7237 100644
--- a/pandas/_libs/join.pyx
+++ b/pandas/_libs/join.pyx
@@ -7,7 +7,6 @@ from numpy cimport (
int64_t,
intp_t,
ndarray,
- uint64_t,
)
cnp.import_array()
@@ -23,7 +22,7 @@ from pandas._libs.dtypes cimport (
@cython.wraparound(False)
@cython.boundscheck(False)
def inner_join(const intp_t[:] left, const intp_t[:] right,
- Py_ssize_t max_groups):
+ Py_ssize_t max_groups, bint sort=True):
cdef:
Py_ssize_t i, j, k, count = 0
intp_t[::1] left_sorter, right_sorter
@@ -70,7 +69,20 @@ def inner_join(const intp_t[:] left, const intp_t[:] right,
_get_result_indexer(left_sorter, left_indexer)
_get_result_indexer(right_sorter, right_indexer)
- return np.asarray(left_indexer), np.asarray(right_indexer)
+ if not sort:
+ # if not asked to sort, revert to original order
+ if len(left) == len(left_indexer):
+ # no multiple matches for any row on the left
+ # this is a short-cut to avoid groupsort_indexer
+ # otherwise, the `else` path also works in this case
+ rev = np.empty(len(left), dtype=np.intp)
+ rev.put(np.asarray(left_sorter), np.arange(len(left)))
+ else:
+ rev, _ = groupsort_indexer(left_indexer, len(left))
+
+ return np.asarray(left_indexer).take(rev), np.asarray(right_indexer).take(rev)
+ else:
+ return np.asarray(left_indexer), np.asarray(right_indexer)
@cython.wraparound(False)
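With sort=False the join output, which is produced in left-sorted order, is permuted back to the caller's original row order: when no left row has multiple matches, inverting left_sorter is enough (the shortcut above), otherwise groupsort_indexer recovers a stable order by original left label. A small NumPy illustration of the shortcut path:

    import numpy as np

    left_sorter = np.array([2, 0, 1])     # order that sorted the left keys
    left_indexer = np.array([2, 0, 1])    # join output rows -> original left positions
    rev = np.empty(len(left_sorter), dtype=np.intp)
    rev.put(left_sorter, np.arange(len(left_sorter)))   # invert the permutation
    print(left_indexer.take(rev))         # [0 1 2]: rows back in original left order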
@@ -666,23 +678,13 @@ def outer_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t]
# asof_join_by
# ----------------------------------------------------------------------
-from pandas._libs.hashtable cimport (
- HashTable,
- Int64HashTable,
- PyObjectHashTable,
- UInt64HashTable,
-)
-
-ctypedef fused by_t:
- object
- int64_t
- uint64_t
+from pandas._libs.hashtable cimport Int64HashTable
def asof_join_backward_on_X_by_Y(ndarray[numeric_t] left_values,
ndarray[numeric_t] right_values,
- ndarray[by_t] left_by_values,
- ndarray[by_t] right_by_values,
+ const int64_t[:] left_by_values,
+ const int64_t[:] right_by_values,
bint allow_exact_matches=True,
tolerance=None,
bint use_hashtable=True):
@@ -693,8 +695,7 @@ def asof_join_backward_on_X_by_Y(ndarray[numeric_t] left_values,
bint has_tolerance = False
numeric_t tolerance_ = 0
numeric_t diff = 0
- HashTable hash_table
- by_t by_value
+ Int64HashTable hash_table
# if we are using tolerance, set our objects
if tolerance is not None:
@@ -708,12 +709,7 @@ def asof_join_backward_on_X_by_Y(ndarray[numeric_t] left_values,
right_indexer = np.empty(left_size, dtype=np.intp)
if use_hashtable:
- if by_t is object:
- hash_table = PyObjectHashTable(right_size)
- elif by_t is int64_t:
- hash_table = Int64HashTable(right_size)
- elif by_t is uint64_t:
- hash_table = UInt64HashTable(right_size)
+ hash_table = Int64HashTable(right_size)
right_pos = 0
for left_pos in range(left_size):
@@ -758,8 +754,8 @@ def asof_join_backward_on_X_by_Y(ndarray[numeric_t] left_values,
def asof_join_forward_on_X_by_Y(ndarray[numeric_t] left_values,
ndarray[numeric_t] right_values,
- ndarray[by_t] left_by_values,
- ndarray[by_t] right_by_values,
+ const int64_t[:] left_by_values,
+ const int64_t[:] right_by_values,
bint allow_exact_matches=1,
tolerance=None,
bint use_hashtable=True):
@@ -770,8 +766,7 @@ def asof_join_forward_on_X_by_Y(ndarray[numeric_t] left_values,
bint has_tolerance = False
numeric_t tolerance_ = 0
numeric_t diff = 0
- HashTable hash_table
- by_t by_value
+ Int64HashTable hash_table
# if we are using tolerance, set our objects
if tolerance is not None:
@@ -785,12 +780,7 @@ def asof_join_forward_on_X_by_Y(ndarray[numeric_t] left_values,
right_indexer = np.empty(left_size, dtype=np.intp)
if use_hashtable:
- if by_t is object:
- hash_table = PyObjectHashTable(right_size)
- elif by_t is int64_t:
- hash_table = Int64HashTable(right_size)
- elif by_t is uint64_t:
- hash_table = UInt64HashTable(right_size)
+ hash_table = Int64HashTable(right_size)
right_pos = right_size - 1
for left_pos in range(left_size - 1, -1, -1):
@@ -836,8 +826,8 @@ def asof_join_forward_on_X_by_Y(ndarray[numeric_t] left_values,
def asof_join_nearest_on_X_by_Y(ndarray[numeric_t] left_values,
ndarray[numeric_t] right_values,
- ndarray[by_t] left_by_values,
- ndarray[by_t] right_by_values,
+ const int64_t[:] left_by_values,
+ const int64_t[:] right_by_values,
bint allow_exact_matches=True,
tolerance=None,
bint use_hashtable=True):
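With the by_t fused type removed, these helpers now expect pre-factorized int64 codes for the by-values; presumably the factorization happens in pandas.core.reshape.merge before they are called (not shown in this hunk). User-facing merge_asof is unchanged, e.g.:

import pandas as pd

quotes = pd.DataFrame(
    {
        "time": pd.to_datetime(["2023-01-01 09:30:00.023", "2023-01-01 09:30:00.038"]),
        "ticker": ["MSFT", "MSFT"],
        "bid": [51.95, 51.97],
    }
)
trades = pd.DataFrame(
    {
        "time": pd.to_datetime(["2023-01-01 09:30:00.048"]),
        "ticker": ["MSFT"],
        "price": [51.98],
    }
)
# "ticker" values are reduced to int64 codes internally before reaching
# the asof_join_*_on_X_by_Y routines above.
pd.merge_asof(trades, quotes, on="time", by="ticker")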
diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
index 32641319a6b96..b9fd970e68f5b 100644
--- a/pandas/_libs/lib.pyi
+++ b/pandas/_libs/lib.pyi
@@ -45,25 +45,31 @@ def is_scalar(val: object) -> bool: ...
def is_list_like(obj: object, allow_sets: bool = ...) -> bool: ...
def is_pyarrow_array(obj: object) -> bool: ...
def is_period(val: object) -> TypeGuard[Period]: ...
-def is_interval(val: object) -> TypeGuard[Interval]: ...
-def is_decimal(val: object) -> TypeGuard[Decimal]: ...
-def is_complex(val: object) -> TypeGuard[complex]: ...
-def is_bool(val: object) -> TypeGuard[bool | np.bool_]: ...
-def is_integer(val: object) -> TypeGuard[int | np.integer]: ...
+def is_interval(obj: object) -> TypeGuard[Interval]: ...
+def is_decimal(obj: object) -> TypeGuard[Decimal]: ...
+def is_complex(obj: object) -> TypeGuard[complex]: ...
+def is_bool(obj: object) -> TypeGuard[bool | np.bool_]: ...
+def is_integer(obj: object) -> TypeGuard[int | np.integer]: ...
def is_int_or_none(obj) -> bool: ...
-def is_float(val: object) -> TypeGuard[float]: ...
+def is_float(obj: object) -> TypeGuard[float]: ...
def is_interval_array(values: np.ndarray) -> bool: ...
-def is_datetime64_array(values: np.ndarray) -> bool: ...
-def is_timedelta_or_timedelta64_array(values: np.ndarray) -> bool: ...
+def is_datetime64_array(values: np.ndarray, skipna: bool = True) -> bool: ...
+def is_timedelta_or_timedelta64_array(
+ values: np.ndarray, skipna: bool = True
+) -> bool: ...
def is_datetime_with_singletz_array(values: np.ndarray) -> bool: ...
def is_time_array(values: np.ndarray, skipna: bool = ...): ...
def is_date_array(values: np.ndarray, skipna: bool = ...): ...
def is_datetime_array(values: np.ndarray, skipna: bool = ...): ...
def is_string_array(values: np.ndarray, skipna: bool = ...): ...
-def is_float_array(values: np.ndarray, skipna: bool = ...): ...
+def is_float_array(values: np.ndarray): ...
def is_integer_array(values: np.ndarray, skipna: bool = ...): ...
def is_bool_array(values: np.ndarray, skipna: bool = ...): ...
-def fast_multiget(mapping: dict, keys: np.ndarray, default=...) -> np.ndarray: ...
+def fast_multiget(
+ mapping: dict,
+ keys: np.ndarray, # object[:]
+ default=...,
+) -> np.ndarray: ...
def fast_unique_multiple_list_gen(gen: Generator, sort: bool = ...) -> list: ...
def fast_unique_multiple_list(lists: list, sort: bool | None = ...) -> list: ...
def map_infer(
@@ -181,7 +187,7 @@ def count_level_2d(
max_bin: int,
) -> np.ndarray: ... # np.ndarray[np.int64, ndim=2]
def get_level_sorter(
- label: np.ndarray, # const int64_t[:]
+ codes: np.ndarray, # const int64_t[:]
starts: np.ndarray, # const intp_t[:]
) -> np.ndarray: ... # np.ndarray[np.intp, ndim=1]
def generate_bins_dt64(
@@ -195,6 +201,7 @@ def array_equivalent_object(
right: npt.NDArray[np.object_],
) -> bool: ...
def has_infs(arr: np.ndarray) -> bool: ... # const floating[:]
+def has_only_ints_or_nan(arr: np.ndarray) -> bool: ... # const floating[:]
def get_reverse_indexer(
indexer: np.ndarray, # const intp_t[:]
length: int,
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 55819ebd1f15e..c483f35513a40 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -25,7 +25,6 @@ from cpython.object cimport (
Py_EQ,
PyObject,
PyObject_RichCompareBool,
- PyTypeObject,
)
from cpython.ref cimport Py_INCREF
from cpython.sequence cimport PySequence_Check
@@ -66,34 +65,8 @@ from numpy cimport (
)
cnp.import_array()
+from pandas._libs.interval import Interval
-cdef extern from "Python.h":
- # Note: importing extern-style allows us to declare these as nogil
- # functions, whereas `from cpython cimport` does not.
- bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil
-
-cdef extern from "numpy/arrayobject.h":
- # cython's numpy.dtype specification is incorrect, which leads to
- # errors in issubclass(self.dtype.type, np.bool_), so we directly
- # include the correct version
- # https://github.com/cython/cython/issues/2022
-
- ctypedef class numpy.dtype [object PyArray_Descr]:
- # Use PyDataType_* macros when possible, however there are no macros
- # for accessing some of the fields, so some are defined. Please
- # ask on cython-dev if you need more.
- cdef:
- int type_num
- int itemsize "elsize"
- char byteorder
- object fields
- tuple names
-
- PyTypeObject PySignedIntegerArrType_Type
- PyTypeObject PyUnsignedIntegerArrType_Type
-
-cdef extern from "numpy/ndarrayobject.h":
- bint PyArray_CheckScalar(obj) nogil
cdef extern from "pandas/parser/pd_parser.h":
int floatify(object, float64_t *result, int *maybe_int) except -1
@@ -102,6 +75,7 @@ cdef extern from "pandas/parser/pd_parser.h":
PandasParser_IMPORT
from pandas._libs cimport util
+from pandas._libs.dtypes cimport uint8_int64_object_t
from pandas._libs.util cimport (
INT64_MAX,
INT64_MIN,
@@ -144,7 +118,7 @@ cdef:
object oINT64_MIN = INT64_MIN
object oUINT64_MAX = UINT64_MAX
- float64_t NaN = np.NaN
+ float64_t NaN = np.nan
# python-visible
i8max = INT64_MAX
@@ -255,7 +229,7 @@ def is_scalar(val: object) -> bool:
# Note: PyNumber_Check check includes Decimal, Fraction, numbers.Number
return (PyNumber_Check(val)
or is_period_object(val)
- or is_interval(val)
+ or isinstance(val, Interval)
or is_offset_object(val))
@@ -271,7 +245,7 @@ cdef int64_t get_itemsize(object val):
-------
is_ndarray : bool
"""
- if PyArray_CheckScalar(val):
+ if cnp.PyArray_CheckScalar(val):
return cnp.PyArray_DescrFromScalar(val).itemsize
else:
return -1
@@ -512,8 +486,7 @@ def get_reverse_indexer(const intp_t[:] indexer, Py_ssize_t length) -> ndarray:
@cython.wraparound(False)
@cython.boundscheck(False)
-# TODO(cython3): Can add const once cython#1772 is resolved
-def has_infs(floating[:] arr) -> bool:
+def has_infs(const floating[:] arr) -> bool:
cdef:
Py_ssize_t i, n = len(arr)
floating inf, neginf, val
@@ -530,6 +503,22 @@ def has_infs(floating[:] arr) -> bool:
return ret
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def has_only_ints_or_nan(floating[:] arr) -> bool:
+ cdef:
+ floating val
+ intp_t i
+
+ for i in range(len(arr)):
+ val = arr[i]
+        if (val != val) or (val == <int64_t>val):
+ continue
+ else:
+ return False
+ return True
+
+
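A short sketch of the new helper's contract (internal API, names as in this patch): a float array qualifies only if every element is NaN or integral.

import numpy as np
from pandas._libs import lib

lib.has_only_ints_or_nan(np.array([1.0, 2.0, np.nan]))  # True
lib.has_only_ints_or_nan(np.array([1.5, np.nan]))       # False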
def maybe_indices_to_slice(ndarray[intp_t, ndim=1] indices, int max_len):
cdef:
Py_ssize_t i, n = len(indices)
@@ -760,6 +749,7 @@ cpdef ndarray[object] ensure_string_array(
cdef:
Py_ssize_t i = 0, n = len(arr)
bint already_copied = True
+ ndarray[object] newarr
if hasattr(arr, "to_numpy"):
@@ -775,7 +765,8 @@ cpdef ndarray[object] ensure_string_array(
result = np.asarray(arr, dtype="object")
- if copy and result is arr:
+ if copy and (result is arr or np.shares_memory(arr, result)):
+ # GH#54654
result = result.copy()
elif not copy and result is arr:
already_copied = False
@@ -784,8 +775,30 @@ cpdef ndarray[object] ensure_string_array(
# short-circuit, all elements are str
return result
+ if arr.dtype.kind == "f": # non-optimized path
+ for i in range(n):
+ val = arr[i]
+
+ if not already_copied:
+ result = result.copy()
+ already_copied = True
+
+ if not checknull(val):
+ # f"{val}" is not always equivalent to str(val) for floats
+ result[i] = str(val)
+ else:
+ if convert_na_value:
+ val = na_value
+ if skipna:
+ result[i] = val
+ else:
+ result[i] = f"{val}"
+
+ return result
+
+ newarr = np.asarray(arr, dtype=object)
for i in range(n):
- val = arr[i]
+ val = newarr[i]
if isinstance(val, str):
continue
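Sketch of what the new float branch is after (internal API; the shown output is the expected result, not verified here): floats are stringified with str() so precision is not silently dropped, while missing values still follow the na_value/skipna handling.

import numpy as np
from pandas._libs.lib import ensure_string_array

ensure_string_array(np.array([3.14159265358979, np.nan]))
# array(['3.14159265358979', nan], dtype=object)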
@@ -824,10 +837,16 @@ def is_all_arraylike(obj: list) -> bool:
object val
bint all_arrays = True
+ from pandas.core.dtypes.generic import (
+ ABCIndex,
+ ABCMultiIndex,
+ ABCSeries,
+ )
+
for i in range(n):
val = obj[i]
- if not (isinstance(val, list) or
- util.is_array(val) or hasattr(val, "_data")):
+ if (not (isinstance(val, (list, ABCSeries, ABCIndex)) or util.is_array(val))
+ or isinstance(val, ABCMultiIndex)):
# TODO: EA?
# exclude tuples, frozensets as they may be contained in an Index
all_arrays = False
@@ -1143,6 +1162,17 @@ cpdef bint is_decimal(object obj):
cpdef bint is_interval(object obj):
+ import warnings
+
+ from pandas.util._exceptions import find_stack_level
+
+ warnings.warn(
+ # GH#55264
+ "is_interval is deprecated and will be removed in a future version. "
+ "Use isinstance(obj, pd.Interval) instead.",
+ FutureWarning,
+ stacklevel=find_stack_level(),
+ )
return getattr(obj, "_typ", "_typ") == "interval"
@@ -1154,6 +1184,17 @@ def is_period(val: object) -> bool:
-------
bool
"""
+ import warnings
+
+ from pandas.util._exceptions import find_stack_level
+
+ warnings.warn(
+ # GH#55264
+ "is_period is deprecated and will be removed in a future version. "
+ "Use isinstance(obj, pd.Period) instead.",
+ FutureWarning,
+ stacklevel=find_stack_level(),
+ )
return is_period_object(val)
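For reference, the replacements suggested by the two deprecation messages:

import pandas as pd

isinstance(pd.Interval(0, 1), pd.Interval)              # instead of lib.is_interval(...)
isinstance(pd.Period("2023-01", freq="M"), pd.Period)   # instead of lib.is_period(...)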
@@ -1393,14 +1434,12 @@ cdef class Seen:
self.sint_ = (
self.sint_
or (oINT64_MIN <= val < 0)
- # Cython equivalent of `isinstance(val, np.signedinteger)`
- or PyObject_TypeCheck(val, &PySignedIntegerArrType_Type)
+ or isinstance(val, cnp.signedinteger)
)
self.uint_ = (
self.uint_
or (oINT64_MAX < val <= oUINT64_MAX)
- # Cython equivalent of `isinstance(val, np.unsignedinteger)`
- or PyObject_TypeCheck(val, &PyUnsignedIntegerArrType_Type)
+ or isinstance(val, cnp.unsignedinteger)
)
@property
@@ -1610,7 +1649,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
if seen_val is False and skipna:
return "empty"
- if util.is_datetime64_object(val):
+ if cnp.is_datetime64_object(val):
if is_datetime64_array(values, skipna=skipna):
return "datetime64"
@@ -1678,7 +1717,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
if is_period_array(values, skipna=skipna):
return "period"
- elif is_interval(val):
+ elif isinstance(val, Interval):
if is_interval_array(values):
return "interval"
@@ -1694,7 +1733,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
cdef bint is_timedelta(object o):
- return PyDelta_Check(o) or util.is_timedelta64_object(o)
+ return PyDelta_Check(o) or cnp.is_timedelta64_object(o)
@cython.internal
@@ -1702,10 +1741,10 @@ cdef class Validator:
cdef:
Py_ssize_t n
- dtype dtype
+ cnp.dtype dtype
bint skipna
- def __cinit__(self, Py_ssize_t n, dtype dtype=np.dtype(np.object_),
+ def __cinit__(self, Py_ssize_t n, cnp.dtype dtype=np.dtype(np.object_),
bint skipna=False):
self.n = n
self.dtype = dtype
@@ -1786,7 +1825,7 @@ cdef class BoolValidator(Validator):
return util.is_bool_object(value)
cdef bint is_array_typed(self) except -1:
- return issubclass(self.dtype.type, np.bool_)
+ return cnp.PyDataType_ISBOOL(self.dtype)
cpdef bint is_bool_array(ndarray values, bint skipna=False):
@@ -1803,7 +1842,7 @@ cdef class IntegerValidator(Validator):
return util.is_integer_object(value)
cdef bint is_array_typed(self) except -1:
- return issubclass(self.dtype.type, np.integer)
+ return cnp.PyDataType_ISINTEGER(self.dtype)
# Note: only python-exposed for tests
@@ -1835,7 +1874,7 @@ cdef class IntegerFloatValidator(Validator):
return util.is_integer_object(value) or util.is_float_object(value)
cdef bint is_array_typed(self) except -1:
- return issubclass(self.dtype.type, np.integer)
+ return cnp.PyDataType_ISINTEGER(self.dtype)
cdef bint is_integer_float_array(ndarray values, bint skipna=True):
@@ -1852,7 +1891,7 @@ cdef class FloatValidator(Validator):
return util.is_float_object(value)
cdef bint is_array_typed(self) except -1:
- return issubclass(self.dtype.type, np.floating)
+ return cnp.PyDataType_ISFLOAT(self.dtype)
# Note: only python-exposed for tests
@@ -1871,7 +1910,7 @@ cdef class ComplexValidator(Validator):
)
cdef bint is_array_typed(self) except -1:
- return issubclass(self.dtype.type, np.complexfloating)
+ return cnp.PyDataType_ISCOMPLEX(self.dtype)
cdef bint is_complex_array(ndarray values):
@@ -1900,7 +1939,7 @@ cdef class StringValidator(Validator):
return isinstance(value, str)
cdef bint is_array_typed(self) except -1:
- return issubclass(self.dtype.type, np.str_)
+ return self.dtype.type_num == cnp.NPY_UNICODE
cpdef bint is_string_array(ndarray values, bint skipna=False):
@@ -1917,7 +1956,7 @@ cdef class BytesValidator(Validator):
return isinstance(value, bytes)
cdef bint is_array_typed(self) except -1:
- return issubclass(self.dtype.type, np.bytes_)
+ return self.dtype.type_num == cnp.NPY_STRING
cdef bint is_bytes_array(ndarray values, bint skipna=False):
@@ -1932,7 +1971,7 @@ cdef class TemporalValidator(Validator):
cdef:
bint all_generic_na
- def __cinit__(self, Py_ssize_t n, dtype dtype=np.dtype(np.object_),
+ def __cinit__(self, Py_ssize_t n, cnp.dtype dtype=np.dtype(np.object_),
bint skipna=False):
self.n = n
self.dtype = dtype
@@ -1981,7 +2020,7 @@ cpdef bint is_datetime_array(ndarray values, bint skipna=True):
@cython.internal
cdef class Datetime64Validator(DatetimeValidator):
cdef bint is_value_typed(self, object value) except -1:
- return util.is_datetime64_object(value)
+ return cnp.is_datetime64_object(value)
# Note: only python-exposed for tests
@@ -1995,7 +2034,7 @@ cpdef bint is_datetime64_array(ndarray values, bint skipna=True):
@cython.internal
cdef class AnyDatetimeValidator(DatetimeValidator):
cdef bint is_value_typed(self, object value) except -1:
- return util.is_datetime64_object(value) or (
+ return cnp.is_datetime64_object(value) or (
PyDateTime_Check(value) and value.tzinfo is None
)
@@ -2153,7 +2192,7 @@ cpdef bint is_interval_array(ndarray values):
for i in range(n):
val = values[i]
- if is_interval(val):
+ if isinstance(val, Interval):
if closed is None:
closed = val.closed
numeric = (
@@ -2479,6 +2518,7 @@ def maybe_convert_objects(ndarray[object] objects,
ndarray[int64_t] ints
ndarray[uint64_t] uints
ndarray[uint8_t] bools
+ ndarray[uint8_t] mask
Seen seen = Seen()
object val
_TSObject tsobj
@@ -2578,7 +2618,7 @@ def maybe_convert_objects(ndarray[object] objects,
seen.complex_ = True
if not convert_numeric:
break
- elif PyDateTime_Check(val) or util.is_datetime64_object(val):
+ elif PyDateTime_Check(val) or cnp.is_datetime64_object(val):
# if we have an tz's attached then return the objects
if convert_non_numeric:
@@ -2591,6 +2631,7 @@ def maybe_convert_objects(ndarray[object] objects,
tsobj = convert_to_tsobject(val, None, None, 0, 0)
tsobj.ensure_reso(NPY_FR_ns)
except OutOfBoundsDatetime:
+ # e.g. test_out_of_s_bounds_datetime64
seen.object_ = True
break
else:
@@ -2612,7 +2653,7 @@ def maybe_convert_objects(ndarray[object] objects,
except (ValueError, TypeError):
seen.object_ = True
break
- elif is_interval(val):
+ elif isinstance(val, Interval):
if convert_non_numeric:
seen.interval_ = True
break
@@ -2626,6 +2667,9 @@ def maybe_convert_objects(ndarray[object] objects,
else:
seen.object_ = True
break
+ elif val is C_NA:
+ seen.object_ = True
+ continue
else:
seen.object_ = True
break
@@ -2681,14 +2725,11 @@ def maybe_convert_objects(ndarray[object] objects,
seen.object_ = True
elif seen.str_:
- if is_string_array(objects, skipna=True):
- if using_pyarrow_string_dtype():
- import pyarrow as pa
+ if using_pyarrow_string_dtype() and is_string_array(objects, skipna=True):
+ from pandas.core.arrays.string_ import StringDtype
- from pandas.core.dtypes.dtypes import ArrowDtype
-
- dtype = ArrowDtype(pa.string())
- return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)
+ dtype = StringDtype(storage="pyarrow_numpy")
+ return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)
seen.object_ = True
elif seen.interval_:
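With this change, object arrays of strings are inferred as the NumPy-backed pyarrow string dtype rather than an ArrowDtype. A rough sketch of the user-visible effect, assuming pyarrow is installed and that the future.infer_string option is what enables using_pyarrow_string_dtype():

import pandas as pd

pd.set_option("future.infer_string", True)
pd.Series(["a", "b"]).dtype   # string[pyarrow_numpy], not an ArrowDtype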
@@ -2715,8 +2756,11 @@ def maybe_convert_objects(ndarray[object] objects,
res[:] = NPY_NAT
return res
elif dtype is not None:
- # EA, we don't expect to get here, but _could_ implement
- raise NotImplementedError(dtype)
+ # i.e. PeriodDtype, DatetimeTZDtype
+ cls = dtype.construct_array_type()
+ obj = cls._from_sequence([], dtype=dtype)
+            taker = -np.ones((<object>objects).shape, dtype=np.intp)
+ return obj.take(taker, allow_fill=True)
else:
# we don't guess
seen.object_ = True
@@ -2816,11 +2860,14 @@ no_default = _NoDefault.no_default # Sentinel indicating the default value.
NoDefault = Literal[_NoDefault.no_default]
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=True,
- object na_value=no_default, cnp.dtype dtype=np.dtype(object)
- ) -> np.ndarray:
+def map_infer_mask(
+ ndarray[object] arr,
+ object f,
+ const uint8_t[:] mask,
+ bint convert=True,
+ object na_value=no_default,
+ cnp.dtype dtype=np.dtype(object)
+) -> np.ndarray:
"""
Substitute for np.vectorize with pandas-friendly dtype inference.
@@ -2831,10 +2878,10 @@ def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=Tr
mask : ndarray
uint8 dtype ndarray indicating values not to apply `f` to.
convert : bool, default True
- Whether to call `maybe_convert_objects` on the resulting ndarray
+ Whether to call `maybe_convert_objects` on the resulting ndarray.
na_value : Any, optional
The result value to use for masked values. By default, the
- input value is used
+ input value is used.
dtype : numpy.dtype
The numpy dtype to use for the result ndarray.
@@ -2842,13 +2889,39 @@ def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=Tr
-------
np.ndarray
"""
+ cdef Py_ssize_t n = len(arr)
+ result = np.empty(n, dtype=dtype)
+
+ _map_infer_mask(
+ result,
+ arr,
+ f,
+ mask,
+ na_value,
+ )
+ if convert:
+ return maybe_convert_objects(result)
+ else:
+ return result
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def _map_infer_mask(
+ ndarray[uint8_int64_object_t] out,
+ ndarray[object] arr,
+ object f,
+ const uint8_t[:] mask,
+ object na_value=no_default,
+) -> None:
+ """
+ Helper for map_infer_mask, split off to use fused types based on the result.
+ """
cdef:
Py_ssize_t i, n
- ndarray result
object val
n = len(arr)
- result = np.empty(n, dtype=dtype)
for i in range(n):
if mask[i]:
if na_value is no_default:
@@ -2862,12 +2935,7 @@ def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=Tr
# unbox 0-dim arrays, GH#690
val = val.item()
- result[i] = val
-
- if convert:
- return maybe_convert_objects(result)
- else:
- return result
+ out[i] = val
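map_infer_mask now pre-allocates the result in the requested dtype and fills it through the fused-typed _map_infer_mask helper, so a caller can have e.g. int64 results written directly. A hedged sketch of a call (internal API):

import numpy as np
from pandas._libs import lib

arr = np.array(["a", None, "bc"], dtype=object)
mask = np.array([0, 1, 0], dtype=np.uint8)   # 1 = masked, f is not applied
lib.map_infer_mask(arr, len, mask, convert=False, na_value=-1, dtype=np.dtype("int64"))
# array([ 1, -1,  2])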
@cython.boundscheck(False)
@@ -3023,7 +3091,7 @@ def to_object_array_tuples(rows: object) -> np.ndarray:
@cython.wraparound(False)
@cython.boundscheck(False)
-def fast_multiget(dict mapping, ndarray keys, default=np.nan) -> np.ndarray:
+def fast_multiget(dict mapping, object[:] keys, default=np.nan) -> np.ndarray:
cdef:
Py_ssize_t i, n = len(keys)
object val
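fast_multiget now takes a typed object[:] memoryview, so the keys must be an object-dtype ndarray; a small sketch:

import numpy as np
from pandas._libs import lib

mapping = {"a": 1, "b": 2}
keys = np.array(["a", "x", "b"], dtype=object)
lib.fast_multiget(mapping, keys, default=np.nan)
# array([ 1., nan,  2.])  (result passed through maybe_convert_objects)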
diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build
index f302c649bc7bd..c27386743c6e9 100644
--- a/pandas/_libs/meson.build
+++ b/pandas/_libs/meson.build
@@ -61,15 +61,16 @@ subdir('tslibs')
libs_sources = {
# Dict of extension name -> dict of {sources, include_dirs, and deps}
# numpy include dir is implicitly included
- 'algos': {'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper, _khash_primitive_helper]},
+ 'algos': {'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper], 'deps': _khash_primitive_helper_dep},
'arrays': {'sources': ['arrays.pyx']},
'groupby': {'sources': ['groupby.pyx']},
'hashing': {'sources': ['hashing.pyx']},
- 'hashtable': {'sources': ['hashtable.pyx', _khash_primitive_helper, _hashtable_class_helper, _hashtable_func_helper]},
- 'index': {'sources': ['index.pyx', _index_class_helper]},
+ 'hashtable': {'sources': ['hashtable.pyx', _hashtable_class_helper, _hashtable_func_helper], 'deps': _khash_primitive_helper_dep},
+ 'index': {'sources': ['index.pyx', _index_class_helper], 'deps': _khash_primitive_helper_dep},
'indexing': {'sources': ['indexing.pyx']},
'internals': {'sources': ['internals.pyx']},
- 'interval': {'sources': ['interval.pyx', _intervaltree_helper]},
+ 'interval': {'sources': ['interval.pyx', _intervaltree_helper],
+ 'deps': _khash_primitive_helper_dep},
'join': {'sources': ['join.pyx', _khash_primitive_helper],
'deps': _khash_primitive_helper_dep},
'lib': {'sources': ['lib.pyx', 'src/parser/tokenizer.c']},
@@ -100,12 +101,20 @@ libs_sources = {
'writers': {'sources': ['writers.pyx']}
}
+cython_args = [
+ '--include-dir',
+ meson.current_build_dir(),
+ '-X always_allow_keywords=true'
+]
+if get_option('buildtype') == 'debug'
+ cython_args += ['--gdb']
+endif
foreach ext_name, ext_dict : libs_sources
py.extension_module(
ext_name,
ext_dict.get('sources'),
- cython_args: ['--include-dir', meson.current_build_dir(), '-X always_allow_keywords=true'],
+ cython_args: cython_args,
include_directories: [inc_np, inc_pd],
dependencies: ext_dict.get('deps', ''),
subdir: 'pandas/_libs',
@@ -113,8 +122,40 @@ foreach ext_name, ext_dict : libs_sources
)
endforeach
-py.install_sources('__init__.py',
- pure: false,
- subdir: 'pandas/_libs')
+# Basically just __init__.py and the .pyi files
+sources_to_install = [
+ '__init__.py',
+ 'algos.pyi',
+ 'arrays.pyi',
+ 'byteswap.pyi',
+ 'groupby.pyi',
+ 'hashing.pyi',
+ 'hashtable.pyi',
+ 'index.pyi',
+ 'indexing.pyi',
+ 'internals.pyi',
+ 'interval.pyi',
+ 'join.pyi',
+ 'json.pyi',
+ 'lib.pyi',
+ 'missing.pyi',
+ 'ops.pyi',
+ 'ops_dispatch.pyi',
+ 'parsers.pyi',
+ 'properties.pyi',
+ 'reshape.pyi',
+ 'sas.pyi',
+ 'sparse.pyi',
+ 'testing.pyi',
+ 'tslib.pyi',
+ 'writers.pyi'
+]
+
+foreach source: sources_to_install
+ py.install_sources(
+ source,
+ subdir: 'pandas/_libs'
+ )
+endforeach
subdir('window')
diff --git a/pandas/_libs/missing.pyi b/pandas/_libs/missing.pyi
index d5c9f1342a089..282dcee3ed6cf 100644
--- a/pandas/_libs/missing.pyi
+++ b/pandas/_libs/missing.pyi
@@ -14,4 +14,3 @@ def isneginf_scalar(val: object) -> bool: ...
def checknull(val: object, inf_as_na: bool = ...) -> bool: ...
def isnaobj(arr: np.ndarray, inf_as_na: bool = ...) -> npt.NDArray[np.bool_]: ...
def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ...
-def is_float_nan(values: np.ndarray) -> npt.NDArray[np.bool_]: ...
diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx
index e3e7d8daa03e1..0fd1c2b21d5cd 100644
--- a/pandas/_libs/missing.pyx
+++ b/pandas/_libs/missing.pyx
@@ -32,8 +32,6 @@ from pandas._libs.tslibs.nattype cimport (
)
from pandas._libs.tslibs.np_datetime cimport (
get_datetime64_unit,
- get_datetime64_value,
- get_timedelta64_value,
import_pandas_datetime,
)
@@ -120,18 +118,18 @@ cpdef bint is_matching_na(object left, object right, bint nan_matches_none=False
and util.is_complex_object(right)
and util.is_nan(right)
)
- elif util.is_datetime64_object(left):
+ elif cnp.is_datetime64_object(left):
return (
- get_datetime64_value(left) == NPY_NAT
- and util.is_datetime64_object(right)
- and get_datetime64_value(right) == NPY_NAT
+ cnp.get_datetime64_value(left) == NPY_NAT
+ and cnp.is_datetime64_object(right)
+ and cnp.get_datetime64_value(right) == NPY_NAT
and get_datetime64_unit(left) == get_datetime64_unit(right)
)
- elif util.is_timedelta64_object(left):
+ elif cnp.is_timedelta64_object(left):
return (
- get_timedelta64_value(left) == NPY_NAT
- and util.is_timedelta64_object(right)
- and get_timedelta64_value(right) == NPY_NAT
+ cnp.get_timedelta64_value(left) == NPY_NAT
+ and cnp.is_timedelta64_object(right)
+ and cnp.get_timedelta64_value(right) == NPY_NAT
and get_datetime64_unit(left) == get_datetime64_unit(right)
)
elif is_decimal_na(left):
@@ -169,10 +167,10 @@ cpdef bint checknull(object val, bint inf_as_na=False):
elif inf_as_na:
return val == INF or val == NEGINF
return False
- elif util.is_timedelta64_object(val):
- return get_timedelta64_value(val) == NPY_NAT
- elif util.is_datetime64_object(val):
- return get_datetime64_value(val) == NPY_NAT
+ elif cnp.is_timedelta64_object(val):
+ return cnp.get_timedelta64_value(val) == NPY_NAT
+ elif cnp.is_datetime64_object(val):
+ return cnp.get_datetime64_value(val) == NPY_NAT
else:
return is_decimal_na(val)
@@ -255,31 +253,6 @@ cdef bint checknull_with_nat_and_na(object obj):
return checknull_with_nat(obj) or obj is C_NA
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def is_float_nan(values: ndarray) -> ndarray:
- """
- True for elements which correspond to a float nan
-
- Returns
- -------
- ndarray[bool]
- """
- cdef:
- ndarray[uint8_t] result
- Py_ssize_t i, N
- object val
-
- N = len(values)
- result = np.zeros(N, dtype=np.uint8)
-
- for i in range(N):
- val = values[i]
- if util.is_nan(val):
- result[i] = True
- return result.view(bool)
-
-
@cython.wraparound(False)
@cython.boundscheck(False)
def is_numeric_na(values: ndarray) -> ndarray:
diff --git a/pandas/_libs/ops.pyi b/pandas/_libs/ops.pyi
index 515f7aa53ba15..6738a1dff4a9e 100644
--- a/pandas/_libs/ops.pyi
+++ b/pandas/_libs/ops.pyi
@@ -37,8 +37,8 @@ def vec_binop(
@overload
def maybe_convert_bool(
arr: npt.NDArray[np.object_],
- true_values: Iterable = ...,
- false_values: Iterable = ...,
+ true_values: Iterable | None = None,
+ false_values: Iterable | None = None,
convert_to_masked_nullable: Literal[False] = ...,
) -> tuple[np.ndarray, None]: ...
@overload
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 6d66e21ce49f5..82e9812094af2 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -6,7 +6,6 @@ from csv import (
QUOTE_NONE,
QUOTE_NONNUMERIC,
)
-import sys
import time
import warnings
@@ -35,6 +34,7 @@ from cpython.unicode cimport (
PyUnicode_AsUTF8String,
PyUnicode_Decode,
PyUnicode_DecodeUTF8,
+ PyUnicode_FromString,
)
from cython cimport Py_ssize_t
from libc.stdlib cimport free
@@ -45,11 +45,6 @@ from libc.string cimport (
)
-cdef extern from "Python.h":
- # TODO(cython3): get this from cpython.unicode
- object PyUnicode_FromString(char *v)
-
-
import numpy as np
cimport numpy as cnp
@@ -157,9 +152,9 @@ cdef extern from "pandas/parser/tokenizer.h":
WARN,
SKIP
- ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
+ ctypedef char* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
int *status, const char *encoding_errors)
- ctypedef int (*io_cleanup)(void *src)
+ ctypedef void (*io_cleanup)(void *src)
ctypedef struct parser_t:
void *source
@@ -229,7 +224,7 @@ cdef extern from "pandas/parser/tokenizer.h":
# pick one, depending on whether the converter requires GIL
double (*double_converter)(const char *, char **,
char, char, char,
- int, int *, int *) nogil
+ int, int *, int *) noexcept nogil
# error handling
char *warn_msg
@@ -252,9 +247,9 @@ cdef extern from "pandas/parser/tokenizer.h":
cdef extern from "pandas/parser/pd_parser.h":
void *new_rd_source(object obj) except NULL
- int del_rd_source(void *src)
+ void del_rd_source(void *src)
- void* buffer_rd_bytes(void *source, size_t nbytes,
+ char* buffer_rd_bytes(void *source, size_t nbytes,
size_t *bytes_read, int *status, const char *encoding_errors)
void uint_state_init(uint_state *self)
@@ -271,7 +266,7 @@ cdef extern from "pandas/parser/pd_parser.h":
void parser_del(parser_t *self) nogil
int parser_add_skiprow(parser_t *self, int64_t row)
- int parser_set_skipfirstnrows(parser_t *self, int64_t nrows)
+ void parser_set_skipfirstnrows(parser_t *self, int64_t nrows)
void parser_set_default_options(parser_t *self)
@@ -323,13 +318,13 @@ cdef double round_trip_wrapper(const char *p, char **q, char decimal,
return round_trip(p, q, decimal, sci, tsep, skip_trailing, error, maybe_int)
-cdef void* buffer_rd_bytes_wrapper(void *source, size_t nbytes,
+cdef char* buffer_rd_bytes_wrapper(void *source, size_t nbytes,
size_t *bytes_read, int *status,
const char *encoding_errors) noexcept:
return buffer_rd_bytes(source, nbytes, bytes_read, status, encoding_errors)
-cdef int del_rd_source_wrapper(void *src) noexcept:
- return del_rd_source(src)
+cdef void del_rd_source_wrapper(void *src) noexcept:
+ del_rd_source(src)
cdef class TextReader:
@@ -880,9 +875,15 @@ cdef class TextReader:
cdef _check_tokenize_status(self, int status):
if self.parser.warn_msg != NULL:
- print(PyUnicode_DecodeUTF8(
- self.parser.warn_msg, strlen(self.parser.warn_msg),
- self.encoding_errors), file=sys.stderr)
+ warnings.warn(
+ PyUnicode_DecodeUTF8(
+ self.parser.warn_msg,
+ strlen(self.parser.warn_msg),
+ self.encoding_errors
+ ),
+ ParserWarning,
+ stacklevel=find_stack_level()
+ )
free(self.parser.warn_msg)
self.parser.warn_msg = NULL
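The tokenizer's warning message now goes through warnings.warn as a ParserWarning instead of being printed to stderr, so it can be caught or filtered with the standard warnings machinery; a sketch:

import io
import warnings

import pandas as pd
from pandas.errors import ParserWarning

data = io.StringIO("a,b,c\n1,2,3\n4,5,6,7\n8,9,10\n")
with warnings.catch_warnings(record=True) as w:
    warnings.simplefilter("always")
    pd.read_csv(data, on_bad_lines="warn")
assert any(issubclass(x.category, ParserWarning) for x in w)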
@@ -988,7 +989,7 @@ cdef class TextReader:
missing_usecols = [col for col in self.usecols if col >= num_cols]
if missing_usecols:
raise ParserError(
- "Defining usecols without of bounds indices is not allowed. "
+ "Defining usecols with out-of-bounds indices is not allowed. "
f"{missing_usecols} are out of bounds.",
)
@@ -1470,13 +1471,15 @@ def _maybe_upcast(
elif arr.dtype == np.object_:
if use_dtype_backend:
- arr = StringDtype().construct_array_type()._from_sequence(arr)
+ dtype = StringDtype()
+ cls = dtype.construct_array_type()
+ arr = cls._from_sequence(arr, dtype=dtype)
if use_dtype_backend and dtype_backend == "pyarrow":
import pyarrow as pa
if isinstance(arr, IntegerArray) and arr.isna().all():
# use null instead of int64 in pyarrow
- arr = arr.to_numpy()
+ arr = arr.to_numpy(na_value=None)
arr = ArrowExtensionArray(pa.array(arr, from_pandas=True))
return arr
@@ -1600,7 +1603,7 @@ cdef _categorical_convert(parser_t *parser, int64_t col,
# -> ndarray[f'|S{width}']
cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start,
- int64_t line_end, int64_t width):
+ int64_t line_end, int64_t width) noexcept:
cdef:
char *data
ndarray result
@@ -1616,7 +1619,7 @@ cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start,
cdef void _to_fw_string_nogil(parser_t *parser, int64_t col,
int64_t line_start, int64_t line_end,
- size_t width, char *data) nogil:
+ size_t width, char *data) noexcept nogil:
cdef:
int64_t i
coliter_t it
@@ -1672,7 +1675,7 @@ cdef _try_double(parser_t *parser, int64_t col,
cdef int _try_double_nogil(parser_t *parser,
float64_t (*double_converter)(
const char *, char **, char,
- char, char, int, int *, int *) nogil,
+ char, char, int, int *, int *) noexcept nogil,
int64_t col, int64_t line_start, int64_t line_end,
bint na_filter, kh_str_starts_t *na_hashset,
bint use_na_flist,
diff --git a/pandas/_libs/sas.pyx b/pandas/_libs/sas.pyx
index cf2e0e4d80bb5..9e1af2cb9c3e7 100644
--- a/pandas/_libs/sas.pyx
+++ b/pandas/_libs/sas.pyx
@@ -62,6 +62,7 @@ cdef buf_free(Buffer buf):
# algorithm. It is partially documented here:
#
# https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf
+# Licence at LICENSES/SAS7BDAT_LICENSE
cdef int rle_decompress(Buffer inbuff, Buffer outbuff) except? 0:
cdef:
diff --git a/pandas/_libs/sparse.pyi b/pandas/_libs/sparse.pyi
index 9e5cecc61e5ca..536265b25425e 100644
--- a/pandas/_libs/sparse.pyi
+++ b/pandas/_libs/sparse.pyi
@@ -39,6 +39,10 @@ class BlockIndex(SparseIndex):
self, length: int, blocs: np.ndarray, blengths: np.ndarray
) -> None: ...
+ # Override to have correct parameters
+ def intersect(self, other: SparseIndex) -> Self: ...
+ def make_union(self, y: SparseIndex) -> Self: ...
+
def make_mask_object_ndarray(
arr: npt.NDArray[np.object_], fill_value
) -> npt.NDArray[np.bool_]: ...
diff --git a/pandas/_libs/src/datetime/date_conversions.c b/pandas/_libs/src/datetime/date_conversions.c
index 3bc3275be1cfe..99081746b2c97 100644
--- a/pandas/_libs/src/datetime/date_conversions.c
+++ b/pandas/_libs/src/datetime/date_conversions.c
@@ -20,84 +20,77 @@ The full license is in the LICENSE file, distributed with this software.
*
* Mutates the provided value directly. Returns 0 on success, non-zero on error.
*/
-int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit) {
- switch (unit) {
- case NPY_FR_ns:
- break;
- case NPY_FR_us:
- *value /= 1000LL;
- break;
- case NPY_FR_ms:
- *value /= 1000000LL;
- break;
- case NPY_FR_s:
- *value /= 1000000000LL;
- break;
- default:
- return -1;
- }
+int scaleNanosecToUnit(int64_t *value, NPY_DATETIMEUNIT unit) {
+ switch (unit) {
+ case NPY_FR_ns:
+ break;
+ case NPY_FR_us:
+ *value /= 1000LL;
+ break;
+ case NPY_FR_ms:
+ *value /= 1000000LL;
+ break;
+ case NPY_FR_s:
+ *value /= 1000000000LL;
+ break;
+ default:
+ return -1;
+ }
- return 0;
+ return 0;
}
/* Converts the int64_t representation of a datetime to ISO; mutates len */
-char *int64ToIso(int64_t value,
- NPY_DATETIMEUNIT valueUnit,
- NPY_DATETIMEUNIT base,
- size_t *len) {
- npy_datetimestruct dts;
- int ret_code;
+char *int64ToIso(int64_t value, NPY_DATETIMEUNIT valueUnit,
+ NPY_DATETIMEUNIT base, size_t *len) {
+ npy_datetimestruct dts;
+ int ret_code;
- pandas_datetime_to_datetimestruct(value, valueUnit, &dts);
+ pandas_datetime_to_datetimestruct(value, valueUnit, &dts);
- *len = (size_t)get_datetime_iso_8601_strlen(0, base);
- char *result = PyObject_Malloc(*len);
+ *len = (size_t)get_datetime_iso_8601_strlen(0, base);
+ char *result = PyObject_Malloc(*len);
- if (result == NULL) {
- PyErr_NoMemory();
- return NULL;
- }
- // datetime64 is always naive
- ret_code = make_iso_8601_datetime(&dts, result, *len, 0, base);
- if (ret_code != 0) {
- PyErr_SetString(PyExc_ValueError,
- "Could not convert datetime value to string");
- PyObject_Free(result);
- }
+ if (result == NULL) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+ // datetime64 is always naive
+ ret_code = make_iso_8601_datetime(&dts, result, *len, 0, base);
+ if (ret_code != 0) {
+ PyErr_SetString(PyExc_ValueError,
+ "Could not convert datetime value to string");
+ PyObject_Free(result);
+ }
- // Note that get_datetime_iso_8601_strlen just gives a generic size
- // for ISO string conversion, not the actual size used
- *len = strlen(result);
- return result;
-}
-
-npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base) {
- scaleNanosecToUnit(&dt, base);
- return dt;
+ // Note that get_datetime_iso_8601_strlen just gives a generic size
+ // for ISO string conversion, not the actual size used
+ *len = strlen(result);
+ return result;
}
/* Converts the int64_t representation of a duration to ISO; mutates len */
char *int64ToIsoDuration(int64_t value, size_t *len) {
- pandas_timedeltastruct tds;
- int ret_code;
+ pandas_timedeltastruct tds;
+ int ret_code;
- pandas_timedelta_to_timedeltastruct(value, NPY_FR_ns, &tds);
+ pandas_timedelta_to_timedeltastruct(value, NPY_FR_ns, &tds);
- // Max theoretical length of ISO Duration with 64 bit day
- // as the largest unit is 70 characters + 1 for a null terminator
- char *result = PyObject_Malloc(71);
- if (result == NULL) {
- PyErr_NoMemory();
- return NULL;
- }
+ // Max theoretical length of ISO Duration with 64 bit day
+ // as the largest unit is 70 characters + 1 for a null terminator
+ char *result = PyObject_Malloc(71);
+ if (result == NULL) {
+ PyErr_NoMemory();
+ return NULL;
+ }
- ret_code = make_iso_8601_timedelta(&tds, result, len);
- if (ret_code == -1) {
- PyErr_SetString(PyExc_ValueError,
- "Could not convert timedelta value to string");
- PyObject_Free(result);
- return NULL;
- }
+ ret_code = make_iso_8601_timedelta(&tds, result, len);
+ if (ret_code == -1) {
+ PyErr_SetString(PyExc_ValueError,
+ "Could not convert timedelta value to string");
+ PyObject_Free(result);
+ return NULL;
+ }
- return result;
+ return result;
}
diff --git a/pandas/_libs/src/datetime/pd_datetime.c b/pandas/_libs/src/datetime/pd_datetime.c
index fc2cbcab90174..19de51be6e1b2 100644
--- a/pandas/_libs/src/datetime/pd_datetime.c
+++ b/pandas/_libs/src/datetime/pd_datetime.c
@@ -21,7 +21,7 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt
#include "datetime.h"
#include "pandas/datetime/pd_datetime.h"
-
+#include "pandas/portable.h"
static void pandas_datetime_destructor(PyObject *op) {
void *ptr = PyCapsule_GetPointer(op, PandasDateTime_CAPSULE_NAME);
@@ -42,77 +42,77 @@ static void pandas_datetime_destructor(PyObject *op) {
* if obj doesn't have the needed date or datetime attributes.
*/
static int convert_pydatetime_to_datetimestruct(PyObject *dtobj,
- npy_datetimestruct *out) {
- // Assumes that obj is a valid datetime object
- PyObject *tmp;
- PyObject *obj = (PyObject*)dtobj;
-
- /* Initialize the output to all zeros */
- memset(out, 0, sizeof(npy_datetimestruct));
- out->month = 1;
- out->day = 1;
-
- out->year = PyLong_AsLong(PyObject_GetAttrString(obj, "year"));
- out->month = PyLong_AsLong(PyObject_GetAttrString(obj, "month"));
- out->day = PyLong_AsLong(PyObject_GetAttrString(obj, "day"));
-
- // TODO(anyone): If we can get PyDateTime_IMPORT to work, we could use
- // PyDateTime_Check here, and less verbose attribute lookups.
-
- /* Check for time attributes (if not there, return success as a date) */
- if (!PyObject_HasAttrString(obj, "hour") ||
- !PyObject_HasAttrString(obj, "minute") ||
- !PyObject_HasAttrString(obj, "second") ||
- !PyObject_HasAttrString(obj, "microsecond")) {
- return 0;
- }
+ npy_datetimestruct *out) {
+ // Assumes that obj is a valid datetime object
+ PyObject *tmp;
+ PyObject *obj = (PyObject *)dtobj;
+
+ /* Initialize the output to all zeros */
+ memset(out, 0, sizeof(npy_datetimestruct));
+ out->month = 1;
+ out->day = 1;
+
+ out->year = PyLong_AsLong(PyObject_GetAttrString(obj, "year"));
+ out->month = PyLong_AsLong(PyObject_GetAttrString(obj, "month"));
+ out->day = PyLong_AsLong(PyObject_GetAttrString(obj, "day"));
+
+ // TODO(anyone): If we can get PyDateTime_IMPORT to work, we could use
+ // PyDateTime_Check here, and less verbose attribute lookups.
+
+ /* Check for time attributes (if not there, return success as a date) */
+ if (!PyObject_HasAttrString(obj, "hour") ||
+ !PyObject_HasAttrString(obj, "minute") ||
+ !PyObject_HasAttrString(obj, "second") ||
+ !PyObject_HasAttrString(obj, "microsecond")) {
+ return 0;
+ }
+
+ out->hour = PyLong_AsLong(PyObject_GetAttrString(obj, "hour"));
+ out->min = PyLong_AsLong(PyObject_GetAttrString(obj, "minute"));
+ out->sec = PyLong_AsLong(PyObject_GetAttrString(obj, "second"));
+ out->us = PyLong_AsLong(PyObject_GetAttrString(obj, "microsecond"));
- out->hour = PyLong_AsLong(PyObject_GetAttrString(obj, "hour"));
- out->min = PyLong_AsLong(PyObject_GetAttrString(obj, "minute"));
- out->sec = PyLong_AsLong(PyObject_GetAttrString(obj, "second"));
- out->us = PyLong_AsLong(PyObject_GetAttrString(obj, "microsecond"));
-
- if (PyObject_HasAttrString(obj, "tzinfo")) {
- PyObject *offset = extract_utc_offset(obj);
- /* Apply the time zone offset if datetime obj is tz-aware */
- if (offset != NULL) {
- if (offset == Py_None) {
- Py_DECREF(offset);
- return 0;
- }
- PyObject *tmp_int;
- int seconds_offset, minutes_offset;
- /*
- * The timedelta should have a function "total_seconds"
- * which contains the value we want.
- */
- tmp = PyObject_CallMethod(offset, "total_seconds", "");
- Py_DECREF(offset);
- if (tmp == NULL) {
- return -1;
- }
- tmp_int = PyNumber_Long(tmp);
- if (tmp_int == NULL) {
- Py_DECREF(tmp);
- return -1;
- }
- seconds_offset = PyLong_AsLong(tmp_int);
- if (seconds_offset == -1 && PyErr_Occurred()) {
- Py_DECREF(tmp_int);
- Py_DECREF(tmp);
- return -1;
- }
- Py_DECREF(tmp_int);
- Py_DECREF(tmp);
-
- /* Convert to a minutes offset and apply it */
- minutes_offset = seconds_offset / 60;
-
- add_minutes_to_datetimestruct(out, -minutes_offset);
- }
+ if (PyObject_HasAttrString(obj, "tzinfo")) {
+ PyObject *offset = extract_utc_offset(obj);
+ /* Apply the time zone offset if datetime obj is tz-aware */
+ if (offset != NULL) {
+ if (offset == Py_None) {
+ Py_DECREF(offset);
+ return 0;
+ }
+ PyObject *tmp_int;
+ int seconds_offset, minutes_offset;
+ /*
+ * The timedelta should have a function "total_seconds"
+ * which contains the value we want.
+ */
+ tmp = PyObject_CallMethod(offset, "total_seconds", "");
+ Py_DECREF(offset);
+ if (tmp == NULL) {
+ return -1;
+ }
+ tmp_int = PyNumber_Long(tmp);
+ if (tmp_int == NULL) {
+ Py_DECREF(tmp);
+ return -1;
+ }
+ seconds_offset = PyLong_AsLong(tmp_int);
+ if (seconds_offset == -1 && PyErr_Occurred()) {
+ Py_DECREF(tmp_int);
+ Py_DECREF(tmp);
+ return -1;
+ }
+ Py_DECREF(tmp_int);
+ Py_DECREF(tmp);
+
+ /* Convert to a minutes offset and apply it */
+ minutes_offset = seconds_offset / 60;
+
+ add_minutes_to_datetimestruct(out, -minutes_offset);
}
+ }
- return 0;
+ return 0;
}
// Converts a Python object representing a Date / Datetime to ISO format
@@ -120,69 +120,76 @@ static int convert_pydatetime_to_datetimestruct(PyObject *dtobj,
// while base="ns" yields "2020-01-01T00:00:00.000000000Z"
// len is mutated to save the length of the returned string
static char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base,
- size_t *len) {
- npy_datetimestruct dts;
- int ret;
-
- ret = convert_pydatetime_to_datetimestruct(obj, &dts);
- if (ret != 0) {
- if (!PyErr_Occurred()) {
- PyErr_SetString(PyExc_ValueError,
- "Could not convert PyDateTime to numpy datetime");
- }
- return NULL;
+ size_t *len) {
+ npy_datetimestruct dts;
+ int ret;
+
+ ret = convert_pydatetime_to_datetimestruct(obj, &dts);
+ if (ret != 0) {
+ if (!PyErr_Occurred()) {
+ PyErr_SetString(PyExc_ValueError,
+ "Could not convert PyDateTime to numpy datetime");
}
+ return NULL;
+ }
- *len = (size_t)get_datetime_iso_8601_strlen(0, base);
- char *result = PyObject_Malloc(*len);
- // Check to see if PyDateTime has a timezone.
- // Don't convert to UTC if it doesn't.
- int is_tz_aware = 0;
- if (PyObject_HasAttrString(obj, "tzinfo")) {
- PyObject *offset = extract_utc_offset(obj);
- if (offset == NULL) {
- PyObject_Free(result);
- return NULL;
- }
- is_tz_aware = offset != Py_None;
- Py_DECREF(offset);
+ *len = (size_t)get_datetime_iso_8601_strlen(0, base);
+ char *result = PyObject_Malloc(*len);
+ // Check to see if PyDateTime has a timezone.
+ // Don't convert to UTC if it doesn't.
+ int is_tz_aware = 0;
+ if (PyObject_HasAttrString(obj, "tzinfo")) {
+ PyObject *offset = extract_utc_offset(obj);
+ if (offset == NULL) {
+ PyObject_Free(result);
+ return NULL;
}
- ret = make_iso_8601_datetime(&dts, result, *len, is_tz_aware, base);
+ is_tz_aware = offset != Py_None;
+ Py_DECREF(offset);
+ }
+ ret = make_iso_8601_datetime(&dts, result, *len, is_tz_aware, base);
- if (ret != 0) {
- PyErr_SetString(PyExc_ValueError,
- "Could not convert datetime value to string");
- PyObject_Free(result);
- return NULL;
- }
+ if (ret != 0) {
+ PyErr_SetString(PyExc_ValueError,
+ "Could not convert datetime value to string");
+ PyObject_Free(result);
+ return NULL;
+ }
- // Note that get_datetime_iso_8601_strlen just gives a generic size
- // for ISO string conversion, not the actual size used
- *len = strlen(result);
- return result;
+ // Note that get_datetime_iso_8601_strlen just gives a generic size
+ // for ISO string conversion, not the actual size used
+ *len = strlen(result);
+ return result;
}
// Convert a Python Date/Datetime to Unix epoch with resolution base
static npy_datetime PyDateTimeToEpoch(PyObject *dt, NPY_DATETIMEUNIT base) {
- npy_datetimestruct dts;
- int ret;
-
- ret = convert_pydatetime_to_datetimestruct(dt, &dts);
- if (ret != 0) {
- if (!PyErr_Occurred()) {
- PyErr_SetString(PyExc_ValueError,
- "Could not convert PyDateTime to numpy datetime");
- }
- // TODO(username): is setting errMsg required?
- // ((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
- // return NULL;
+ npy_datetimestruct dts;
+ int ret;
+
+ ret = convert_pydatetime_to_datetimestruct(dt, &dts);
+ if (ret != 0) {
+ if (!PyErr_Occurred()) {
+ PyErr_SetString(PyExc_ValueError,
+ "Could not convert PyDateTime to numpy datetime");
+
+ return -1;
}
+ }
+
+ int64_t npy_dt = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts);
+ if (scaleNanosecToUnit(&npy_dt, base) == -1) {
+ PyErr_Format(PyExc_ValueError,
+ "Call to scaleNanosecToUnit with value %" NPY_DATETIME_FMT
+ " and base %d failed",
+ npy_dt, base);
- npy_datetime npy_dt = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts);
- return NpyDateTimeToEpoch(npy_dt, base);
+ return -1;
+ }
+ return npy_dt;
}
-static int pandas_datetime_exec(PyObject *module) {
+static int pandas_datetime_exec(PyObject *Py_UNUSED(module)) {
PyDateTime_IMPORT;
PandasDateTime_CAPI *capi = PyMem_Malloc(sizeof(PandasDateTime_CAPI));
if (capi == NULL) {
@@ -192,7 +199,6 @@ static int pandas_datetime_exec(PyObject *module) {
capi->npy_datetimestruct_to_datetime = npy_datetimestruct_to_datetime;
capi->scaleNanosecToUnit = scaleNanosecToUnit;
capi->int64ToIso = int64ToIso;
- capi->NpyDateTimeToEpoch = NpyDateTimeToEpoch;
capi->PyDateTimeToIso = PyDateTimeToIso;
capi->PyDateTimeToEpoch = PyDateTimeToEpoch;
capi->int64ToIsoDuration = int64ToIsoDuration;
diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c
index e00c5c1e807a7..851901481d222 100644
--- a/pandas/_libs/src/parser/io.c
+++ b/pandas/_libs/src/parser/io.c
@@ -14,19 +14,19 @@ The full license is in the LICENSE file, distributed with this software.
*/
void *new_rd_source(PyObject *obj) {
- rd_source *rds = (rd_source *)malloc(sizeof(rd_source));
-
- if (rds == NULL) {
- PyErr_NoMemory();
- return NULL;
- }
- /* hold on to this object */
- Py_INCREF(obj);
- rds->obj = obj;
- rds->buffer = NULL;
- rds->position = 0;
-
- return (void *)rds;
+ rd_source *rds = (rd_source *)malloc(sizeof(rd_source));
+
+ if (rds == NULL) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+ /* hold on to this object */
+ Py_INCREF(obj);
+ rds->obj = obj;
+ rds->buffer = NULL;
+ rds->position = 0;
+
+ return (void *)rds;
}
/*
@@ -35,12 +35,10 @@ void *new_rd_source(PyObject *obj) {
*/
-int del_rd_source(void *rds) {
- Py_XDECREF(RDS(rds)->obj);
- Py_XDECREF(RDS(rds)->buffer);
- free(rds);
-
- return 0;
+void del_rd_source(void *rds) {
+ Py_XDECREF(RDS(rds)->obj);
+ Py_XDECREF(RDS(rds)->buffer);
+ free(rds);
}
/*
@@ -49,59 +47,53 @@ int del_rd_source(void *rds) {
*/
-void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
+char *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
int *status, const char *encoding_errors) {
- PyGILState_STATE state;
- PyObject *result, *func, *args, *tmp;
-
- void *retval;
-
- size_t length;
- rd_source *src = RDS(source);
- state = PyGILState_Ensure();
-
- /* delete old object */
- Py_XDECREF(src->buffer);
- src->buffer = NULL;
- args = Py_BuildValue("(i)", nbytes);
-
- func = PyObject_GetAttrString(src->obj, "read");
-
- /* Note: PyObject_CallObject requires the GIL */
- result = PyObject_CallObject(func, args);
- Py_XDECREF(args);
- Py_XDECREF(func);
-
- if (result == NULL) {
- PyGILState_Release(state);
- *bytes_read = 0;
- *status = CALLING_READ_FAILED;
- return NULL;
- } else if (!PyBytes_Check(result)) {
- tmp = PyUnicode_AsEncodedString(result, "utf-8", encoding_errors);
- Py_DECREF(result);
- if (tmp == NULL) {
- PyGILState_Release(state);
- return NULL;
- }
- result = tmp;
- }
+ rd_source *src = RDS(source);
+ PyGILState_STATE state = PyGILState_Ensure();
- length = PySequence_Length(result);
+ /* delete old object */
+ Py_XDECREF(src->buffer);
+ src->buffer = NULL;
+ PyObject *args = Py_BuildValue("(i)", nbytes);
- if (length == 0)
- *status = REACHED_EOF;
- else
- *status = 0;
+ PyObject *func = PyObject_GetAttrString(src->obj, "read");
- /* hang on to the Python object */
- src->buffer = result;
- retval = (void *)PyBytes_AsString(result);
+ /* Note: PyObject_CallObject requires the GIL */
+ PyObject *result = PyObject_CallObject(func, args);
+ Py_XDECREF(args);
+ Py_XDECREF(func);
+ if (result == NULL) {
PyGILState_Release(state);
+ *bytes_read = 0;
+ *status = CALLING_READ_FAILED;
+ return NULL;
+ } else if (!PyBytes_Check(result)) {
+ PyObject *tmp = PyUnicode_AsEncodedString(result, "utf-8", encoding_errors);
+ Py_DECREF(result);
+ if (tmp == NULL) {
+ PyGILState_Release(state);
+ return NULL;
+ }
+ result = tmp;
+ }
+
+ const size_t length = PySequence_Length(result);
+
+ if (length == 0)
+ *status = REACHED_EOF;
+ else
+ *status = 0;
+
+ /* hang on to the Python object */
+ src->buffer = result;
+ char *retval = PyBytes_AsString(result);
+
+ PyGILState_Release(state);
- /* TODO: more error handling */
- *bytes_read = length;
+ /* TODO: more error handling */
+ *bytes_read = length;
- return retval;
+ return retval;
}
diff --git a/pandas/_libs/src/parser/pd_parser.c b/pandas/_libs/src/parser/pd_parser.c
index c429f17c1cb8b..48f3cd14cbc30 100644
--- a/pandas/_libs/src/parser/pd_parser.c
+++ b/pandas/_libs/src/parser/pd_parser.c
@@ -10,9 +10,10 @@ Distributed under the terms of the BSD Simplified License.
#include "pandas/parser/pd_parser.h"
#include "pandas/parser/io.h"
+#include "pandas/portable.h"
static int to_double(char *item, double *p_value, char sci, char decimal,
- int *maybe_int) {
+ int *maybe_int) {
char *p_end = NULL;
int error = 0;
@@ -24,7 +25,6 @@ static int to_double(char *item, double *p_value, char sci, char decimal,
}
static int floatify(PyObject *str, double *result, int *maybe_int) {
- int status;
char *data;
PyObject *tmp = NULL;
const char sci = 'E';
@@ -43,7 +43,7 @@ static int floatify(PyObject *str, double *result, int *maybe_int) {
return -1;
}
- status = to_double(data, result, sci, dec, maybe_int);
+ const int status = to_double(data, result, sci, dec, maybe_int);
if (!status) {
/* handle inf/-inf infinity/-infinity */
@@ -95,13 +95,12 @@ static int floatify(PyObject *str, double *result, int *maybe_int) {
return -1;
}
-
static void pandas_parser_destructor(PyObject *op) {
void *ptr = PyCapsule_GetPointer(op, PandasParser_CAPSULE_NAME);
PyMem_Free(ptr);
}
-static int pandas_parser_exec(PyObject *module) {
+static int pandas_parser_exec(PyObject *Py_UNUSED(module)) {
PandasParser_CAPI *capi = PyMem_Malloc(sizeof(PandasParser_CAPI));
if (capi == NULL) {
PyErr_NoMemory();
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index abd3fb9e1fef3..0e4188bea4dc7 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -16,29 +16,31 @@ Python's built-in csv module and Warren Weckesser's textreader project on
GitHub. See Python Software Foundation License and BSD licenses for these.
*/
-
#include "pandas/parser/tokenizer.h"
+#include "pandas/portable.h"
#include
#include
#include
+#include
#include "pandas/portable.h"
+#include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64
void coliter_setup(coliter_t *self, parser_t *parser, int64_t i,
int64_t start) {
- // column i, starting at 0
- self->words = parser->words;
- self->col = i;
- self->line_start = parser->line_start + start;
+ // column i, starting at 0
+ self->words = parser->words;
+ self->col = i;
+ self->line_start = parser->line_start + start;
}
static void free_if_not_null(void **ptr) {
- TRACE(("free_if_not_null %p\n", *ptr))
- if (*ptr != NULL) {
- free(*ptr);
- *ptr = NULL;
- }
+ TRACE(("free_if_not_null %p\n", *ptr))
+ if (*ptr != NULL) {
+ free(*ptr);
+ *ptr = NULL;
+ }
}
/*
@@ -49,542 +51,498 @@ static void free_if_not_null(void **ptr) {
static void *grow_buffer(void *buffer, uint64_t length, uint64_t *capacity,
int64_t space, int64_t elsize, int *error) {
- uint64_t cap = *capacity;
- void *newbuffer = buffer;
-
- // Can we fit potentially nbytes tokens (+ null terminators) in the stream?
- while ((length + space >= cap) && (newbuffer != NULL)) {
- cap = cap ? cap << 1 : 2;
- buffer = newbuffer;
- newbuffer = realloc(newbuffer, elsize * cap);
- }
-
- if (newbuffer == NULL) {
- // realloc failed so don't change *capacity, set *error to errno
- // and return the last good realloc'd buffer so it can be freed
- *error = errno;
- newbuffer = buffer;
- } else {
- // realloc worked, update *capacity and set *error to 0
- // sigh, multiple return values
- *capacity = cap;
- *error = 0;
- }
- return newbuffer;
+ uint64_t cap = *capacity;
+ void *newbuffer = buffer;
+
+ // Can we fit potentially nbytes tokens (+ null terminators) in the stream?
+ while ((length + space >= cap) && (newbuffer != NULL)) {
+ cap = cap ? cap << 1 : 2;
+ buffer = newbuffer;
+ newbuffer = realloc(newbuffer, elsize * cap);
+ }
+
+ if (newbuffer == NULL) {
+ // realloc failed so don't change *capacity, set *error to errno
+ // and return the last good realloc'd buffer so it can be freed
+ *error = errno;
+ newbuffer = buffer;
+ } else {
+ // realloc worked, update *capacity and set *error to 0
+ // sigh, multiple return values
+ *capacity = cap;
+ *error = 0;
+ }
+ return newbuffer;
}
void parser_set_default_options(parser_t *self) {
- self->decimal = '.';
- self->sci = 'E';
+ self->decimal = '.';
+ self->sci = 'E';
- // For tokenization
- self->state = START_RECORD;
+ // For tokenization
+ self->state = START_RECORD;
- self->delimiter = ','; // XXX
- self->delim_whitespace = 0;
+ self->delimiter = ','; // XXX
+ self->delim_whitespace = 0;
- self->doublequote = 0;
- self->quotechar = '"';
- self->escapechar = 0;
+ self->doublequote = 0;
+ self->quotechar = '"';
+ self->escapechar = 0;
- self->lineterminator = '\0'; /* NUL->standard logic */
+ self->lineterminator = '\0'; /* NUL->standard logic */
- self->skipinitialspace = 0;
- self->quoting = QUOTE_MINIMAL;
- self->allow_embedded_newline = 1;
+ self->skipinitialspace = 0;
+ self->quoting = QUOTE_MINIMAL;
+ self->allow_embedded_newline = 1;
- self->expected_fields = -1;
- self->on_bad_lines = ERROR;
+ self->expected_fields = -1;
+ self->on_bad_lines = ERROR;
- self->commentchar = '#';
- self->thousands = '\0';
+ self->commentchar = '#';
+ self->thousands = '\0';
- self->skipset = NULL;
- self->skipfunc = NULL;
- self->skip_first_N_rows = -1;
- self->skip_footer = 0;
+ self->skipset = NULL;
+ self->skipfunc = NULL;
+ self->skip_first_N_rows = -1;
+ self->skip_footer = 0;
}
parser_t *parser_new(void) { return (parser_t *)calloc(1, sizeof(parser_t)); }
-int parser_clear_data_buffers(parser_t *self) {
- free_if_not_null((void *)&self->stream);
- free_if_not_null((void *)&self->words);
- free_if_not_null((void *)&self->word_starts);
- free_if_not_null((void *)&self->line_start);
- free_if_not_null((void *)&self->line_fields);
- return 0;
-}
+static void parser_cleanup(parser_t *self) {
+ // XXX where to put this
+ free_if_not_null((void *)&self->error_msg);
+ free_if_not_null((void *)&self->warn_msg);
-int parser_cleanup(parser_t *self) {
- int status = 0;
-
- // XXX where to put this
- free_if_not_null((void *)&self->error_msg);
- free_if_not_null((void *)&self->warn_msg);
-
- if (self->skipset != NULL) {
- kh_destroy_int64((kh_int64_t *)self->skipset);
- self->skipset = NULL;
- }
-
- if (parser_clear_data_buffers(self) < 0) {
- status = -1;
- }
-
- if (self->cb_cleanup != NULL) {
- if (self->cb_cleanup(self->source) < 0) {
- status = -1;
- }
- self->cb_cleanup = NULL;
- }
+ if (self->skipset != NULL) {
+ kh_destroy_int64((kh_int64_t *)self->skipset);
+ self->skipset = NULL;
+ }
- return status;
+ if (self->cb_cleanup != NULL) {
+ self->cb_cleanup(self->source);
+ self->cb_cleanup = NULL;
+ }
}
int parser_init(parser_t *self) {
- int64_t sz;
-
- /*
- Initialize data buffers
- */
-
- self->stream = NULL;
- self->words = NULL;
- self->word_starts = NULL;
- self->line_start = NULL;
- self->line_fields = NULL;
- self->error_msg = NULL;
- self->warn_msg = NULL;
-
- // token stream
- self->stream = malloc(STREAM_INIT_SIZE * sizeof(char));
- if (self->stream == NULL) {
- parser_cleanup(self);
- return PARSER_OUT_OF_MEMORY;
- }
- self->stream_cap = STREAM_INIT_SIZE;
- self->stream_len = 0;
-
- // word pointers and metadata
- sz = STREAM_INIT_SIZE / 10;
- sz = sz ? sz : 1;
- self->words = malloc(sz * sizeof(char *));
- self->word_starts = malloc(sz * sizeof(int64_t));
- self->max_words_cap = sz;
- self->words_cap = sz;
- self->words_len = 0;
-
- // line pointers and metadata
- self->line_start = malloc(sz * sizeof(int64_t));
-
- self->line_fields = malloc(sz * sizeof(int64_t));
-
- self->lines_cap = sz;
- self->lines = 0;
- self->file_lines = 0;
-
- if (self->stream == NULL || self->words == NULL ||
- self->word_starts == NULL || self->line_start == NULL ||
- self->line_fields == NULL) {
- parser_cleanup(self);
+ /*
+ Initialize data buffers
+ */
+
+ self->stream = NULL;
+ self->words = NULL;
+ self->word_starts = NULL;
+ self->line_start = NULL;
+ self->line_fields = NULL;
+ self->error_msg = NULL;
+ self->warn_msg = NULL;
+
+ // token stream
+ self->stream = malloc(STREAM_INIT_SIZE * sizeof(char));
+ if (self->stream == NULL) {
+ parser_cleanup(self);
+ return PARSER_OUT_OF_MEMORY;
+ }
+ self->stream_cap = STREAM_INIT_SIZE;
+ self->stream_len = 0;
+
+ // word pointers and metadata
+ _Static_assert(STREAM_INIT_SIZE / 10 > 0,
+ "STREAM_INIT_SIZE must be defined and >= 10");
+ const int64_t sz = STREAM_INIT_SIZE / 10;
+ self->words = malloc(sz * sizeof(char *));
+ self->word_starts = malloc(sz * sizeof(int64_t));
+ self->max_words_cap = sz;
+ self->words_cap = sz;
+ self->words_len = 0;
+
+ // line pointers and metadata
+ self->line_start = malloc(sz * sizeof(int64_t));
+
+ self->line_fields = malloc(sz * sizeof(int64_t));
+
+ self->lines_cap = sz;
+ self->lines = 0;
+ self->file_lines = 0;
+
+ if (self->stream == NULL || self->words == NULL ||
+ self->word_starts == NULL || self->line_start == NULL ||
+ self->line_fields == NULL) {
+ parser_cleanup(self);
- return PARSER_OUT_OF_MEMORY;
- }
+ return PARSER_OUT_OF_MEMORY;
+ }
- /* amount of bytes buffered */
- self->datalen = 0;
- self->datapos = 0;
+  /* number of bytes buffered */
+ self->datalen = 0;
+ self->datapos = 0;
- self->line_start[0] = 0;
- self->line_fields[0] = 0;
+ self->line_start[0] = 0;
+ self->line_fields[0] = 0;
- self->pword_start = self->stream;
- self->word_start = 0;
+ self->pword_start = self->stream;
+ self->word_start = 0;
- self->state = START_RECORD;
+ self->state = START_RECORD;
- self->error_msg = NULL;
- self->warn_msg = NULL;
+ self->error_msg = NULL;
+ self->warn_msg = NULL;
- self->commentchar = '\0';
+ self->commentchar = '\0';
- return 0;
+ return 0;
}
void parser_free(parser_t *self) {
- // opposite of parser_init
- parser_cleanup(self);
+ // opposite of parser_init
+ parser_cleanup(self);
}
-void parser_del(parser_t *self) {
- free(self);
-}
+void parser_del(parser_t *self) { free(self); }
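/*
 * Editorial sketch (illustrative, not part of this patch): the functions
 * above form the parser lifecycle as driven from the Cython wrapper. A
 * minimal usage outline, assuming the declarations in the accompanying
 * tokenizer header:
 *
 *   parser_t *parser = parser_new();      // calloc'd, every field zeroed
 *   parser_set_default_options(parser);   // defaults shown at the top of this hunk
 *   if (parser_init(parser) != 0) {       // allocates stream/word/line buffers
 *     parser_del(parser);                 // init already ran parser_cleanup on failure
 *     return -1;
 *   }
 *   // ... attach source/cb_io, tokenize, consume rows ...
 *   parser_free(parser);                  // opposite of parser_init
 *   parser_del(parser);                   // frees the struct itself
 */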
static int make_stream_space(parser_t *self, size_t nbytes) {
- uint64_t i, cap, length;
- int status;
- void *orig_ptr, *newptr;
-
- // Can we fit potentially nbytes tokens (+ null terminators) in the stream?
+ // Can we fit potentially nbytes tokens (+ null terminators) in the stream?
- /*
- TOKEN STREAM
- */
+ /*
+ TOKEN STREAM
+ */
- orig_ptr = (void *)self->stream;
- TRACE(
- ("\n\nmake_stream_space: nbytes = %zu. grow_buffer(self->stream...)\n",
+ int status;
+ char *orig_ptr = (void *)self->stream;
+ TRACE(("\n\nmake_stream_space: nbytes = %zu. grow_buffer(self->stream...)\n",
nbytes))
- self->stream = (char *)grow_buffer((void *)self->stream, self->stream_len,
- &self->stream_cap, nbytes * 2,
- sizeof(char), &status);
- TRACE(
- ("make_stream_space: self->stream=%p, self->stream_len = %zu, "
+ self->stream =
+ (char *)grow_buffer((void *)self->stream, self->stream_len,
+ &self->stream_cap, nbytes * 2, sizeof(char), &status);
+ TRACE(("make_stream_space: self->stream=%p, self->stream_len = %zu, "
"self->stream_cap=%zu, status=%zu\n",
self->stream, self->stream_len, self->stream_cap, status))
- if (status != 0) {
- return PARSER_OUT_OF_MEMORY;
- }
-
- // realloc sets errno when moving buffer?
- if (self->stream != orig_ptr) {
- self->pword_start = self->stream + self->word_start;
-
- for (i = 0; i < self->words_len; ++i) {
- self->words[i] = self->stream + self->word_starts[i];
- }
- }
-
- /*
- WORD VECTORS
- */
-
- cap = self->words_cap;
-
- /**
- * If we are reading in chunks, we need to be aware of the maximum number
- * of words we have seen in previous chunks (self->max_words_cap), so
- * that way, we can properly allocate when reading subsequent ones.
- *
- * Otherwise, we risk a buffer overflow if we mistakenly under-allocate
- * just because a recent chunk did not have as many words.
- */
- if (self->words_len + nbytes < self->max_words_cap) {
- length = self->max_words_cap - nbytes - 1;
- } else {
- length = self->words_len;
- }
-
- self->words =
- (char **)grow_buffer((void *)self->words, length,
- &self->words_cap, nbytes,
- sizeof(char *), &status);
- TRACE(
- ("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, "
+ if (status != 0) {
+ return PARSER_OUT_OF_MEMORY;
+ }
+
+ // realloc sets errno when moving buffer?
+ if (self->stream != orig_ptr) {
+ self->pword_start = self->stream + self->word_start;
+
+ for (uint64_t i = 0; i < self->words_len; ++i) {
+ self->words[i] = self->stream + self->word_starts[i];
+ }
+ }
+
+ /*
+ WORD VECTORS
+ */
+
+ const uint64_t words_cap = self->words_cap;
+
+ /**
+ * If we are reading in chunks, we need to be aware of the maximum number
+   * of words we have seen in previous chunks (self->max_words_cap), so
+   * that we can properly allocate when reading subsequent ones.
+ *
+ * Otherwise, we risk a buffer overflow if we mistakenly under-allocate
+ * just because a recent chunk did not have as many words.
+ */
+ const uint64_t length = self->words_len + nbytes < self->max_words_cap
+ ? self->max_words_cap - nbytes - 1
+ : self->words_len;
+
+ self->words =
+ (char **)grow_buffer((void *)self->words, length, &self->words_cap,
+ nbytes, sizeof(char *), &status);
+ TRACE(("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, "
"%d)\n",
self->words_len, self->words_cap, nbytes, status))
- if (status != 0) {
- return PARSER_OUT_OF_MEMORY;
- }
-
- // realloc took place
- if (cap != self->words_cap) {
- TRACE(
- ("make_stream_space: cap != self->words_cap, nbytes = %d, "
- "self->words_cap=%d\n",
- nbytes, self->words_cap))
- newptr = realloc((void *)self->word_starts,
- sizeof(int64_t) * self->words_cap);
- if (newptr == NULL) {
- return PARSER_OUT_OF_MEMORY;
- } else {
- self->word_starts = (int64_t *)newptr;
- }
- }
-
- /*
- LINE VECTORS
- */
- cap = self->lines_cap;
- self->line_start =
- (int64_t *)grow_buffer((void *)self->line_start, self->lines + 1,
- &self->lines_cap, nbytes,
- sizeof(int64_t), &status);
- TRACE((
- "make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n",
- self->lines + 1, self->lines_cap, nbytes, status))
- if (status != 0) {
- return PARSER_OUT_OF_MEMORY;
- }
-
- // realloc took place
- if (cap != self->lines_cap) {
- TRACE(("make_stream_space: cap != self->lines_cap, nbytes = %d\n",
- nbytes))
- newptr = realloc((void *)self->line_fields,
- sizeof(int64_t) * self->lines_cap);
- if (newptr == NULL) {
- return PARSER_OUT_OF_MEMORY;
- } else {
- self->line_fields = (int64_t *)newptr;
- }
+ if (status != 0) {
+ return PARSER_OUT_OF_MEMORY;
+ }
+
+ // realloc took place
+ if (words_cap != self->words_cap) {
+ TRACE(("make_stream_space: cap != self->words_cap, nbytes = %d, "
+ "self->words_cap=%d\n",
+ nbytes, self->words_cap))
+ int64_t *newptr = (int64_t *)realloc(self->word_starts,
+ sizeof(int64_t) * self->words_cap);
+ if (newptr == NULL) {
+ return PARSER_OUT_OF_MEMORY;
+ } else {
+ self->word_starts = newptr;
+ }
+ }
+
+ /*
+ LINE VECTORS
+ */
+ const uint64_t lines_cap = self->lines_cap;
+ self->line_start = (int64_t *)grow_buffer((void *)self->line_start,
+ self->lines + 1, &self->lines_cap,
+ nbytes, sizeof(int64_t), &status);
+ TRACE(
+ ("make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n",
+ self->lines + 1, self->lines_cap, nbytes, status))
+ if (status != 0) {
+ return PARSER_OUT_OF_MEMORY;
+ }
+
+ // realloc took place
+ if (lines_cap != self->lines_cap) {
+ TRACE(("make_stream_space: cap != self->lines_cap, nbytes = %d\n", nbytes))
+ int64_t *newptr = (int64_t *)realloc(self->line_fields,
+ sizeof(int64_t) * self->lines_cap);
+ if (newptr == NULL) {
+ return PARSER_OUT_OF_MEMORY;
+ } else {
+ self->line_fields = newptr;
}
+ }
- return 0;
+ return 0;
}
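/*
 * Editorial sketch (not part of this patch): make_stream_space leans on the
 * grow_buffer() helper defined earlier in this file. Roughly, the contract it
 * assumes looks like the following -- the name, doubling policy and body here
 * are illustrative assumptions, not the exact implementation:
 *
 *   static void *grow_buffer_sketch(void *buffer, uint64_t length,
 *                                   uint64_t *capacity, int64_t space,
 *                                   int64_t elsize, int *error) {
 *     uint64_t cap = *capacity ? *capacity : 1;
 *     while (cap < length + space)   // grow geometrically until `space`
 *       cap *= 2;                    // extra elements beyond `length` fit
 *     if (cap != *capacity) {
 *       void *newbuffer = realloc(buffer, elsize * cap);
 *       if (newbuffer == NULL) {     // caller maps this to PARSER_OUT_OF_MEMORY
 *         *error = -1;
 *         return buffer;             // original buffer stays valid
 *       }
 *       buffer = newbuffer;
 *       *capacity = cap;
 *     }
 *     *error = 0;
 *     return buffer;                 // may have moved: callers re-base pointers
 *   }
 */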
static int push_char(parser_t *self, char c) {
- TRACE(("push_char: self->stream[%zu] = %x, stream_cap=%zu\n",
- self->stream_len + 1, c, self->stream_cap))
- if (self->stream_len >= self->stream_cap) {
- TRACE(
- ("push_char: ERROR!!! self->stream_len(%d) >= "
- "self->stream_cap(%d)\n",
- self->stream_len, self->stream_cap))
- int64_t bufsize = 100;
- self->error_msg = malloc(bufsize);
- snprintf(self->error_msg, bufsize,
- "Buffer overflow caught - possible malformed input file.\n");
- return PARSER_OUT_OF_MEMORY;
- }
- self->stream[self->stream_len++] = c;
- return 0;
+ TRACE(("push_char: self->stream[%zu] = %x, stream_cap=%zu\n",
+ self->stream_len + 1, c, self->stream_cap))
+ if (self->stream_len >= self->stream_cap) {
+ TRACE(("push_char: ERROR!!! self->stream_len(%d) >= "
+ "self->stream_cap(%d)\n",
+ self->stream_len, self->stream_cap))
+ const size_t bufsize = 100;
+ self->error_msg = malloc(bufsize);
+ snprintf(self->error_msg, bufsize,
+ "Buffer overflow caught - possible malformed input file.\n");
+ return PARSER_OUT_OF_MEMORY;
+ }
+ self->stream[self->stream_len++] = c;
+ return 0;
}
-int PANDAS_INLINE end_field(parser_t *self) {
- // XXX cruft
- if (self->words_len >= self->words_cap) {
- TRACE(
- ("end_field: ERROR!!! self->words_len(%zu) >= "
- "self->words_cap(%zu)\n",
- self->words_len, self->words_cap))
- int64_t bufsize = 100;
- self->error_msg = malloc(bufsize);
- snprintf(self->error_msg, bufsize,
- "Buffer overflow caught - possible malformed input file.\n");
- return PARSER_OUT_OF_MEMORY;
- }
+static inline int end_field(parser_t *self) {
+ // XXX cruft
+ if (self->words_len >= self->words_cap) {
+ TRACE(("end_field: ERROR!!! self->words_len(%zu) >= "
+ "self->words_cap(%zu)\n",
+ self->words_len, self->words_cap))
+ const size_t bufsize = 100;
+ self->error_msg = malloc(bufsize);
+ snprintf(self->error_msg, bufsize,
+ "Buffer overflow caught - possible malformed input file.\n");
+ return PARSER_OUT_OF_MEMORY;
+ }
- // null terminate token
- push_char(self, '\0');
+ // null terminate token
+ push_char(self, '\0');
- // set pointer and metadata
- self->words[self->words_len] = self->pword_start;
+ // set pointer and metadata
+ self->words[self->words_len] = self->pword_start;
- TRACE(("end_field: Char diff: %d\n", self->pword_start - self->words[0]));
+ TRACE(("end_field: Char diff: %d\n", self->pword_start - self->words[0]));
- TRACE(("end_field: Saw word %s at: %d. Total: %d\n", self->pword_start,
- self->word_start, self->words_len + 1))
+ TRACE(("end_field: Saw word %s at: %d. Total: %d\n", self->pword_start,
+ self->word_start, self->words_len + 1))
- self->word_starts[self->words_len] = self->word_start;
- self->words_len++;
+ self->word_starts[self->words_len] = self->word_start;
+ self->words_len++;
- // increment line field count
- self->line_fields[self->lines]++;
+ // increment line field count
+ self->line_fields[self->lines]++;
- // New field begin in stream
- self->pword_start = self->stream + self->stream_len;
- self->word_start = self->stream_len;
+  // New field begins in stream
+ self->pword_start = self->stream + self->stream_len;
+ self->word_start = self->stream_len;
- return 0;
+ return 0;
}
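/*
 * Layout sketch (illustrative, not part of this patch): after end_field has
 * run for the two fields of the line "a,bc", the buffers relate as follows,
 * which is why make_stream_space re-bases every words[i] whenever realloc
 * moves the stream:
 *
 *   stream      = "a\0bc\0"                  // each token NUL-terminated
 *   words       = { stream + 0, stream + 2 }
 *   word_starts = { 0, 2 }                   // offsets survive a realloc
 *   line_fields[lines] == 2
 */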
static void append_warning(parser_t *self, const char *msg) {
- int64_t ex_length;
- int64_t length = strlen(msg);
- void *newptr;
-
- if (self->warn_msg == NULL) {
- self->warn_msg = malloc(length + 1);
- snprintf(self->warn_msg, length + 1, "%s", msg);
- } else {
- ex_length = strlen(self->warn_msg);
- newptr = realloc(self->warn_msg, ex_length + length + 1);
- if (newptr != NULL) {
- self->warn_msg = (char *)newptr;
- snprintf(self->warn_msg + ex_length, length + 1, "%s", msg);
- }
- }
+ const int64_t length = strlen(msg);
+
+ if (self->warn_msg == NULL) {
+ self->warn_msg = malloc(length + 1);
+ snprintf(self->warn_msg, length + 1, "%s", msg);
+ } else {
+ const int64_t ex_length = strlen(self->warn_msg);
+ char *newptr = (char *)realloc(self->warn_msg, ex_length + length + 1);
+ if (newptr != NULL) {
+ self->warn_msg = newptr;
+ snprintf(self->warn_msg + ex_length, length + 1, "%s", msg);
+ }
+ }
}
static int end_line(parser_t *self) {
- char *msg;
- int64_t fields;
- int64_t ex_fields = self->expected_fields;
- int64_t bufsize = 100; // for error or warning messages
+ int64_t ex_fields = self->expected_fields;
+ int64_t fields = self->line_fields[self->lines];
- fields = self->line_fields[self->lines];
+ TRACE(("end_line: Line end, nfields: %d\n", fields));
- TRACE(("end_line: Line end, nfields: %d\n", fields));
-
- TRACE(("end_line: lines: %d\n", self->lines));
- if (self->lines > 0) {
- if (self->expected_fields >= 0) {
- ex_fields = self->expected_fields;
- } else {
- ex_fields = self->line_fields[self->lines - 1];
- }
+ TRACE(("end_line: lines: %d\n", self->lines));
+ if (self->lines > 0) {
+ if (self->expected_fields >= 0) {
+ ex_fields = self->expected_fields;
+ } else {
+ ex_fields = self->line_fields[self->lines - 1];
}
- TRACE(("end_line: ex_fields: %d\n", ex_fields));
-
- if (self->state == START_FIELD_IN_SKIP_LINE ||
- self->state == IN_FIELD_IN_SKIP_LINE ||
- self->state == IN_QUOTED_FIELD_IN_SKIP_LINE ||
- self->state == QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE) {
- TRACE(("end_line: Skipping row %d\n", self->file_lines));
- // increment file line count
- self->file_lines++;
+ }
+ TRACE(("end_line: ex_fields: %d\n", ex_fields));
- // skip the tokens from this bad line
- self->line_start[self->lines] += fields;
+ if (self->state == START_FIELD_IN_SKIP_LINE ||
+ self->state == IN_FIELD_IN_SKIP_LINE ||
+ self->state == IN_QUOTED_FIELD_IN_SKIP_LINE ||
+ self->state == QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE) {
+ TRACE(("end_line: Skipping row %d\n", self->file_lines));
+ // increment file line count
+ self->file_lines++;
- // reset field count
- self->line_fields[self->lines] = 0;
- return 0;
- }
+ // skip the tokens from this bad line
+ self->line_start[self->lines] += fields;
- if (!(self->lines <= self->header_end + 1) &&
- (fields > ex_fields) && !(self->usecols)) {
- // increment file line count
- self->file_lines++;
+ // reset field count
+ self->line_fields[self->lines] = 0;
+ return 0;
+ }
- // skip the tokens from this bad line
- self->line_start[self->lines] += fields;
+ if (!(self->lines <= self->header_end + 1) && (fields > ex_fields) &&
+ !(self->usecols)) {
+ // increment file line count
+ self->file_lines++;
- // reset field count
- self->line_fields[self->lines] = 0;
+ // skip the tokens from this bad line
+ self->line_start[self->lines] += fields;
- // file_lines is now the actual file line number (starting at 1)
- if (self->on_bad_lines == ERROR) {
- self->error_msg = malloc(bufsize);
- snprintf(self->error_msg, bufsize,
- "Expected %" PRId64 " fields in line %" PRIu64 ", saw %"
- PRId64 "\n", ex_fields, self->file_lines, fields);
+ // reset field count
+ self->line_fields[self->lines] = 0;
- TRACE(("Error at line %d, %d fields\n", self->file_lines, fields));
+ // file_lines is now the actual file line number (starting at 1)
+ if (self->on_bad_lines == ERROR) {
+ const size_t bufsize = 100;
+ self->error_msg = malloc(bufsize);
+ snprintf(self->error_msg, bufsize,
+ "Expected %" PRId64 " fields in line %" PRIu64 ", saw %" PRId64
+ "\n",
+ ex_fields, self->file_lines, fields);
- return -1;
- } else {
- // simply skip bad lines
- if (self->on_bad_lines == WARN) {
- // pass up error message
- msg = malloc(bufsize);
- snprintf(msg, bufsize,
- "Skipping line %" PRIu64 ": expected %" PRId64
- " fields, saw %" PRId64 "\n",
- self->file_lines, ex_fields, fields);
- append_warning(self, msg);
- free(msg);
- }
- }
+ TRACE(("Error at line %d, %d fields\n", self->file_lines, fields));
+
+ return -1;
} else {
- // missing trailing delimiters
- if ((self->lines >= self->header_end + 1) &&
- fields < ex_fields) {
- // might overrun the buffer when closing fields
- if (make_stream_space(self, ex_fields - fields) < 0) {
- int64_t bufsize = 100;
- self->error_msg = malloc(bufsize);
- snprintf(self->error_msg, bufsize, "out of memory");
- return -1;
- }
-
- while (fields < ex_fields) {
- end_field(self);
- fields++;
- }
- }
+ // simply skip bad lines
+ if (self->on_bad_lines == WARN) {
+ // pass up error message
+ const size_t bufsize = 100;
+ char *msg = (char *)malloc(bufsize);
+ snprintf(msg, bufsize,
+ "Skipping line %" PRIu64 ": expected %" PRId64
+ " fields, saw %" PRId64 "\n",
+ self->file_lines, ex_fields, fields);
+ append_warning(self, msg);
+ free(msg);
+ }
+ }
+ } else {
+ // missing trailing delimiters
+ if ((self->lines >= self->header_end + 1) && fields < ex_fields) {
+ // might overrun the buffer when closing fields
+ if (make_stream_space(self, ex_fields - fields) < 0) {
+ const size_t bufsize = 100;
+ self->error_msg = malloc(bufsize);
+ snprintf(self->error_msg, bufsize, "out of memory");
+ return -1;
+ }
- // increment both line counts
- self->file_lines++;
- self->lines++;
-
- // good line, set new start point
- if (self->lines >= self->lines_cap) {
- TRACE((
- "end_line: ERROR!!! self->lines(%zu) >= self->lines_cap(%zu)\n",
- self->lines, self->lines_cap))
- int64_t bufsize = 100;
- self->error_msg = malloc(bufsize);
- snprintf(self->error_msg, bufsize,
- "Buffer overflow caught - "
- "possible malformed input file.\n");
- return PARSER_OUT_OF_MEMORY;
- }
- self->line_start[self->lines] =
- (self->line_start[self->lines - 1] + fields);
+ while (fields < ex_fields) {
+ end_field(self);
+ fields++;
+ }
+ }
- TRACE(
- ("end_line: new line start: %d\n", self->line_start[self->lines]));
+ // increment both line counts
+ self->file_lines++;
+ self->lines++;
- // new line start with 0 fields
- self->line_fields[self->lines] = 0;
+ // good line, set new start point
+ if (self->lines >= self->lines_cap) {
+ TRACE(("end_line: ERROR!!! self->lines(%zu) >= self->lines_cap(%zu)\n",
+ self->lines, self->lines_cap))
+ const size_t bufsize = 100;
+ self->error_msg = malloc(bufsize);
+ snprintf(self->error_msg, bufsize,
+ "Buffer overflow caught - "
+ "possible malformed input file.\n");
+ return PARSER_OUT_OF_MEMORY;
}
+ self->line_start[self->lines] =
+ (self->line_start[self->lines - 1] + fields);
- TRACE(("end_line: Finished line, at %d\n", self->lines));
+ TRACE(("end_line: new line start: %d\n", self->line_start[self->lines]));
- return 0;
+ // new line start with 0 fields
+ self->line_fields[self->lines] = 0;
+ }
+
+ TRACE(("end_line: Finished line, at %d\n", self->lines));
+
+ return 0;
}
int parser_add_skiprow(parser_t *self, int64_t row) {
- khiter_t k;
- kh_int64_t *set;
- int ret = 0;
+ khiter_t k;
+ kh_int64_t *set;
+ int ret = 0;
- if (self->skipset == NULL) {
- self->skipset = (void *)kh_init_int64();
- }
+ if (self->skipset == NULL) {
+ self->skipset = (void *)kh_init_int64();
+ }
- set = (kh_int64_t *)self->skipset;
+ set = (kh_int64_t *)self->skipset;
- k = kh_put_int64(set, row, &ret);
- set->keys[k] = row;
+ k = kh_put_int64(set, row, &ret);
+ set->keys[k] = row;
- return 0;
+ return 0;
}
-int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) {
- // self->file_lines is zero based so subtract 1 from nrows
- if (nrows > 0) {
- self->skip_first_N_rows = nrows - 1;
- }
-
- return 0;
+void parser_set_skipfirstnrows(parser_t *self, int64_t nrows) {
+ // self->file_lines is zero based so subtract 1 from nrows
+ if (nrows > 0) {
+ self->skip_first_N_rows = nrows - 1;
+ }
}
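/*
 * Worked example (illustrative, not part of this patch): if the caller asks
 * to skip the first 3 rows, parser_set_skipfirstnrows(self, 3) stores
 * skip_first_N_rows = 2, and skip_this_line below then returns true exactly
 * for rownum 0, 1 and 2 (rownum <= 2) when no skipfunc/skipset is set.
 */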
static int parser_buffer_bytes(parser_t *self, size_t nbytes,
const char *encoding_errors) {
- int status;
- size_t bytes_read;
-
- status = 0;
- self->datapos = 0;
- self->data = self->cb_io(self->source, nbytes, &bytes_read, &status,
- encoding_errors);
- TRACE((
- "parser_buffer_bytes self->cb_io: nbytes=%zu, datalen: %d, status=%d\n",
- nbytes, bytes_read, status));
- self->datalen = bytes_read;
-
- if (status != REACHED_EOF && self->data == NULL) {
- int64_t bufsize = 200;
- self->error_msg = malloc(bufsize);
-
- if (status == CALLING_READ_FAILED) {
- snprintf(self->error_msg, bufsize,
- "Calling read(nbytes) on source failed. "
- "Try engine='python'.");
- } else {
- snprintf(self->error_msg, bufsize, "Unknown error in IO callback");
- }
- return -1;
+ int status;
+ size_t bytes_read;
+
+ status = 0;
+ self->datapos = 0;
+ self->data =
+ self->cb_io(self->source, nbytes, &bytes_read, &status, encoding_errors);
+ TRACE(
+ ("parser_buffer_bytes self->cb_io: nbytes=%zu, datalen: %d, status=%d\n",
+ nbytes, bytes_read, status));
+ self->datalen = bytes_read;
+
+ if (status != REACHED_EOF && self->data == NULL) {
+ const size_t bufsize = 200;
+ self->error_msg = malloc(bufsize);
+
+ if (status == CALLING_READ_FAILED) {
+ snprintf(self->error_msg, bufsize,
+ "Calling read(nbytes) on source failed. "
+ "Try engine='python'.");
+ } else {
+ snprintf(self->error_msg, bufsize, "Unknown error in IO callback");
}
+ return -1;
+ }
- TRACE(("datalen: %d\n", self->datalen));
+ TRACE(("datalen: %d\n", self->datalen));
- return status;
+ return status;
}
/*
@@ -593,63 +551,61 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes,
*/
-#define PUSH_CHAR(c) \
- TRACE( \
- ("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", \
- c, slen, self->stream_cap, self->stream_len)) \
- if (slen >= self->stream_cap) { \
- TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= stream_cap(%d)\n", slen, \
- self->stream_cap)) \
- int64_t bufsize = 100; \
- self->error_msg = malloc(bufsize); \
- snprintf(self->error_msg, bufsize, \
- "Buffer overflow caught - possible malformed input file.\n");\
- return PARSER_OUT_OF_MEMORY; \
- } \
- *stream++ = c; \
- slen++;
+#define PUSH_CHAR(c) \
+ TRACE(("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", \
+ c, slen, self->stream_cap, self->stream_len)) \
+ if (slen >= self->stream_cap) { \
+ TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= stream_cap(%d)\n", slen, \
+ self->stream_cap)) \
+ const size_t bufsize = 100; \
+ self->error_msg = malloc(bufsize); \
+ snprintf(self->error_msg, bufsize, \
+ "Buffer overflow caught - possible malformed input file.\n"); \
+ return PARSER_OUT_OF_MEMORY; \
+ } \
+ *stream++ = c; \
+ slen++;
// This is a little bit of a hack but works for now
-#define END_FIELD() \
- self->stream_len = slen; \
- if (end_field(self) < 0) { \
- goto parsingerror; \
- } \
- stream = self->stream + self->stream_len; \
- slen = self->stream_len;
-
-#define END_LINE_STATE(STATE) \
- self->stream_len = slen; \
- if (end_line(self) < 0) { \
- goto parsingerror; \
- } \
- stream = self->stream + self->stream_len; \
- slen = self->stream_len; \
- self->state = STATE; \
- if (line_limit > 0 && self->lines == start_lines + line_limit) { \
- goto linelimit; \
- }
-
-#define END_LINE_AND_FIELD_STATE(STATE) \
- self->stream_len = slen; \
- if (end_line(self) < 0) { \
- goto parsingerror; \
- } \
- if (end_field(self) < 0) { \
- goto parsingerror; \
- } \
- stream = self->stream + self->stream_len; \
- slen = self->stream_len; \
- self->state = STATE; \
- if (line_limit > 0 && self->lines == start_lines + line_limit) { \
- goto linelimit; \
- }
+#define END_FIELD() \
+ self->stream_len = slen; \
+ if (end_field(self) < 0) { \
+ goto parsingerror; \
+ } \
+ stream = self->stream + self->stream_len; \
+ slen = self->stream_len;
+
+#define END_LINE_STATE(STATE) \
+ self->stream_len = slen; \
+ if (end_line(self) < 0) { \
+ goto parsingerror; \
+ } \
+ stream = self->stream + self->stream_len; \
+ slen = self->stream_len; \
+ self->state = STATE; \
+ if (line_limit > 0 && self->lines == start_lines + line_limit) { \
+ goto linelimit; \
+ }
+
+#define END_LINE_AND_FIELD_STATE(STATE) \
+ self->stream_len = slen; \
+ if (end_line(self) < 0) { \
+ goto parsingerror; \
+ } \
+ if (end_field(self) < 0) { \
+ goto parsingerror; \
+ } \
+ stream = self->stream + self->stream_len; \
+ slen = self->stream_len; \
+ self->state = STATE; \
+ if (line_limit > 0 && self->lines == start_lines + line_limit) { \
+ goto linelimit; \
+ }
#define END_LINE() END_LINE_STATE(START_RECORD)
-#define IS_TERMINATOR(c) \
- (c == lineterminator)
+#define IS_TERMINATOR(c) (c == lineterminator)
#define IS_QUOTE(c) ((c == self->quotechar && self->quoting != QUOTE_NONE))
@@ -660,677 +616,655 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes,
#define IS_ESCAPE_CHAR(c) (c == escape_symbol)
-#define IS_SKIPPABLE_SPACE(c) \
- ((!self->delim_whitespace && c == ' ' && self->skipinitialspace))
+#define IS_SKIPPABLE_SPACE(c) \
+ ((!self->delim_whitespace && c == ' ' && self->skipinitialspace))
// applied when in a field
-#define IS_DELIMITER(c) ((c == delimiter) || (delim_whitespace && isblank(c)))
-
-#define _TOKEN_CLEANUP() \
- self->stream_len = slen; \
- self->datapos = i; \
- TRACE(("_TOKEN_CLEANUP: datapos: %d, datalen: %d\n", self->datapos, \
- self->datalen));
-
-#define CHECK_FOR_BOM() \
- if (*buf == '\xef' && *(buf + 1) == '\xbb' && *(buf + 2) == '\xbf') { \
- buf += 3; \
- self->datapos += 3; \
- }
+#define IS_DELIMITER(c) \
+ ((!delim_whitespace && c == delimiter) || (delim_whitespace && isblank(c)))
+
+#define _TOKEN_CLEANUP() \
+ self->stream_len = slen; \
+ self->datapos = i; \
+ TRACE(("_TOKEN_CLEANUP: datapos: %d, datalen: %d\n", self->datapos, \
+ self->datalen));
+
+#define CHECK_FOR_BOM() \
+ if (*buf == '\xef' && *(buf + 1) == '\xbb' && *(buf + 2) == '\xbf') { \
+ buf += 3; \
+ self->datapos += 3; \
+ }
+
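/*
 * Editorial note (not part of this patch): the macros above are only usable
 * inside tokenize_bytes below -- they expand in terms of its locals (`self`,
 * `stream`, `slen`, `buf`, `i`, `line_limit`, `start_lines`, and the cached
 * `lineterminator`/`delimiter` constants) and jump to its `parsingerror` /
 * `linelimit` labels.
 */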
+static int skip_this_line(parser_t *self, int64_t rownum) {
+ if (self->skipfunc != NULL) {
+ PyGILState_STATE state = PyGILState_Ensure();
+ PyObject *result = PyObject_CallFunction(self->skipfunc, "i", rownum);
+
+ // Error occurred. It will be processed
+ // and caught at the Cython level.
+ const int should_skip = result == NULL ? -1 : PyObject_IsTrue(result);
+
+ Py_XDECREF(result);
+ PyGILState_Release(state);
+
+ return should_skip;
+ } else if (self->skipset != NULL) {
+ return (kh_get_int64((kh_int64_t *)self->skipset, self->file_lines) !=
+ ((kh_int64_t *)self->skipset)->n_buckets);
+ } else {
+ return (rownum <= self->skip_first_N_rows);
+ }
+}
-int skip_this_line(parser_t *self, int64_t rownum) {
- int should_skip;
- PyObject *result;
- PyGILState_STATE state;
+static int tokenize_bytes(parser_t *self, size_t line_limit,
+ uint64_t start_lines) {
+ char *buf = self->data + self->datapos;
- if (self->skipfunc != NULL) {
- state = PyGILState_Ensure();
- result = PyObject_CallFunction(self->skipfunc, "i", rownum);
+ const char lineterminator =
+ (self->lineterminator == '\0') ? '\n' : self->lineterminator;
- // Error occurred. It will be processed
- // and caught at the Cython level.
- if (result == NULL) {
- should_skip = -1;
- } else {
- should_skip = PyObject_IsTrue(result);
- }
+ const int delim_whitespace = self->delim_whitespace;
+ const char delimiter = self->delimiter;
- Py_XDECREF(result);
- PyGILState_Release(state);
+  // 1000 is a value that cannot fit in a "char",
+  // so comparing a char to it is always "false"
+ const int carriage_symbol = (self->lineterminator == '\0') ? '\r' : 1000;
+ const int comment_symbol =
+ (self->commentchar != '\0') ? self->commentchar : 1000;
+ const int escape_symbol =
+ (self->escapechar != '\0') ? self->escapechar : 1000;
- return should_skip;
- } else if (self->skipset != NULL) {
- return (kh_get_int64((kh_int64_t *)self->skipset, self->file_lines) !=
- ((kh_int64_t *)self->skipset)->n_buckets);
- } else {
- return (rownum <= self->skip_first_N_rows);
- }
-}
+ if (make_stream_space(self, self->datalen - self->datapos) < 0) {
+ const size_t bufsize = 100;
+ self->error_msg = malloc(bufsize);
+ snprintf(self->error_msg, bufsize, "out of memory");
+ return -1;
+ }
-int tokenize_bytes(parser_t *self,
- size_t line_limit, uint64_t start_lines) {
- int64_t i;
- uint64_t slen;
- int should_skip;
- char c;
- char *stream;
- char *buf = self->data + self->datapos;
-
- const char lineterminator = (self->lineterminator == '\0') ?
- '\n' : self->lineterminator;
-
- const int delim_whitespace = self->delim_whitespace;
- const char delimiter = self->delimiter;
-
- // 1000 is something that couldn't fit in "char"
- // thus comparing a char to it would always be "false"
- const int carriage_symbol = (self->lineterminator == '\0') ? '\r' : 1000;
- const int comment_symbol = (self->commentchar != '\0') ?
- self->commentchar : 1000;
- const int escape_symbol = (self->escapechar != '\0') ?
- self->escapechar : 1000;
-
- if (make_stream_space(self, self->datalen - self->datapos) < 0) {
- int64_t bufsize = 100;
- self->error_msg = malloc(bufsize);
- snprintf(self->error_msg, bufsize, "out of memory");
- return -1;
- }
+ char *stream = self->stream + self->stream_len;
+ uint64_t slen = self->stream_len;
- stream = self->stream + self->stream_len;
- slen = self->stream_len;
+ TRACE(("%s\n", buf));
- TRACE(("%s\n", buf));
+ if (self->file_lines == 0) {
+ CHECK_FOR_BOM();
+ }
- if (self->file_lines == 0) {
- CHECK_FOR_BOM();
- }
+ char c;
+ int64_t i;
+ for (i = self->datapos; i < self->datalen; ++i) {
+ // next character in file
+ c = *buf++;
+
+ TRACE(("tokenize_bytes - Iter: %d Char: 0x%x Line %d field_count %d, "
+ "state %d\n",
+ i, c, self->file_lines + 1, self->line_fields[self->lines],
+ self->state));
+
+ switch (self->state) {
+ case START_FIELD_IN_SKIP_LINE:
+ if (IS_TERMINATOR(c)) {
+ END_LINE();
+ } else if (IS_CARRIAGE(c)) {
+ self->file_lines++;
+ self->state = EAT_CRNL_NOP;
+ } else if (IS_QUOTE(c)) {
+ self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
+ } else if (IS_DELIMITER(c)) {
+ // Do nothing, we're starting a new field again.
+ } else {
+ self->state = IN_FIELD_IN_SKIP_LINE;
+ }
+ break;
+
+ case IN_FIELD_IN_SKIP_LINE:
+ if (IS_TERMINATOR(c)) {
+ END_LINE();
+ } else if (IS_CARRIAGE(c)) {
+ self->file_lines++;
+ self->state = EAT_CRNL_NOP;
+ } else if (IS_DELIMITER(c)) {
+ self->state = START_FIELD_IN_SKIP_LINE;
+ }
+ break;
+
+ case IN_QUOTED_FIELD_IN_SKIP_LINE:
+ if (IS_QUOTE(c)) {
+ if (self->doublequote) {
+ self->state = QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE;
+ } else {
+ self->state = IN_FIELD_IN_SKIP_LINE;
+ }
+ }
+ break;
+
+ case QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE:
+ if (IS_QUOTE(c)) {
+ self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
+ } else if (IS_TERMINATOR(c)) {
+ END_LINE();
+ } else if (IS_CARRIAGE(c)) {
+ self->file_lines++;
+ self->state = EAT_CRNL_NOP;
+ } else if (IS_DELIMITER(c)) {
+ self->state = START_FIELD_IN_SKIP_LINE;
+ } else {
+ self->state = IN_FIELD_IN_SKIP_LINE;
+ }
+ break;
+
+ case WHITESPACE_LINE:
+ if (IS_TERMINATOR(c)) {
+ self->file_lines++;
+ self->state = START_RECORD;
+ break;
+ } else if (IS_CARRIAGE(c)) {
+ self->file_lines++;
+ self->state = EAT_CRNL_NOP;
+ break;
+ } else if (!self->delim_whitespace) {
+ if (isblank(c) && c != self->delimiter) {
+ } else { // backtrack
+ // use i + 1 because buf has been incremented but not i
+ do {
+ --buf;
+ --i;
+ } while (i + 1 > self->datapos && !IS_TERMINATOR(*buf));
+
+ // reached a newline rather than the beginning
+ if (IS_TERMINATOR(*buf)) {
+ ++buf; // move pointer to first char after newline
+ ++i;
+ }
+ self->state = START_FIELD;
+ }
+ break;
+ }
+ // fall through
+
+ case EAT_WHITESPACE:
+ if (IS_TERMINATOR(c)) {
+ END_LINE();
+ self->state = START_RECORD;
+ break;
+ } else if (IS_CARRIAGE(c)) {
+ self->state = EAT_CRNL;
+ break;
+ } else if (IS_COMMENT_CHAR(c)) {
+ self->state = EAT_COMMENT;
+ break;
+ } else if (!isblank(c)) {
+ self->state = START_FIELD;
+ PD_FALLTHROUGH; // fall through to subsequent state
+ } else {
+ // if whitespace char, keep slurping
+ break;
+ }
+
+ case START_RECORD: {
+ // start of record
+ const int should_skip = skip_this_line(self, self->file_lines);
+
+ if (should_skip == -1) {
+ goto parsingerror;
+ } else if (should_skip) {
+ if (IS_QUOTE(c)) {
+ self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
+ } else {
+ self->state = IN_FIELD_IN_SKIP_LINE;
- for (i = self->datapos; i < self->datalen; ++i) {
- // next character in file
- c = *buf++;
-
- TRACE(
- ("tokenize_bytes - Iter: %d Char: 0x%x Line %d field_count %d, "
- "state %d\n",
- i, c, self->file_lines + 1, self->line_fields[self->lines],
- self->state));
-
- switch (self->state) {
- case START_FIELD_IN_SKIP_LINE:
- if (IS_TERMINATOR(c)) {
- END_LINE();
- } else if (IS_CARRIAGE(c)) {
- self->file_lines++;
- self->state = EAT_CRNL_NOP;
- } else if (IS_QUOTE(c)) {
- self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
- } else if (IS_DELIMITER(c)) {
- // Do nothing, we're starting a new field again.
- } else {
- self->state = IN_FIELD_IN_SKIP_LINE;
- }
- break;
-
- case IN_FIELD_IN_SKIP_LINE:
- if (IS_TERMINATOR(c)) {
- END_LINE();
- } else if (IS_CARRIAGE(c)) {
- self->file_lines++;
- self->state = EAT_CRNL_NOP;
- } else if (IS_DELIMITER(c)) {
- self->state = START_FIELD_IN_SKIP_LINE;
- }
- break;
-
- case IN_QUOTED_FIELD_IN_SKIP_LINE:
- if (IS_QUOTE(c)) {
- if (self->doublequote) {
- self->state = QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE;
- } else {
- self->state = IN_FIELD_IN_SKIP_LINE;
- }
- }
- break;
-
- case QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE:
- if (IS_QUOTE(c)) {
- self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
- } else if (IS_TERMINATOR(c)) {
- END_LINE();
- } else if (IS_CARRIAGE(c)) {
- self->file_lines++;
- self->state = EAT_CRNL_NOP;
- } else if (IS_DELIMITER(c)) {
- self->state = START_FIELD_IN_SKIP_LINE;
- } else {
- self->state = IN_FIELD_IN_SKIP_LINE;
- }
- break;
-
- case WHITESPACE_LINE:
- if (IS_TERMINATOR(c)) {
- self->file_lines++;
- self->state = START_RECORD;
- break;
- } else if (IS_CARRIAGE(c)) {
- self->file_lines++;
- self->state = EAT_CRNL_NOP;
- break;
- } else if (!self->delim_whitespace) {
- if (isblank(c) && c != self->delimiter) {
- } else { // backtrack
- // use i + 1 because buf has been incremented but not i
- do {
- --buf;
- --i;
- } while (i + 1 > self->datapos && !IS_TERMINATOR(*buf));
-
- // reached a newline rather than the beginning
- if (IS_TERMINATOR(*buf)) {
- ++buf; // move pointer to first char after newline
- ++i;
- }
- self->state = START_FIELD;
- }
- break;
- }
- // fall through
-
- case EAT_WHITESPACE:
- if (IS_TERMINATOR(c)) {
- END_LINE();
- self->state = START_RECORD;
- break;
- } else if (IS_CARRIAGE(c)) {
- self->state = EAT_CRNL;
- break;
- } else if (IS_COMMENT_CHAR(c)) {
- self->state = EAT_COMMENT;
- break;
- } else if (!isblank(c)) {
- self->state = START_FIELD;
- // fall through to subsequent state
- } else {
- // if whitespace char, keep slurping
- break;
- }
-
- case START_RECORD:
- // start of record
- should_skip = skip_this_line(self, self->file_lines);
-
- if (should_skip == -1) {
- goto parsingerror;
- } else if (should_skip) {
- if (IS_QUOTE(c)) {
- self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
- } else {
- self->state = IN_FIELD_IN_SKIP_LINE;
-
- if (IS_TERMINATOR(c)) {
- END_LINE();
- }
- }
- break;
- } else if (IS_TERMINATOR(c)) {
- // \n\r possible?
- if (self->skip_empty_lines) {
- self->file_lines++;
- } else {
- END_LINE();
- }
- break;
- } else if (IS_CARRIAGE(c)) {
- if (self->skip_empty_lines) {
- self->file_lines++;
- self->state = EAT_CRNL_NOP;
- } else {
- self->state = EAT_CRNL;
- }
- break;
- } else if (IS_COMMENT_CHAR(c)) {
- self->state = EAT_LINE_COMMENT;
- break;
- } else if (isblank(c)) {
- if (self->delim_whitespace) {
- if (self->skip_empty_lines) {
- self->state = WHITESPACE_LINE;
- } else {
- self->state = EAT_WHITESPACE;
- }
- break;
- } else if (c != self->delimiter && self->skip_empty_lines) {
- self->state = WHITESPACE_LINE;
- break;
- }
- // fall through
- }
-
- // normal character - fall through
- // to handle as START_FIELD
- self->state = START_FIELD;
-
- case START_FIELD:
- // expecting field
- if (IS_TERMINATOR(c)) {
- END_FIELD();
- END_LINE();
- } else if (IS_CARRIAGE(c)) {
- END_FIELD();
- self->state = EAT_CRNL;
- } else if (IS_QUOTE(c)) {
- // start quoted field
- self->state = IN_QUOTED_FIELD;
- } else if (IS_ESCAPE_CHAR(c)) {
- // possible escaped character
- self->state = ESCAPED_CHAR;
- } else if (IS_SKIPPABLE_SPACE(c)) {
- // ignore space at start of field
- } else if (IS_DELIMITER(c)) {
- if (self->delim_whitespace) {
- self->state = EAT_WHITESPACE;
- } else {
- // save empty field
- END_FIELD();
- }
- } else if (IS_COMMENT_CHAR(c)) {
- END_FIELD();
- self->state = EAT_COMMENT;
- } else {
- // begin new unquoted field
- PUSH_CHAR(c);
- self->state = IN_FIELD;
- }
- break;
-
- case ESCAPED_CHAR:
- PUSH_CHAR(c);
- self->state = IN_FIELD;
- break;
-
- case EAT_LINE_COMMENT:
- if (IS_TERMINATOR(c)) {
- self->file_lines++;
- self->state = START_RECORD;
- } else if (IS_CARRIAGE(c)) {
- self->file_lines++;
- self->state = EAT_CRNL_NOP;
- }
- break;
-
- case IN_FIELD:
- // in unquoted field
- if (IS_TERMINATOR(c)) {
- END_FIELD();
- END_LINE();
- } else if (IS_CARRIAGE(c)) {
- END_FIELD();
- self->state = EAT_CRNL;
- } else if (IS_ESCAPE_CHAR(c)) {
- // possible escaped character
- self->state = ESCAPED_CHAR;
- } else if (IS_DELIMITER(c)) {
- // end of field - end of line not reached yet
- END_FIELD();
-
- if (self->delim_whitespace) {
- self->state = EAT_WHITESPACE;
- } else {
- self->state = START_FIELD;
- }
- } else if (IS_COMMENT_CHAR(c)) {
- END_FIELD();
- self->state = EAT_COMMENT;
- } else {
- // normal character - save in field
- PUSH_CHAR(c);
- }
- break;
-
- case IN_QUOTED_FIELD:
- // in quoted field
- if (IS_ESCAPE_CHAR(c)) {
- // possible escape character
- self->state = ESCAPE_IN_QUOTED_FIELD;
- } else if (IS_QUOTE(c)) {
- if (self->doublequote) {
- // double quote - " represented by ""
- self->state = QUOTE_IN_QUOTED_FIELD;
- } else {
- // end of quote part of field
- self->state = IN_FIELD;
- }
- } else {
- // normal character - save in field
- PUSH_CHAR(c);
- }
- break;
-
- case ESCAPE_IN_QUOTED_FIELD:
- PUSH_CHAR(c);
- self->state = IN_QUOTED_FIELD;
- break;
-
- case QUOTE_IN_QUOTED_FIELD:
- // double quote - seen a quote in an quoted field
- if (IS_QUOTE(c)) {
- // save "" as "
-
- PUSH_CHAR(c);
- self->state = IN_QUOTED_FIELD;
- } else if (IS_DELIMITER(c)) {
- // end of field - end of line not reached yet
- END_FIELD();
-
- if (self->delim_whitespace) {
- self->state = EAT_WHITESPACE;
- } else {
- self->state = START_FIELD;
- }
- } else if (IS_TERMINATOR(c)) {
- END_FIELD();
- END_LINE();
- } else if (IS_CARRIAGE(c)) {
- END_FIELD();
- self->state = EAT_CRNL;
- } else {
- PUSH_CHAR(c);
- self->state = IN_FIELD;
- }
- break;
-
- case EAT_COMMENT:
- if (IS_TERMINATOR(c)) {
- END_LINE();
- } else if (IS_CARRIAGE(c)) {
- self->state = EAT_CRNL;
- }
- break;
-
- // only occurs with non-custom line terminator,
- // which is why we directly check for '\n'
- case EAT_CRNL:
- if (c == '\n') {
- END_LINE();
- } else if (IS_DELIMITER(c)) {
- if (self->delim_whitespace) {
- END_LINE_STATE(EAT_WHITESPACE);
- } else {
- // Handle \r-delimited files
- END_LINE_AND_FIELD_STATE(START_FIELD);
- }
- } else {
- if (self->delim_whitespace) {
- /* XXX
- * first character of a new record--need to back up and
- * reread
- * to handle properly...
- */
- i--;
- buf--; // back up one character (HACK!)
- END_LINE_STATE(START_RECORD);
- } else {
- // \r line terminator
- // UGH. we don't actually want
- // to consume the token. fix this later
- self->stream_len = slen;
- if (end_line(self) < 0) {
- goto parsingerror;
- }
-
- stream = self->stream + self->stream_len;
- slen = self->stream_len;
- self->state = START_RECORD;
-
- --i;
- buf--; // let's try this character again (HACK!)
- if (line_limit > 0 &&
- self->lines == start_lines + line_limit) {
- goto linelimit;
- }
- }
- }
- break;
-
- // only occurs with non-custom line terminator,
- // which is why we directly check for '\n'
- case EAT_CRNL_NOP: // inside an ignored comment line
- self->state = START_RECORD;
- // \r line terminator -- parse this character again
- if (c != '\n' && !IS_DELIMITER(c)) {
- --i;
- --buf;
- }
- break;
- default:
- break;
+ if (IS_TERMINATOR(c)) {
+ END_LINE();
+ }
+ }
+ break;
+ } else if (IS_TERMINATOR(c)) {
+ // \n\r possible?
+ if (self->skip_empty_lines) {
+ self->file_lines++;
+ } else {
+ END_LINE();
+ }
+ break;
+ } else if (IS_CARRIAGE(c)) {
+ if (self->skip_empty_lines) {
+ self->file_lines++;
+ self->state = EAT_CRNL_NOP;
+ } else {
+ self->state = EAT_CRNL;
+ }
+ break;
+ } else if (IS_COMMENT_CHAR(c)) {
+ self->state = EAT_LINE_COMMENT;
+ break;
+ } else if (isblank(c)) {
+ if (self->delim_whitespace) {
+ if (self->skip_empty_lines) {
+ self->state = WHITESPACE_LINE;
+ } else {
+ self->state = EAT_WHITESPACE;
+ }
+ break;
+ } else if (c != self->delimiter && self->skip_empty_lines) {
+ self->state = WHITESPACE_LINE;
+ break;
+ }
+ }
+
+ // normal character - fall through
+ // to handle as START_FIELD
+ self->state = START_FIELD;
+ PD_FALLTHROUGH;
+ }
+ case START_FIELD:
+ // expecting field
+ if (IS_TERMINATOR(c)) {
+ END_FIELD();
+ END_LINE();
+ } else if (IS_CARRIAGE(c)) {
+ END_FIELD();
+ self->state = EAT_CRNL;
+ } else if (IS_QUOTE(c)) {
+ // start quoted field
+ self->state = IN_QUOTED_FIELD;
+ } else if (IS_ESCAPE_CHAR(c)) {
+ // possible escaped character
+ self->state = ESCAPED_CHAR;
+ } else if (IS_SKIPPABLE_SPACE(c)) {
+ // ignore space at start of field
+ } else if (IS_DELIMITER(c)) {
+ if (self->delim_whitespace) {
+ self->state = EAT_WHITESPACE;
+ } else {
+ // save empty field
+ END_FIELD();
+ }
+ } else if (IS_COMMENT_CHAR(c)) {
+ END_FIELD();
+ self->state = EAT_COMMENT;
+ } else {
+ // begin new unquoted field
+ PUSH_CHAR(c);
+ self->state = IN_FIELD;
+ }
+ break;
+
+ case ESCAPED_CHAR:
+ PUSH_CHAR(c);
+ self->state = IN_FIELD;
+ break;
+
+ case EAT_LINE_COMMENT:
+ if (IS_TERMINATOR(c)) {
+ self->file_lines++;
+ self->state = START_RECORD;
+ } else if (IS_CARRIAGE(c)) {
+ self->file_lines++;
+ self->state = EAT_CRNL_NOP;
+ }
+ break;
+
+ case IN_FIELD:
+ // in unquoted field
+ if (IS_TERMINATOR(c)) {
+ END_FIELD();
+ END_LINE();
+ } else if (IS_CARRIAGE(c)) {
+ END_FIELD();
+ self->state = EAT_CRNL;
+ } else if (IS_ESCAPE_CHAR(c)) {
+ // possible escaped character
+ self->state = ESCAPED_CHAR;
+ } else if (IS_DELIMITER(c)) {
+ // end of field - end of line not reached yet
+ END_FIELD();
+
+ if (self->delim_whitespace) {
+ self->state = EAT_WHITESPACE;
+ } else {
+ self->state = START_FIELD;
+ }
+ } else if (IS_COMMENT_CHAR(c)) {
+ END_FIELD();
+ self->state = EAT_COMMENT;
+ } else {
+ // normal character - save in field
+ PUSH_CHAR(c);
+ }
+ break;
+
+ case IN_QUOTED_FIELD:
+ // in quoted field
+ if (IS_ESCAPE_CHAR(c)) {
+ // possible escape character
+ self->state = ESCAPE_IN_QUOTED_FIELD;
+ } else if (IS_QUOTE(c)) {
+ if (self->doublequote) {
+ // double quote - " represented by ""
+ self->state = QUOTE_IN_QUOTED_FIELD;
+ } else {
+ // end of quote part of field
+ self->state = IN_FIELD;
+ }
+ } else {
+ // normal character - save in field
+ PUSH_CHAR(c);
+ }
+ break;
+
+ case ESCAPE_IN_QUOTED_FIELD:
+ PUSH_CHAR(c);
+ self->state = IN_QUOTED_FIELD;
+ break;
+
+ case QUOTE_IN_QUOTED_FIELD:
+      // double quote - seen a quote in a quoted field
+ if (IS_QUOTE(c)) {
+ // save "" as "
+
+ PUSH_CHAR(c);
+ self->state = IN_QUOTED_FIELD;
+ } else if (IS_DELIMITER(c)) {
+ // end of field - end of line not reached yet
+ END_FIELD();
+
+ if (self->delim_whitespace) {
+ self->state = EAT_WHITESPACE;
+ } else {
+ self->state = START_FIELD;
+ }
+ } else if (IS_TERMINATOR(c)) {
+ END_FIELD();
+ END_LINE();
+ } else if (IS_CARRIAGE(c)) {
+ END_FIELD();
+ self->state = EAT_CRNL;
+ } else {
+ PUSH_CHAR(c);
+ self->state = IN_FIELD;
+ }
+ break;
+
+ case EAT_COMMENT:
+ if (IS_TERMINATOR(c)) {
+ END_LINE();
+ } else if (IS_CARRIAGE(c)) {
+ self->state = EAT_CRNL;
+ }
+ break;
+
+ // only occurs with non-custom line terminator,
+ // which is why we directly check for '\n'
+ case EAT_CRNL:
+ if (c == '\n') {
+ END_LINE();
+ } else if (IS_DELIMITER(c)) {
+ if (self->delim_whitespace) {
+ END_LINE_STATE(EAT_WHITESPACE);
+ } else {
+ // Handle \r-delimited files
+ END_LINE_AND_FIELD_STATE(START_FIELD);
}
+ } else {
+ if (self->delim_whitespace) {
+        /* XXX
+         * first character of a new record -- need to back up and
+         * reread to handle it properly...
+         */
+ i--;
+ buf--; // back up one character (HACK!)
+ END_LINE_STATE(START_RECORD);
+ } else {
+ // \r line terminator
+ // UGH. we don't actually want
+ // to consume the token. fix this later
+ self->stream_len = slen;
+ if (end_line(self) < 0) {
+ goto parsingerror;
+ }
+
+ stream = self->stream + self->stream_len;
+ slen = self->stream_len;
+ self->state = START_RECORD;
+
+ --i;
+ buf--; // let's try this character again (HACK!)
+ if (line_limit > 0 && self->lines == start_lines + line_limit) {
+ goto linelimit;
+ }
+ }
+ }
+ break;
+
+ // only occurs with non-custom line terminator,
+ // which is why we directly check for '\n'
+ case EAT_CRNL_NOP: // inside an ignored comment line
+ self->state = START_RECORD;
+ // \r line terminator -- parse this character again
+ if (c != '\n' && !IS_DELIMITER(c)) {
+ --i;
+ --buf;
+ }
+ break;
+ default:
+ break;
}
+ }
- _TOKEN_CLEANUP();
+ _TOKEN_CLEANUP();
- TRACE(("Finished tokenizing input\n"))
+ TRACE(("Finished tokenizing input\n"))
- return 0;
+ return 0;
parsingerror:
- i++;
- _TOKEN_CLEANUP();
+ i++;
+ _TOKEN_CLEANUP();
- return -1;
+ return -1;
linelimit:
- i++;
- _TOKEN_CLEANUP();
+ i++;
+ _TOKEN_CLEANUP();
- return 0;
+ return 0;
}
static int parser_handle_eof(parser_t *self) {
- int64_t bufsize = 100;
-
- TRACE(
- ("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state))
-
- if (self->datalen != 0) return -1;
-
- switch (self->state) {
- case START_RECORD:
- case WHITESPACE_LINE:
- case EAT_CRNL_NOP:
- case EAT_LINE_COMMENT:
- return 0;
-
- case ESCAPE_IN_QUOTED_FIELD:
- case IN_QUOTED_FIELD:
- self->error_msg = (char *)malloc(bufsize);
- snprintf(self->error_msg, bufsize,
- "EOF inside string starting at row %" PRIu64,
- self->file_lines);
- return -1;
-
- case ESCAPED_CHAR:
- self->error_msg = (char *)malloc(bufsize);
- snprintf(self->error_msg, bufsize,
- "EOF following escape character");
- return -1;
-
- case IN_FIELD:
- case START_FIELD:
- case QUOTE_IN_QUOTED_FIELD:
- if (end_field(self) < 0) return -1;
- break;
-
- default:
- break;
- }
+ const size_t bufsize = 100;
- if (end_line(self) < 0)
- return -1;
- else
- return 0;
-}
+ TRACE(("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state))
-int parser_consume_rows(parser_t *self, size_t nrows) {
- int64_t offset, word_deletions;
- uint64_t char_count, i;
+ if (self->datalen != 0)
+ return -1;
- if (nrows > self->lines) {
- nrows = self->lines;
- }
+ switch (self->state) {
+ case START_RECORD:
+ case WHITESPACE_LINE:
+ case EAT_CRNL_NOP:
+ case EAT_LINE_COMMENT:
+ return 0;
- /* do nothing */
- if (nrows == 0) return 0;
+ case ESCAPE_IN_QUOTED_FIELD:
+ case IN_QUOTED_FIELD:
+ self->error_msg = (char *)malloc(bufsize);
+ snprintf(self->error_msg, bufsize,
+ "EOF inside string starting at row %" PRIu64, self->file_lines);
+ return -1;
- /* cannot guarantee that nrows + 1 has been observed */
- word_deletions = self->line_start[nrows - 1] + self->line_fields[nrows - 1];
- if (word_deletions >= 1) {
- char_count = (self->word_starts[word_deletions - 1] +
- strlen(self->words[word_deletions - 1]) + 1);
- } else {
- /* if word_deletions == 0 (i.e. this case) then char_count must
- * be 0 too, as no data needs to be skipped */
- char_count = 0;
- }
+ case ESCAPED_CHAR:
+ self->error_msg = (char *)malloc(bufsize);
+ snprintf(self->error_msg, bufsize, "EOF following escape character");
+ return -1;
- TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions,
- char_count));
+ case IN_FIELD:
+ case START_FIELD:
+ case QUOTE_IN_QUOTED_FIELD:
+ if (end_field(self) < 0)
+ return -1;
+ break;
- /* move stream, only if something to move */
- if (char_count < self->stream_len) {
- memmove(self->stream, (self->stream + char_count),
- self->stream_len - char_count);
- }
- /* buffer counts */
- self->stream_len -= char_count;
+ default:
+ break;
+ }
- /* move token metadata */
- // Note: We should always have words_len < word_deletions, so this
- // subtraction will remain appropriately-typed.
- for (i = 0; i < self->words_len - word_deletions; ++i) {
- offset = i + word_deletions;
+ if (end_line(self) < 0)
+ return -1;
+ else
+ return 0;
+}
- self->words[i] = self->words[offset] - char_count;
- self->word_starts[i] = self->word_starts[offset] - char_count;
- }
- self->words_len -= word_deletions;
-
- /* move current word pointer to stream */
- self->pword_start -= char_count;
- self->word_start -= char_count;
-
- /* move line metadata */
- // Note: We should always have self->lines - nrows + 1 >= 0, so this
- // subtraction will remain appropriately-typed.
- for (i = 0; i < self->lines - nrows + 1; ++i) {
- offset = i + nrows;
- self->line_start[i] = self->line_start[offset] - word_deletions;
- self->line_fields[i] = self->line_fields[offset];
- }
- self->lines -= nrows;
+int parser_consume_rows(parser_t *self, size_t nrows) {
+ if (nrows > self->lines) {
+ nrows = self->lines;
+ }
+ /* do nothing */
+ if (nrows == 0)
return 0;
+
+ /* cannot guarantee that nrows + 1 has been observed */
+ const int64_t word_deletions =
+ self->line_start[nrows - 1] + self->line_fields[nrows - 1];
+
+  /* if word_deletions == 0 then char_count must be 0 too,
+   * as no data needs to be skipped */
+ const uint64_t char_count =
+ word_deletions >= 1 ? (self->word_starts[word_deletions - 1] +
+ strlen(self->words[word_deletions - 1]) + 1)
+ : 0;
+
+ TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions,
+ char_count));
+
+ /* move stream, only if something to move */
+ if (char_count < self->stream_len) {
+ memmove(self->stream, (self->stream + char_count),
+ self->stream_len - char_count);
+ }
+ /* buffer counts */
+ self->stream_len -= char_count;
+
+ /* move token metadata */
+  // Note: We should always have word_deletions <= words_len, so this
+  // subtraction will remain appropriately-typed.
+ int64_t offset;
+ for (uint64_t i = 0; i < self->words_len - word_deletions; ++i) {
+ offset = i + word_deletions;
+
+ self->words[i] = self->words[offset] - char_count;
+ self->word_starts[i] = self->word_starts[offset] - char_count;
+ }
+ self->words_len -= word_deletions;
+
+ /* move current word pointer to stream */
+ self->pword_start -= char_count;
+ self->word_start -= char_count;
+
+ /* move line metadata */
+ // Note: We should always have self->lines - nrows + 1 >= 0, so this
+ // subtraction will remain appropriately-typed.
+ for (uint64_t i = 0; i < self->lines - nrows + 1; ++i) {
+ offset = i + nrows;
+ self->line_start[i] = self->line_start[offset] - word_deletions;
+ self->line_fields[i] = self->line_fields[offset];
+ }
+ self->lines -= nrows;
+
+ return 0;
}
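/*
 * Worked example (illustrative, not part of this patch): with two parsed
 * lines of two one-character fields each,
 *
 *   stream      = "a\0b\0c\0d\0"   words_len = 4   lines = 2
 *   word_starts = {0, 2, 4, 6}     line_start = {0, 2, 4}   line_fields = {2, 2, 0}
 *
 * parser_consume_rows(self, 1) gives word_deletions = 2 and char_count = 4,
 * so the stream is shifted to "c\0d\0", words/word_starts become
 * {stream+0, stream+2} / {0, 2}, line_start = {0, 2}, line_fields = {2, 0}
 * and lines = 1.
 */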
static size_t _next_pow2(size_t sz) {
- size_t result = 1;
- while (result < sz) result *= 2;
- return result;
+ size_t result = 1;
+ while (result < sz)
+ result *= 2;
+ return result;
}
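/*
 * Worked examples for _next_pow2 (illustrative, not part of this patch):
 *
 *   _next_pow2(0) == 1   // loop body never runs
 *   _next_pow2(1) == 1
 *   _next_pow2(5) == 8
 *   _next_pow2(8) == 8
 *
 * parser_trim_buffers below trims to _next_pow2(len) + 1, so e.g. 5 live
 * words are kept in a 9-slot words/word_starts allocation.
 */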
int parser_trim_buffers(parser_t *self) {
- /*
- Free memory
- */
- size_t new_cap;
- void *newptr;
-
- uint64_t i;
-
- /**
- * Before we free up space and trim, we should
- * save how many words we saw when parsing, if
- * it exceeds the maximum number we saw before.
- *
- * This is important for when we read in chunks,
- * so that we can inform subsequent chunk parsing
- * as to how many words we could possibly see.
- */
- if (self->words_cap > self->max_words_cap) {
- self->max_words_cap = self->words_cap;
- }
-
- /* trim words, word_starts */
- new_cap = _next_pow2(self->words_len) + 1;
- if (new_cap < self->words_cap) {
- TRACE(("parser_trim_buffers: new_cap < self->words_cap\n"));
- self->words = realloc(self->words, new_cap * sizeof(char *));
- if (self->words == NULL) {
- return PARSER_OUT_OF_MEMORY;
- }
- self->word_starts = realloc(self->word_starts,
- new_cap * sizeof(int64_t));
- if (self->word_starts == NULL) {
- return PARSER_OUT_OF_MEMORY;
- }
- self->words_cap = new_cap;
- }
-
- /* trim stream */
- new_cap = _next_pow2(self->stream_len) + 1;
- TRACE(
- ("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = "
+ /*
+ Free memory
+ */
+
+ /**
+ * Before we free up space and trim, we should
+ * save how many words we saw when parsing, if
+ * it exceeds the maximum number we saw before.
+ *
+ * This is important for when we read in chunks,
+ * so that we can inform subsequent chunk parsing
+ * as to how many words we could possibly see.
+ */
+ if (self->words_cap > self->max_words_cap) {
+ self->max_words_cap = self->words_cap;
+ }
+
+ /* trim words, word_starts */
+ size_t new_cap = _next_pow2(self->words_len) + 1;
+ if (new_cap < self->words_cap) {
+ TRACE(("parser_trim_buffers: new_cap < self->words_cap\n"));
+ self->words = realloc(self->words, new_cap * sizeof(char *));
+ if (self->words == NULL) {
+ return PARSER_OUT_OF_MEMORY;
+ }
+ self->word_starts = realloc(self->word_starts, new_cap * sizeof(int64_t));
+ if (self->word_starts == NULL) {
+ return PARSER_OUT_OF_MEMORY;
+ }
+ self->words_cap = new_cap;
+ }
+
+ /* trim stream */
+ new_cap = _next_pow2(self->stream_len) + 1;
+ TRACE(("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = "
"%zu\n",
new_cap, self->stream_cap, self->lines_cap));
- if (new_cap < self->stream_cap) {
- TRACE(
- ("parser_trim_buffers: new_cap < self->stream_cap, calling "
- "realloc\n"));
- newptr = realloc(self->stream, new_cap);
- if (newptr == NULL) {
- return PARSER_OUT_OF_MEMORY;
- } else {
- // Update the pointers in the self->words array (char **) if
- // `realloc`
- // moved the `self->stream` buffer. This block mirrors a similar
- // block in
- // `make_stream_space`.
- if (self->stream != newptr) {
- self->pword_start = (char *)newptr + self->word_start;
-
- for (i = 0; i < self->words_len; ++i) {
- self->words[i] = (char *)newptr + self->word_starts[i];
- }
- }
-
- self->stream = newptr;
- self->stream_cap = new_cap;
+ if (new_cap < self->stream_cap) {
+ TRACE(("parser_trim_buffers: new_cap < self->stream_cap, calling "
+ "realloc\n"));
+ void *newptr = realloc(self->stream, new_cap);
+ if (newptr == NULL) {
+ return PARSER_OUT_OF_MEMORY;
+ } else {
+      // Update the pointers in the self->words array (char **) if
+      // `realloc` moved the `self->stream` buffer.
+      // This block mirrors a similar block in `make_stream_space`.
+ if (self->stream != newptr) {
+ self->pword_start = (char *)newptr + self->word_start;
+
+ for (uint64_t i = 0; i < self->words_len; ++i) {
+ self->words[i] = (char *)newptr + self->word_starts[i];
}
+ }
+
+ self->stream = newptr;
+ self->stream_cap = new_cap;
}
+ }
- /* trim line_start, line_fields */
- new_cap = _next_pow2(self->lines) + 1;
- if (new_cap < self->lines_cap) {
- TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n"));
- newptr = realloc(self->line_start,
- new_cap * sizeof(int64_t));
- if (newptr == NULL) {
- return PARSER_OUT_OF_MEMORY;
- } else {
- self->line_start = newptr;
- }
- newptr = realloc(self->line_fields,
- new_cap * sizeof(int64_t));
- if (newptr == NULL) {
- return PARSER_OUT_OF_MEMORY;
- } else {
- self->line_fields = newptr;
- self->lines_cap = new_cap;
- }
+ /* trim line_start, line_fields */
+ new_cap = _next_pow2(self->lines) + 1;
+ if (new_cap < self->lines_cap) {
+ TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n"));
+ void *newptr = realloc(self->line_start, new_cap * sizeof(int64_t));
+ if (newptr == NULL) {
+ return PARSER_OUT_OF_MEMORY;
+ } else {
+ self->line_start = newptr;
+ }
+ newptr = realloc(self->line_fields, new_cap * sizeof(int64_t));
+ if (newptr == NULL) {
+ return PARSER_OUT_OF_MEMORY;
+ } else {
+ self->line_fields = newptr;
+ self->lines_cap = new_cap;
}
+ }
- return 0;
+ return 0;
}
/*
@@ -1338,65 +1272,61 @@ int parser_trim_buffers(parser_t *self) {
all : tokenize all the data vs. certain number of rows
*/
-int _tokenize_helper(parser_t *self, size_t nrows, int all,
- const char *encoding_errors) {
- int status = 0;
- uint64_t start_lines = self->lines;
-
- if (self->state == FINISHED) {
- return 0;
- }
-
- TRACE((
- "_tokenize_helper: Asked to tokenize %d rows, datapos=%d, datalen=%d\n",
- nrows, self->datapos, self->datalen));
+static int _tokenize_helper(parser_t *self, size_t nrows, int all,
+ const char *encoding_errors) {
+ int status = 0;
+ const uint64_t start_lines = self->lines;
- while (1) {
- if (!all && self->lines - start_lines >= nrows) break;
-
- if (self->datapos == self->datalen) {
- status = parser_buffer_bytes(self, self->chunksize,
- encoding_errors);
-
- if (status == REACHED_EOF) {
- // close out last line
- status = parser_handle_eof(self);
- self->state = FINISHED;
- break;
- } else if (status != 0) {
- return status;
- }
- }
-
- TRACE(
- ("_tokenize_helper: Trying to process %d bytes, datalen=%d, "
- "datapos= %d\n",
- self->datalen - self->datapos, self->datalen, self->datapos));
-
- status = tokenize_bytes(self, nrows, start_lines);
-
- if (status < 0) {
- // XXX
- TRACE(
- ("_tokenize_helper: Status %d returned from tokenize_bytes, "
- "breaking\n",
- status));
- status = -1;
- break;
- }
- }
- TRACE(("leaving tokenize_helper\n"));
- return status;
+ if (self->state == FINISHED) {
+ return 0;
+ }
+
+ TRACE(
+ ("_tokenize_helper: Asked to tokenize %d rows, datapos=%d, datalen=%d\n",
+ nrows, self->datapos, self->datalen));
+
+ while (1) {
+ if (!all && self->lines - start_lines >= nrows)
+ break;
+
+ if (self->datapos == self->datalen) {
+ status = parser_buffer_bytes(self, self->chunksize, encoding_errors);
+
+ if (status == REACHED_EOF) {
+ // close out last line
+ status = parser_handle_eof(self);
+ self->state = FINISHED;
+ break;
+ } else if (status != 0) {
+ return status;
+ }
+ }
+
+ TRACE(("_tokenize_helper: Trying to process %d bytes, datalen=%d, "
+ "datapos= %d\n",
+ self->datalen - self->datapos, self->datalen, self->datapos));
+
+ status = tokenize_bytes(self, nrows, start_lines);
+
+ if (status < 0) {
+ // XXX
+ TRACE(("_tokenize_helper: Status %d returned from tokenize_bytes, "
+ "breaking\n",
+ status));
+ status = -1;
+ break;
+ }
+ }
+ TRACE(("leaving tokenize_helper\n"));
+ return status;
}
int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) {
- int status = _tokenize_helper(self, nrows, 0, encoding_errors);
- return status;
+ return _tokenize_helper(self, nrows, 0, encoding_errors);
}
int tokenize_all_rows(parser_t *self, const char *encoding_errors) {
- int status = _tokenize_helper(self, -1, 1, encoding_errors);
- return status;
+ return _tokenize_helper(self, -1, 1, encoding_errors);
}
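For orientation, a standalone sketch (not pandas API) of the shape of the driver loop above: refill a fixed-size buffer whenever it is exhausted, stop at EOF or once the requested number of rows has been seen. It only counts newline-terminated rows; the buffer size and count_rows name are illustrative.

#include <stdio.h>

// Count up to nrows newline-terminated rows from fp, reading in chunks the
// way _tokenize_helper refills its data buffer before calling tokenize_bytes.
static long count_rows(FILE *fp, long nrows) {
  char buf[4096];
  long lines = 0;
  while (lines < nrows) {
    const size_t got = fread(buf, 1, sizeof buf, fp); // refill the "buffer"
    if (got == 0)
      break; // EOF: close out whatever has been read so far
    for (size_t i = 0; i < got && lines < nrows; ++i)
      if (buf[i] == '\n')
        ++lines;
  }
  return lines;
}

int main(void) {
  printf("saw %ld rows\n", count_rows(stdin, 10));
  return 0;
}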
/*
@@ -1414,15 +1344,15 @@ int tokenize_all_rows(parser_t *self, const char *encoding_errors) {
* leaves the value of *val unmodified.
*/
int to_boolean(const char *item, uint8_t *val) {
- if (strcasecmp(item, "TRUE") == 0) {
- *val = 1;
- return 0;
- } else if (strcasecmp(item, "FALSE") == 0) {
- *val = 0;
- return 0;
- }
+ if (strcasecmp(item, "TRUE") == 0) {
+ *val = 1;
+ return 0;
+ } else if (strcasecmp(item, "FALSE") == 0) {
+ *val = 0;
+ return 0;
+ }
- return -1;
+ return -1;
}
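A usage sketch of the contract above: 0 is returned and *val is set only for a case-insensitive "true"/"false"; anything else returns -1 and leaves *val untouched. strcasecmp is POSIX, declared in <strings.h>; the parse_bool name here is an illustrative stand-in.

#include <stdint.h>
#include <stdio.h>
#include <strings.h>

// Same contract as to_boolean above.
static int parse_bool(const char *item, uint8_t *val) {
  if (strcasecmp(item, "TRUE") == 0) {
    *val = 1;
    return 0;
  } else if (strcasecmp(item, "FALSE") == 0) {
    *val = 0;
    return 0;
  }
  return -1;
}

int main(void) {
  uint8_t val = 2;
  printf("%d -> val=%d\n", parse_bool("False", &val), val); // 0 -> val=0
  printf("%d -> val=%d\n", parse_bool("maybe", &val), val); // -1, val unchanged
  return 0;
}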
// ---------------------------------------------------------------------------
@@ -1472,307 +1402,321 @@ int to_boolean(const char *item, uint8_t *val) {
// * Add tsep argument for thousands separator
//
-// pessimistic but quick assessment,
-// assuming that each decimal digit requires 4 bits to store
-const int max_int_decimal_digits = (sizeof(unsigned int) * 8) / 4;
-
double xstrtod(const char *str, char **endptr, char decimal, char sci,
char tsep, int skip_trailing, int *error, int *maybe_int) {
- double number;
- unsigned int i_number = 0;
- int exponent;
- int negative;
- char *p = (char *)str;
- double p10;
- int n;
- int num_digits;
- int num_decimals;
-
- if (maybe_int != NULL) *maybe_int = 1;
- // Skip leading whitespace.
- while (isspace_ascii(*p)) p++;
-
- // Handle optional sign.
- negative = 0;
- switch (*p) {
- case '-':
- negative = 1; // Fall through to increment position.
- case '+':
- p++;
- }
-
- exponent = 0;
- num_digits = 0;
- num_decimals = 0;
-
- // Process string of digits.
- while (isdigit_ascii(*p) && num_digits <= max_int_decimal_digits) {
- i_number = i_number * 10 + (*p - '0');
- p++;
- num_digits++;
-
- p += (tsep != '\0' && *p == tsep);
- }
- number = i_number;
-
- if (num_digits > max_int_decimal_digits) {
- // process what's left as double
- while (isdigit_ascii(*p)) {
- number = number * 10. + (*p - '0');
- p++;
- num_digits++;
-
- p += (tsep != '\0' && *p == tsep);
- }
- }
-
- // Process decimal part.
- if (*p == decimal) {
- if (maybe_int != NULL) *maybe_int = 0;
- p++;
-
- while (isdigit_ascii(*p)) {
- number = number * 10. + (*p - '0');
- p++;
- num_digits++;
- num_decimals++;
- }
-
- exponent -= num_decimals;
- }
-
- if (num_digits == 0) {
- *error = ERANGE;
- return 0.0;
- }
-
- // Correct for sign.
- if (negative) number = -number;
-
- // Process an exponent string.
- if (toupper_ascii(*p) == toupper_ascii(sci)) {
- if (maybe_int != NULL) *maybe_int = 0;
-
- // Handle optional sign.
- negative = 0;
- switch (*++p) {
- case '-':
- negative = 1; // Fall through to increment pos.
- case '+':
- p++;
- }
-
- // Process string of digits.
- num_digits = 0;
- n = 0;
- while (isdigit_ascii(*p)) {
- n = n * 10 + (*p - '0');
- num_digits++;
- p++;
- }
-
- if (negative)
- exponent -= n;
- else
- exponent += n;
+ const char *p = str;
+ if (maybe_int != NULL)
+ *maybe_int = 1;
+ // Skip leading whitespace.
+ while (isspace_ascii(*p))
+ p++;
+
+ // Handle optional sign.
+ int negative = 0;
+ switch (*p) {
+ case '-':
+ negative = 1;
+ PD_FALLTHROUGH; // Fall through to increment position.
+ case '+':
+ p++;
+ break;
+ }
+
+ int exponent = 0;
+ int num_digits = 0;
+ int num_decimals = 0;
+
+ // pessimistic but quick assessment,
+ // assuming that each decimal digit requires 4 bits to store
+ // TODO: C23 has UINT64_WIDTH macro that can be used at compile time
+ const int max_int_decimal_digits = (sizeof(unsigned int) * 8) / 4;
+
+ // Process string of digits.
+ unsigned int i_number = 0;
+ while (isdigit_ascii(*p) && num_digits <= max_int_decimal_digits) {
+ i_number = i_number * 10 + (*p - '0');
+ p++;
+ num_digits++;
+
+ p += (tsep != '\0' && *p == tsep);
+ }
+ double number = i_number;
+
+ if (num_digits > max_int_decimal_digits) {
+ // process what's left as double
+ while (isdigit_ascii(*p)) {
+ number = number * 10. + (*p - '0');
+ p++;
+ num_digits++;
- // If no digits, after the 'e'/'E', un-consume it
- if (num_digits == 0) p--;
+ p += (tsep != '\0' && *p == tsep);
}
+ }
- if (exponent < DBL_MIN_EXP || exponent > DBL_MAX_EXP) {
- *error = ERANGE;
- return HUGE_VAL;
- }
+ // Process decimal part.
+ if (*p == decimal) {
+ if (maybe_int != NULL)
+ *maybe_int = 0;
+ p++;
- // Scale the result.
- p10 = 10.;
- n = exponent;
- if (n < 0) n = -n;
- while (n) {
- if (n & 1) {
- if (exponent < 0)
- number /= p10;
- else
- number *= p10;
- }
- n >>= 1;
- p10 *= p10;
+ while (isdigit_ascii(*p)) {
+ number = number * 10. + (*p - '0');
+ p++;
+ num_digits++;
+ num_decimals++;
}
- if (number == HUGE_VAL) {
- *error = ERANGE;
- }
+ exponent -= num_decimals;
+ }
- if (skip_trailing) {
- // Skip trailing whitespace.
- while (isspace_ascii(*p)) p++;
- }
+ if (num_digits == 0) {
+ *error = ERANGE;
+ return 0.0;
+ }
- if (endptr) *endptr = p;
- return number;
-}
+ // Correct for sign.
+ if (negative)
+ number = -number;
-double precise_xstrtod(const char *str, char **endptr, char decimal,
- char sci, char tsep, int skip_trailing,
- int *error, int *maybe_int) {
- double number;
- int exponent;
- int negative;
- char *p = (char *)str;
- int num_digits;
- int num_decimals;
- int max_digits = 17;
- int n;
-
- if (maybe_int != NULL) *maybe_int = 1;
- // Cache powers of 10 in memory.
- static double e[] = {
- 1., 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9,
- 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19,
- 1e20, 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29,
- 1e30, 1e31, 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39,
- 1e40, 1e41, 1e42, 1e43, 1e44, 1e45, 1e46, 1e47, 1e48, 1e49,
- 1e50, 1e51, 1e52, 1e53, 1e54, 1e55, 1e56, 1e57, 1e58, 1e59,
- 1e60, 1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67, 1e68, 1e69,
- 1e70, 1e71, 1e72, 1e73, 1e74, 1e75, 1e76, 1e77, 1e78, 1e79,
- 1e80, 1e81, 1e82, 1e83, 1e84, 1e85, 1e86, 1e87, 1e88, 1e89,
- 1e90, 1e91, 1e92, 1e93, 1e94, 1e95, 1e96, 1e97, 1e98, 1e99,
- 1e100, 1e101, 1e102, 1e103, 1e104, 1e105, 1e106, 1e107, 1e108, 1e109,
- 1e110, 1e111, 1e112, 1e113, 1e114, 1e115, 1e116, 1e117, 1e118, 1e119,
- 1e120, 1e121, 1e122, 1e123, 1e124, 1e125, 1e126, 1e127, 1e128, 1e129,
- 1e130, 1e131, 1e132, 1e133, 1e134, 1e135, 1e136, 1e137, 1e138, 1e139,
- 1e140, 1e141, 1e142, 1e143, 1e144, 1e145, 1e146, 1e147, 1e148, 1e149,
- 1e150, 1e151, 1e152, 1e153, 1e154, 1e155, 1e156, 1e157, 1e158, 1e159,
- 1e160, 1e161, 1e162, 1e163, 1e164, 1e165, 1e166, 1e167, 1e168, 1e169,
- 1e170, 1e171, 1e172, 1e173, 1e174, 1e175, 1e176, 1e177, 1e178, 1e179,
- 1e180, 1e181, 1e182, 1e183, 1e184, 1e185, 1e186, 1e187, 1e188, 1e189,
- 1e190, 1e191, 1e192, 1e193, 1e194, 1e195, 1e196, 1e197, 1e198, 1e199,
- 1e200, 1e201, 1e202, 1e203, 1e204, 1e205, 1e206, 1e207, 1e208, 1e209,
- 1e210, 1e211, 1e212, 1e213, 1e214, 1e215, 1e216, 1e217, 1e218, 1e219,
- 1e220, 1e221, 1e222, 1e223, 1e224, 1e225, 1e226, 1e227, 1e228, 1e229,
- 1e230, 1e231, 1e232, 1e233, 1e234, 1e235, 1e236, 1e237, 1e238, 1e239,
- 1e240, 1e241, 1e242, 1e243, 1e244, 1e245, 1e246, 1e247, 1e248, 1e249,
- 1e250, 1e251, 1e252, 1e253, 1e254, 1e255, 1e256, 1e257, 1e258, 1e259,
- 1e260, 1e261, 1e262, 1e263, 1e264, 1e265, 1e266, 1e267, 1e268, 1e269,
- 1e270, 1e271, 1e272, 1e273, 1e274, 1e275, 1e276, 1e277, 1e278, 1e279,
- 1e280, 1e281, 1e282, 1e283, 1e284, 1e285, 1e286, 1e287, 1e288, 1e289,
- 1e290, 1e291, 1e292, 1e293, 1e294, 1e295, 1e296, 1e297, 1e298, 1e299,
- 1e300, 1e301, 1e302, 1e303, 1e304, 1e305, 1e306, 1e307, 1e308};
-
- // Skip leading whitespace.
- while (isspace_ascii(*p)) p++;
+ // Process an exponent string.
+ if (toupper_ascii(*p) == toupper_ascii(sci)) {
+ if (maybe_int != NULL)
+ *maybe_int = 0;
// Handle optional sign.
negative = 0;
- switch (*p) {
- case '-':
- negative = 1; // Fall through to increment position.
- case '+':
- p++;
+ switch (*++p) {
+ case '-':
+ negative = 1;
+ PD_FALLTHROUGH; // Fall through to increment position.
+ case '+':
+ p++;
+ break;
}
- number = 0.;
- exponent = 0;
- num_digits = 0;
- num_decimals = 0;
-
// Process string of digits.
+ num_digits = 0;
+ int n = 0;
while (isdigit_ascii(*p)) {
- if (num_digits < max_digits) {
- number = number * 10. + (*p - '0');
- num_digits++;
- } else {
- ++exponent;
- }
-
- p++;
- p += (tsep != '\0' && *p == tsep);
+ n = n * 10 + (*p - '0');
+ num_digits++;
+ p++;
}
- // Process decimal part
- if (*p == decimal) {
- if (maybe_int != NULL) *maybe_int = 0;
- p++;
+ if (negative)
+ exponent -= n;
+ else
+ exponent += n;
+
+ // If no digits, after the 'e'/'E', un-consume it
+ if (num_digits == 0)
+ p--;
+ }
+
+ if (exponent < DBL_MIN_EXP || exponent > DBL_MAX_EXP) {
+ *error = ERANGE;
+ return HUGE_VAL;
+ }
+
+ // Scale the result.
+ double p10 = 10.;
+ int n = exponent;
+ if (n < 0)
+ n = -n;
+ while (n) {
+ if (n & 1) {
+ if (exponent < 0)
+ number /= p10;
+ else
+ number *= p10;
+ }
+ n >>= 1;
+ p10 *= p10;
+ }
+
+ if (number == HUGE_VAL) {
+ *error = ERANGE;
+ }
+
+ if (skip_trailing) {
+ // Skip trailing whitespace.
+ while (isspace_ascii(*p))
+ p++;
+ }
+
+ if (endptr)
+ *endptr = (char *)p;
+ return number;
+}
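A note on the scaling loop near the end of xstrtod above: it applies the accumulated decimal exponent by repeated squaring, so it needs only O(log |exponent|) multiplications. A standalone sketch of the same idea (nothing beyond the C standard library is assumed):

#include <stdio.h>

// Multiply or divide x by 10^exp using binary exponentiation, mirroring the
// while (n) { if (n & 1) ...; n >>= 1; p10 *= p10; } loop above.
static double scale_by_pow10(double x, int exp) {
  double p10 = 10.;
  int n = exp < 0 ? -exp : exp;
  while (n) {
    if (n & 1) {
      if (exp < 0)
        x /= p10;
      else
        x *= p10;
    }
    n >>= 1;
    p10 *= p10;
  }
  return x;
}

int main(void) {
  printf("%g\n", scale_by_pow10(1.5, 3));  // 1500
  printf("%g\n", scale_by_pow10(2.0, -2)); // 0.02
  return 0;
}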
- while (num_digits < max_digits && isdigit_ascii(*p)) {
- number = number * 10. + (*p - '0');
- p++;
- num_digits++;
- num_decimals++;
- }
+double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
+ char tsep, int skip_trailing, int *error,
+ int *maybe_int) {
+ const char *p = str;
+ const int max_digits = 17;
+
+ if (maybe_int != NULL)
+ *maybe_int = 1;
+ // Cache powers of 10 in memory.
+ static double e[] = {
+ 1., 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9,
+ 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19,
+ 1e20, 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29,
+ 1e30, 1e31, 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39,
+ 1e40, 1e41, 1e42, 1e43, 1e44, 1e45, 1e46, 1e47, 1e48, 1e49,
+ 1e50, 1e51, 1e52, 1e53, 1e54, 1e55, 1e56, 1e57, 1e58, 1e59,
+ 1e60, 1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67, 1e68, 1e69,
+ 1e70, 1e71, 1e72, 1e73, 1e74, 1e75, 1e76, 1e77, 1e78, 1e79,
+ 1e80, 1e81, 1e82, 1e83, 1e84, 1e85, 1e86, 1e87, 1e88, 1e89,
+ 1e90, 1e91, 1e92, 1e93, 1e94, 1e95, 1e96, 1e97, 1e98, 1e99,
+ 1e100, 1e101, 1e102, 1e103, 1e104, 1e105, 1e106, 1e107, 1e108, 1e109,
+ 1e110, 1e111, 1e112, 1e113, 1e114, 1e115, 1e116, 1e117, 1e118, 1e119,
+ 1e120, 1e121, 1e122, 1e123, 1e124, 1e125, 1e126, 1e127, 1e128, 1e129,
+ 1e130, 1e131, 1e132, 1e133, 1e134, 1e135, 1e136, 1e137, 1e138, 1e139,
+ 1e140, 1e141, 1e142, 1e143, 1e144, 1e145, 1e146, 1e147, 1e148, 1e149,
+ 1e150, 1e151, 1e152, 1e153, 1e154, 1e155, 1e156, 1e157, 1e158, 1e159,
+ 1e160, 1e161, 1e162, 1e163, 1e164, 1e165, 1e166, 1e167, 1e168, 1e169,
+ 1e170, 1e171, 1e172, 1e173, 1e174, 1e175, 1e176, 1e177, 1e178, 1e179,
+ 1e180, 1e181, 1e182, 1e183, 1e184, 1e185, 1e186, 1e187, 1e188, 1e189,
+ 1e190, 1e191, 1e192, 1e193, 1e194, 1e195, 1e196, 1e197, 1e198, 1e199,
+ 1e200, 1e201, 1e202, 1e203, 1e204, 1e205, 1e206, 1e207, 1e208, 1e209,
+ 1e210, 1e211, 1e212, 1e213, 1e214, 1e215, 1e216, 1e217, 1e218, 1e219,
+ 1e220, 1e221, 1e222, 1e223, 1e224, 1e225, 1e226, 1e227, 1e228, 1e229,
+ 1e230, 1e231, 1e232, 1e233, 1e234, 1e235, 1e236, 1e237, 1e238, 1e239,
+ 1e240, 1e241, 1e242, 1e243, 1e244, 1e245, 1e246, 1e247, 1e248, 1e249,
+ 1e250, 1e251, 1e252, 1e253, 1e254, 1e255, 1e256, 1e257, 1e258, 1e259,
+ 1e260, 1e261, 1e262, 1e263, 1e264, 1e265, 1e266, 1e267, 1e268, 1e269,
+ 1e270, 1e271, 1e272, 1e273, 1e274, 1e275, 1e276, 1e277, 1e278, 1e279,
+ 1e280, 1e281, 1e282, 1e283, 1e284, 1e285, 1e286, 1e287, 1e288, 1e289,
+ 1e290, 1e291, 1e292, 1e293, 1e294, 1e295, 1e296, 1e297, 1e298, 1e299,
+ 1e300, 1e301, 1e302, 1e303, 1e304, 1e305, 1e306, 1e307, 1e308};
+
+ // Skip leading whitespace.
+ while (isspace_ascii(*p))
+ p++;
+
+ // Handle optional sign.
+ int negative = 0;
+ switch (*p) {
+ case '-':
+ negative = 1;
+ PD_FALLTHROUGH; // Fall through to increment position.
+ case '+':
+ p++;
+ break;
+ }
+
+ double number = 0.;
+ int exponent = 0;
+ int num_digits = 0;
+ int num_decimals = 0;
+
+ // Process string of digits.
+ while (isdigit_ascii(*p)) {
+ if (num_digits < max_digits) {
+ number = number * 10. + (*p - '0');
+ num_digits++;
+ } else {
+ ++exponent;
+ }
- if (num_digits >= max_digits) // Consume extra decimal digits.
- while (isdigit_ascii(*p)) ++p;
+ p++;
+ p += (tsep != '\0' && *p == tsep);
+ }
- exponent -= num_decimals;
- }
+ // Process decimal part
+ if (*p == decimal) {
+ if (maybe_int != NULL)
+ *maybe_int = 0;
+ p++;
- if (num_digits == 0) {
- *error = ERANGE;
- return 0.0;
+ while (num_digits < max_digits && isdigit_ascii(*p)) {
+ number = number * 10. + (*p - '0');
+ p++;
+ num_digits++;
+ num_decimals++;
}
- // Correct for sign.
- if (negative) number = -number;
+ if (num_digits >= max_digits) // Consume extra decimal digits.
+ while (isdigit_ascii(*p))
+ ++p;
- // Process an exponent string.
- if (toupper_ascii(*p) == toupper_ascii(sci)) {
- if (maybe_int != NULL) *maybe_int = 0;
+ exponent -= num_decimals;
+ }
- // Handle optional sign
- negative = 0;
- switch (*++p) {
- case '-':
- negative = 1; // Fall through to increment pos.
- case '+':
- p++;
- }
+ if (num_digits == 0) {
+ *error = ERANGE;
+ return 0.0;
+ }
- // Process string of digits.
- num_digits = 0;
- n = 0;
- while (num_digits < max_digits && isdigit_ascii(*p)) {
- n = n * 10 + (*p - '0');
- num_digits++;
- p++;
- }
+ // Correct for sign.
+ if (negative)
+ number = -number;
- if (negative)
- exponent -= n;
- else
- exponent += n;
+ // Process an exponent string.
+ if (toupper_ascii(*p) == toupper_ascii(sci)) {
+ if (maybe_int != NULL)
+ *maybe_int = 0;
- // If no digits after the 'e'/'E', un-consume it.
- if (num_digits == 0) p--;
+ // Handle optional sign
+ negative = 0;
+ switch (*++p) {
+ case '-':
+ negative = 1;
+ PD_FALLTHROUGH; // Fall through to increment position.
+ case '+':
+ p++;
+ break;
}
- if (exponent > 308) {
- *error = ERANGE;
- return HUGE_VAL;
- } else if (exponent > 0) {
- number *= e[exponent];
- } else if (exponent < -308) { // Subnormal
- if (exponent < -616) { // Prevent invalid array access.
- number = 0.;
- } else {
- number /= e[-308 - exponent];
- number /= e[308];
- }
+ // Process string of digits.
+ num_digits = 0;
+ int n = 0;
+ while (num_digits < max_digits && isdigit_ascii(*p)) {
+ n = n * 10 + (*p - '0');
+ num_digits++;
+ p++;
+ }
+ if (negative)
+ exponent -= n;
+ else
+ exponent += n;
+
+ // If no digits after the 'e'/'E', un-consume it.
+ if (num_digits == 0)
+ p--;
+ }
+
+ if (exponent > 308) {
+ *error = ERANGE;
+ return HUGE_VAL;
+ } else if (exponent > 0) {
+ number *= e[exponent];
+ } else if (exponent < -308) { // Subnormal
+ if (exponent < -616) { // Prevent invalid array access.
+ number = 0.;
} else {
- number /= e[-exponent];
+ number /= e[-308 - exponent];
+ number /= e[308];
}
- if (number == HUGE_VAL || number == -HUGE_VAL) *error = ERANGE;
+ } else {
+ number /= e[-exponent];
+ }
- if (skip_trailing) {
- // Skip trailing whitespace.
- while (isspace_ascii(*p)) p++;
- }
+ if (number == HUGE_VAL || number == -HUGE_VAL)
+ *error = ERANGE;
- if (endptr) *endptr = p;
- return number;
+ if (skip_trailing) {
+ // Skip trailing whitespace.
+ while (isspace_ascii(*p))
+ p++;
+ }
+
+ if (endptr)
+ *endptr = (char *)p;
+ return number;
}
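On the subnormal branch of precise_xstrtod above: for exponents below -308 the scale factor 10^-exponent is itself not representable as a double, so the code divides in two steps instead of indexing past the cached table. A standalone sketch of the same idea, using pow() in place of the e[] lookup table:

#include <math.h>
#include <stdio.h>

// Scale x by 10^exp for very negative exponents without forming an
// unrepresentable intermediate power, mirroring the two-step division above.
static double scale_subnormal(double x, int exp) {
  if (exp >= -308)
    return x / pow(10.0, -exp);
  if (exp < -616) // even the two-step form underflows to zero
    return 0.0;
  x /= pow(10.0, -308 - exp); // first chunk of the exponent
  x /= pow(10.0, 308);        // remaining factor of 10^308
  return x;
}

int main(void) {
  printf("%g\n", scale_subnormal(5.0, -320)); // ~5e-320, a subnormal double
  return 0;
}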
/* copy a decimal number string with `decimal`, `tsep` as decimal point
@@ -1781,306 +1725,302 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
with a call to `free`.
*/
-char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
- char tsep) {
- const char *p = s;
- size_t length = strlen(s);
- char *s_copy = malloc(length + 1);
- char *dst = s_copy;
- // Skip leading whitespace.
- while (isspace_ascii(*p)) p++;
- // Copy Leading sign
+static char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
+ char tsep) {
+ const char *p = s;
+ const size_t length = strlen(s);
+ char *s_copy = malloc(length + 1);
+ char *dst = s_copy;
+ // Skip leading whitespace.
+ while (isspace_ascii(*p))
+ p++;
+ // Copy Leading sign
+ if (*p == '+' || *p == '-') {
+ *dst++ = *p++;
+ }
+ // Copy integer part dropping `tsep`
+ while (isdigit_ascii(*p)) {
+ *dst++ = *p++;
+ p += (tsep != '\0' && *p == tsep);
+ }
+ // Replace `decimal` with '.'
+ if (*p == decimal) {
+ *dst++ = '.';
+ p++;
+ }
+ // Copy fractional part after decimal (if any)
+ while (isdigit_ascii(*p)) {
+ *dst++ = *p++;
+ }
+ // Copy exponent if any
+ if (toupper_ascii(*p) == toupper_ascii('E')) {
+ *dst++ = *p++;
+ // Copy leading exponent sign (if any)
if (*p == '+' || *p == '-') {
- *dst++ = *p++;
+ *dst++ = *p++;
}
- // Copy integer part dropping `tsep`
+ // Copy exponent digits
while (isdigit_ascii(*p)) {
- *dst++ = *p++;
- p += (tsep != '\0' && *p == tsep);
- }
- // Replace `decimal` with '.'
- if (*p == decimal) {
- *dst++ = '.';
- p++;
+ *dst++ = *p++;
}
- // Copy fractional part after decimal (if any)
- while (isdigit_ascii(*p)) {
- *dst++ = *p++;
- }
- // Copy exponent if any
- if (toupper_ascii(*p) == toupper_ascii('E')) {
- *dst++ = *p++;
- // Copy leading exponent sign (if any)
- if (*p == '+' || *p == '-') {
- *dst++ = *p++;
- }
- // Copy exponent digits
- while (isdigit_ascii(*p)) {
- *dst++ = *p++;
- }
- }
- *dst++ = '\0'; // terminate
- if (endpos != NULL)
- *endpos = (char *)p;
- return s_copy;
+ }
+ *dst++ = '\0'; // terminate
+ if (endpos != NULL)
+ *endpos = (char *)p;
+ return s_copy;
}
-
-double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
- int skip_trailing, int *error, int *maybe_int) {
- // 'normalize' representation to C-locale; replace decimal with '.' and
- // remove thousands separator.
- char *endptr;
- char *pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep);
- // This is called from a nogil block in parsers.pyx
- // so need to explicitly get GIL before Python calls
- PyGILState_STATE gstate;
- gstate = PyGILState_Ensure();
- char *endpc;
- double r = PyOS_string_to_double(pc, &endpc, 0);
- // PyOS_string_to_double needs to consume the whole string
- if (endpc == pc + strlen(pc)) {
- if (q != NULL) {
- // report endptr from source string (p)
- *q = endptr;
- }
- } else {
- *error = -1;
- if (q != NULL) {
- // p and pc are different len due to tsep removal. Can't report
- // how much it has consumed of p. Just rewind to beginning.
- *q = (char *)p; // TODO(willayd): this could be undefined behavior
- }
- }
- if (maybe_int != NULL) *maybe_int = 0;
- if (PyErr_Occurred() != NULL) *error = -1;
- else if (r == Py_HUGE_VAL) *error = (int)Py_HUGE_VAL;
- PyErr_Clear();
-
- PyGILState_Release(gstate);
- free(pc);
- if (skip_trailing && q != NULL && *q != p) {
- while (isspace_ascii(**q)) {
- (*q)++;
- }
- }
- return r;
+double round_trip(const char *p, char **q, char decimal, char Py_UNUSED(sci),
+ char tsep, int skip_trailing, int *error, int *maybe_int) {
+ // 'normalize' representation to C-locale; replace decimal with '.' and
+ // remove thousands separator.
+ char *endptr;
+ char *pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep);
+ // This is called from a nogil block in parsers.pyx
+ // so need to explicitly get GIL before Python calls
+ PyGILState_STATE gstate = PyGILState_Ensure();
+ char *endpc;
+ const double r = PyOS_string_to_double(pc, &endpc, 0);
+ // PyOS_string_to_double needs to consume the whole string
+ if (endpc == pc + strlen(pc)) {
+ if (q != NULL) {
+ // report endptr from source string (p)
+ *q = endptr;
+ }
+ } else {
+ *error = -1;
+ if (q != NULL) {
+ // p and pc are different len due to tsep removal. Can't report
+ // how much it has consumed of p. Just rewind to beginning.
+ *q = (char *)p; // TODO(willayd): this could be undefined behavior
+ }
+ }
+ if (maybe_int != NULL)
+ *maybe_int = 0;
+ if (PyErr_Occurred() != NULL)
+ *error = -1;
+ else if (r == Py_HUGE_VAL)
+ *error = (int)Py_HUGE_VAL;
+ PyErr_Clear();
+
+ PyGILState_Release(gstate);
+ free(pc);
+ if (skip_trailing && q != NULL && *q != p) {
+ while (isspace_ascii(**q)) {
+ (*q)++;
+ }
+ }
+ return r;
}
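A minimal standalone program (compile and link against CPython) illustrating the PyOS_string_to_double contract that round_trip above relies on: the result is only accepted when the whole normalized string was consumed. This is a sketch, not pandas code.

#include <Python.h>
#include <stdio.h>
#include <string.h>

int main(void) {
  Py_Initialize();
  const char *s = "1.5e3";
  char *end = NULL;
  // Locale-independent parse; with a NULL overflow_exception it returns
  // +/-HUGE_VAL on overflow instead of raising.
  const double v = PyOS_string_to_double(s, &end, NULL);
  // round_trip above only keeps the value when `end` reached the NUL,
  // i.e. the whole tsep-stripped, '.'-normalized copy was consumed.
  printf("%g, fully consumed: %d\n", v, end == s + strlen(s));
  Py_Finalize();
  return 0;
}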
// End of xstrtod code
// ---------------------------------------------------------------------------
void uint_state_init(uint_state *self) {
- self->seen_sint = 0;
- self->seen_uint = 0;
- self->seen_null = 0;
+ self->seen_sint = 0;
+ self->seen_uint = 0;
+ self->seen_null = 0;
}
int uint64_conflict(uint_state *self) {
- return self->seen_uint && (self->seen_sint || self->seen_null);
+ return self->seen_uint && (self->seen_sint || self->seen_null);
}
int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
int *error, char tsep) {
- const char *p = p_item;
- int isneg = 0;
- int64_t number = 0;
- int d;
+ const char *p = p_item;
+ // Skip leading spaces.
+ while (isspace_ascii(*p)) {
+ ++p;
+ }
+
+ // Handle sign.
+ const bool isneg = *p == '-' ? true : false;
+ // Handle sign.
+ if (isneg || (*p == '+')) {
+ p++;
+ }
+
+ // Check that there is a first digit.
+ if (!isdigit_ascii(*p)) {
+ // Error...
+ *error = ERROR_NO_DIGITS;
+ return 0;
+ }
- // Skip leading spaces.
- while (isspace_ascii(*p)) {
- ++p;
- }
+ int64_t number = 0;
+ if (isneg) {
+ // If number is greater than pre_min, at least one more digit
+ // can be processed without overflowing.
+ int dig_pre_min = -(int_min % 10);
+ int64_t pre_min = int_min / 10;
- // Handle sign.
- if (*p == '-') {
- isneg = 1;
- ++p;
- } else if (*p == '+') {
- p++;
+ // Process the digits.
+ char d = *p;
+ if (tsep != '\0') {
+ while (1) {
+ if (d == tsep) {
+ d = *++p;
+ continue;
+ } else if (!isdigit_ascii(d)) {
+ break;
+ }
+ if ((number > pre_min) ||
+ ((number == pre_min) && (d - '0' <= dig_pre_min))) {
+ number = number * 10 - (d - '0');
+ d = *++p;
+ } else {
+ *error = ERROR_OVERFLOW;
+ return 0;
+ }
+ }
+ } else {
+ while (isdigit_ascii(d)) {
+ if ((number > pre_min) ||
+ ((number == pre_min) && (d - '0' <= dig_pre_min))) {
+ number = number * 10 - (d - '0');
+ d = *++p;
+ } else {
+ *error = ERROR_OVERFLOW;
+ return 0;
+ }
+ }
}
+ } else {
+ // If number is less than pre_max, at least one more digit
+ // can be processed without overflowing.
+ int64_t pre_max = int_max / 10;
+ int dig_pre_max = int_max % 10;
- // Check that there is a first digit.
- if (!isdigit_ascii(*p)) {
- // Error...
- *error = ERROR_NO_DIGITS;
- return 0;
- }
+ // Process the digits.
+ char d = *p;
+ if (tsep != '\0') {
+ while (1) {
+ if (d == tsep) {
+ d = *++p;
+ continue;
+ } else if (!isdigit_ascii(d)) {
+ break;
+ }
+ if ((number < pre_max) ||
+ ((number == pre_max) && (d - '0' <= dig_pre_max))) {
+ number = number * 10 + (d - '0');
+ d = *++p;
- if (isneg) {
- // If number is greater than pre_min, at least one more digit
- // can be processed without overflowing.
- int dig_pre_min = -(int_min % 10);
- int64_t pre_min = int_min / 10;
-
- // Process the digits.
- d = *p;
- if (tsep != '\0') {
- while (1) {
- if (d == tsep) {
- d = *++p;
- continue;
- } else if (!isdigit_ascii(d)) {
- break;
- }
- if ((number > pre_min) ||
- ((number == pre_min) && (d - '0' <= dig_pre_min))) {
- number = number * 10 - (d - '0');
- d = *++p;
- } else {
- *error = ERROR_OVERFLOW;
- return 0;
- }
- }
} else {
- while (isdigit_ascii(d)) {
- if ((number > pre_min) ||
- ((number == pre_min) && (d - '0' <= dig_pre_min))) {
- number = number * 10 - (d - '0');
- d = *++p;
- } else {
- *error = ERROR_OVERFLOW;
- return 0;
- }
- }
+ *error = ERROR_OVERFLOW;
+ return 0;
}
+ }
} else {
- // If number is less than pre_max, at least one more digit
- // can be processed without overflowing.
- int64_t pre_max = int_max / 10;
- int dig_pre_max = int_max % 10;
-
- // Process the digits.
- d = *p;
- if (tsep != '\0') {
- while (1) {
- if (d == tsep) {
- d = *++p;
- continue;
- } else if (!isdigit_ascii(d)) {
- break;
- }
- if ((number < pre_max) ||
- ((number == pre_max) && (d - '0' <= dig_pre_max))) {
- number = number * 10 + (d - '0');
- d = *++p;
-
- } else {
- *error = ERROR_OVERFLOW;
- return 0;
- }
- }
+ while (isdigit_ascii(d)) {
+ if ((number < pre_max) ||
+ ((number == pre_max) && (d - '0' <= dig_pre_max))) {
+ number = number * 10 + (d - '0');
+ d = *++p;
+
} else {
- while (isdigit_ascii(d)) {
- if ((number < pre_max) ||
- ((number == pre_max) && (d - '0' <= dig_pre_max))) {
- number = number * 10 + (d - '0');
- d = *++p;
-
- } else {
- *error = ERROR_OVERFLOW;
- return 0;
- }
- }
+ *error = ERROR_OVERFLOW;
+ return 0;
}
+ }
}
+ }
- // Skip trailing spaces.
- while (isspace_ascii(*p)) {
- ++p;
- }
+ // Skip trailing spaces.
+ while (isspace_ascii(*p)) {
+ ++p;
+ }
- // Did we use up all the characters?
- if (*p) {
- *error = ERROR_INVALID_CHARS;
- return 0;
- }
+ // Did we use up all the characters?
+ if (*p) {
+ *error = ERROR_INVALID_CHARS;
+ return 0;
+ }
- *error = 0;
- return number;
+ *error = 0;
+ return number;
}
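The overflow guard above can be hard to follow at a glance: before each multiply-by-ten step it compares against int_max / 10 and int_max % 10 so the accumulator never actually overflows. A compact standalone sketch of the same pre-check for the non-negative case, with no thousands separator (parse_digits is an illustrative name):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

// Accumulate decimal digits into an int64_t without ever overflowing.
static int64_t parse_digits(const char *p, int *error) {
  const int64_t pre_max = INT64_MAX / 10;
  const int dig_pre_max = INT64_MAX % 10;
  int64_t number = 0;
  for (; *p >= '0' && *p <= '9'; ++p) {
    const int d = *p - '0';
    if (number < pre_max || (number == pre_max && d <= dig_pre_max)) {
      number = number * 10 + d;
    } else {
      *error = ERANGE; // one more digit would overflow
      return 0;
    }
  }
  *error = 0;
  return number;
}

int main(void) {
  int err;
  printf("%lld\n", (long long)parse_digits("9223372036854775807", &err));
  parse_digits("9223372036854775808", &err); // INT64_MAX + 1
  printf("overflow detected: %d\n", err == ERANGE);
  return 0;
}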
uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
uint64_t uint_max, int *error, char tsep) {
- const char *p = p_item;
- uint64_t pre_max = uint_max / 10;
- int dig_pre_max = uint_max % 10;
- uint64_t number = 0;
- int d;
-
- // Skip leading spaces.
- while (isspace_ascii(*p)) {
- ++p;
- }
-
- // Handle sign.
- if (*p == '-') {
- state->seen_sint = 1;
- *error = 0;
+ const char *p = p_item;
+ // Skip leading spaces.
+ while (isspace_ascii(*p)) {
+ ++p;
+ }
+
+ // Handle sign.
+ if (*p == '-') {
+ state->seen_sint = 1;
+ *error = 0;
+ return 0;
+ } else if (*p == '+') {
+ p++;
+ }
+
+ // Check that there is a first digit.
+ if (!isdigit_ascii(*p)) {
+ // Error...
+ *error = ERROR_NO_DIGITS;
+ return 0;
+ }
+
+ // If number is less than pre_max, at least one more digit
+ // can be processed without overflowing.
+ //
+ // Process the digits.
+ uint64_t number = 0;
+ const uint64_t pre_max = uint_max / 10;
+ const uint64_t dig_pre_max = uint_max % 10;
+ char d = *p;
+ if (tsep != '\0') {
+ while (1) {
+ if (d == tsep) {
+ d = *++p;
+ continue;
+ } else if (!isdigit_ascii(d)) {
+ break;
+ }
+ if ((number < pre_max) ||
+ ((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) {
+ number = number * 10 + (d - '0');
+ d = *++p;
+
+ } else {
+ *error = ERROR_OVERFLOW;
return 0;
- } else if (*p == '+') {
- p++;
+ }
}
+ } else {
+ while (isdigit_ascii(d)) {
+ if ((number < pre_max) ||
+ ((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) {
+ number = number * 10 + (d - '0');
+ d = *++p;
- // Check that there is a first digit.
- if (!isdigit_ascii(*p)) {
- // Error...
- *error = ERROR_NO_DIGITS;
+ } else {
+ *error = ERROR_OVERFLOW;
return 0;
+ }
}
+ }
- // If number is less than pre_max, at least one more digit
- // can be processed without overflowing.
- //
- // Process the digits.
- d = *p;
- if (tsep != '\0') {
- while (1) {
- if (d == tsep) {
- d = *++p;
- continue;
- } else if (!isdigit_ascii(d)) {
- break;
- }
- if ((number < pre_max) ||
- ((number == pre_max) && (d - '0' <= dig_pre_max))) {
- number = number * 10 + (d - '0');
- d = *++p;
-
- } else {
- *error = ERROR_OVERFLOW;
- return 0;
- }
- }
- } else {
- while (isdigit_ascii(d)) {
- if ((number < pre_max) ||
- ((number == pre_max) && (d - '0' <= dig_pre_max))) {
- number = number * 10 + (d - '0');
- d = *++p;
-
- } else {
- *error = ERROR_OVERFLOW;
- return 0;
- }
- }
- }
-
- // Skip trailing spaces.
- while (isspace_ascii(*p)) {
- ++p;
- }
+ // Skip trailing spaces.
+ while (isspace_ascii(*p)) {
+ ++p;
+ }
- // Did we use up all the characters?
- if (*p) {
- *error = ERROR_INVALID_CHARS;
- return 0;
- }
+ // Did we use up all the characters?
+ if (*p) {
+ *error = ERROR_INVALID_CHARS;
+ return 0;
+ }
- if (number > (uint64_t)int_max) {
- state->seen_uint = 1;
- }
+ if (number > (uint64_t)int_max) {
+ state->seen_uint = 1;
+ }
- *error = 0;
- return number;
+ *error = 0;
+ return number;
}
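For the uint_state bookkeeping above, a minimal sketch of the fallback rule it encodes: a column may be read as uint64 only when a value above INT64_MAX was seen and no negative values or nulls were. The column_state and uint64_conflicts names are illustrative stand-ins for uint_state and uint64_conflict.

#include <stdbool.h>
#include <stdio.h>

// Mirrors uint_state: what kinds of values did the column contain?
typedef struct {
  bool seen_sint; // a negative value appeared
  bool seen_uint; // a value above INT64_MAX appeared
  bool seen_null; // a missing value appeared
} column_state;

// uint64 is only unambiguous when large unsigned values never coexist
// with negatives or nulls.
static bool uint64_conflicts(const column_state *s) {
  return s->seen_uint && (s->seen_sint || s->seen_null);
}

int main(void) {
  const column_state s = {.seen_uint = true, .seen_null = true};
  printf("conflict: %d\n", uint64_conflicts(&s)); // 1
  return 0;
}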
diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c
index 7e5cb53cf8f62..06e3251db8315 100644
--- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c
+++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c
@@ -14,19 +14,59 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt
*/
+// Licence at LICENSES/NUMPY_LICENSE
+
#define NO_IMPORT
#ifndef NPY_NO_DEPRECATED_API
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
-#endif // NPY_NO_DEPRECATED_API
+#endif // NPY_NO_DEPRECATED_API
#include <Python.h>
-#include
-#include
-#include
#include "pandas/vendored/numpy/datetime/np_datetime.h"
-
+#include
+#include
+
+#if defined(_WIN32)
+#ifndef ENABLE_INTSAFE_SIGNED_FUNCTIONS
+#define ENABLE_INTSAFE_SIGNED_FUNCTIONS
+#endif
+#include <intsafe.h>
+#define checked_int64_add(a, b, res) LongLongAdd(a, b, res)
+#define checked_int64_sub(a, b, res) LongLongSub(a, b, res)
+#define checked_int64_mul(a, b, res) LongLongMult(a, b, res)
+#else
+#if defined __has_builtin
+#if __has_builtin(__builtin_add_overflow)
+#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res)
+#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res)
+#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res)
+#else
+_Static_assert(0,
+ "Overflow checking not detected; please try a newer compiler");
+#endif
+// __has_builtin was added in gcc 10, but our musllinux_1_1 build environment
+// only has gcc-9.3, so fall back to __GNUC__ macro as long as we have that
+#elif __GNUC__ > 7
+#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res)
+#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res)
+#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res)
+#else
+_Static_assert(0, "__has_builtin not detected; please try a newer compiler");
+#endif
+#endif
+
+#define PD_CHECK_OVERFLOW(FUNC) \
+ do { \
+ if ((FUNC) != 0) { \
+ PyGILState_STATE gstate = PyGILState_Ensure(); \
+ PyErr_SetString(PyExc_OverflowError, \
+ "Overflow occurred in npy_datetimestruct_to_datetime"); \
+ PyGILState_Release(gstate); \
+ return -1; \
+ } \
+ } while (0)
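A small standalone demonstration of the checked-arithmetic builtins these macros wrap on non-Windows platforms (GCC/Clang __builtin_*_overflow, which return nonzero when the true result does not fit the destination type):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  int64_t out;
  // Returns 0 and stores the sum when it fits in int64_t ...
  printf("ok: %d, out=%lld\n", __builtin_add_overflow((int64_t)1, 2, &out),
         (long long)out);
  // ... and returns nonzero (leaving a wrapped value behind) on overflow.
  printf("overflowed: %d\n",
         __builtin_mul_overflow(INT64_MAX, (int64_t)2, &out));
  return 0;
}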
const int days_per_month_table[2][12] = {
{31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31},
@@ -36,8 +76,8 @@ const int days_per_month_table[2][12] = {
* Returns 1 if the given year is a leap year, 0 otherwise.
*/
int is_leapyear(npy_int64 year) {
- return (year & 0x3) == 0 && /* year % 4 == 0 */
- ((year % 100) != 0 || (year % 400) == 0);
+ return (year & 0x3) == 0 && /* year % 4 == 0 */
+ ((year % 100) != 0 || (year % 400) == 0);
}
/*
@@ -45,108 +85,108 @@ int is_leapyear(npy_int64 year) {
 * the current values are valid.
*/
void add_minutes_to_datetimestruct(npy_datetimestruct *dts, int minutes) {
- int isleap;
-
- /* MINUTES */
- dts->min += minutes;
- while (dts->min < 0) {
- dts->min += 60;
- dts->hour--;
- }
- while (dts->min >= 60) {
- dts->min -= 60;
- dts->hour++;
- }
-
- /* HOURS */
- while (dts->hour < 0) {
- dts->hour += 24;
- dts->day--;
- }
- while (dts->hour >= 24) {
- dts->hour -= 24;
- dts->day++;
- }
-
- /* DAYS */
- if (dts->day < 1) {
- dts->month--;
- if (dts->month < 1) {
- dts->year--;
- dts->month = 12;
- }
- isleap = is_leapyear(dts->year);
- dts->day += days_per_month_table[isleap][dts->month - 1];
- } else if (dts->day > 28) {
- isleap = is_leapyear(dts->year);
- if (dts->day > days_per_month_table[isleap][dts->month - 1]) {
- dts->day -= days_per_month_table[isleap][dts->month - 1];
- dts->month++;
- if (dts->month > 12) {
- dts->year++;
- dts->month = 1;
- }
- }
+ int isleap;
+
+ /* MINUTES */
+ dts->min += minutes;
+ while (dts->min < 0) {
+ dts->min += 60;
+ dts->hour--;
+ }
+ while (dts->min >= 60) {
+ dts->min -= 60;
+ dts->hour++;
+ }
+
+ /* HOURS */
+ while (dts->hour < 0) {
+ dts->hour += 24;
+ dts->day--;
+ }
+ while (dts->hour >= 24) {
+ dts->hour -= 24;
+ dts->day++;
+ }
+
+ /* DAYS */
+ if (dts->day < 1) {
+ dts->month--;
+ if (dts->month < 1) {
+ dts->year--;
+ dts->month = 12;
+ }
+ isleap = is_leapyear(dts->year);
+ dts->day += days_per_month_table[isleap][dts->month - 1];
+ } else if (dts->day > 28) {
+ isleap = is_leapyear(dts->year);
+ if (dts->day > days_per_month_table[isleap][dts->month - 1]) {
+ dts->day -= days_per_month_table[isleap][dts->month - 1];
+ dts->month++;
+ if (dts->month > 12) {
+ dts->year++;
+ dts->month = 1;
+ }
}
+ }
}
/*
* Calculates the days offset from the 1970 epoch.
*/
npy_int64 get_datetimestruct_days(const npy_datetimestruct *dts) {
- int i, month;
- npy_int64 year, days = 0;
- const int *month_lengths;
-
- year = dts->year - 1970;
- days = year * 365;
-
- /* Adjust for leap years */
- if (days >= 0) {
- /*
- * 1968 is the closest leap year before 1970.
- * Exclude the current year, so add 1.
- */
- year += 1;
- /* Add one day for each 4 years */
- days += year / 4;
- /* 1900 is the closest previous year divisible by 100 */
- year += 68;
- /* Subtract one day for each 100 years */
- days -= year / 100;
- /* 1600 is the closest previous year divisible by 400 */
- year += 300;
- /* Add one day for each 400 years */
- days += year / 400;
- } else {
- /*
- * 1972 is the closest later year after 1970.
- * Include the current year, so subtract 2.
- */
- year -= 2;
- /* Subtract one day for each 4 years */
- days += year / 4;
- /* 2000 is the closest later year divisible by 100 */
- year -= 28;
- /* Add one day for each 100 years */
- days -= year / 100;
- /* 2000 is also the closest later year divisible by 400 */
- /* Subtract one day for each 400 years */
- days += year / 400;
- }
-
- month_lengths = days_per_month_table[is_leapyear(dts->year)];
- month = dts->month - 1;
-
- /* Add the months */
- for (i = 0; i < month; ++i) {
- days += month_lengths[i];
- }
+ int i, month;
+ npy_int64 year, days = 0;
+ const int *month_lengths;
- /* Add the days */
- days += dts->day - 1;
+ year = dts->year - 1970;
+ days = year * 365;
- return days;
+ /* Adjust for leap years */
+ if (days >= 0) {
+ /*
+ * 1968 is the closest leap year before 1970.
+ * Exclude the current year, so add 1.
+ */
+ year += 1;
+ /* Add one day for each 4 years */
+ days += year / 4;
+ /* 1900 is the closest previous year divisible by 100 */
+ year += 68;
+ /* Subtract one day for each 100 years */
+ days -= year / 100;
+ /* 1600 is the closest previous year divisible by 400 */
+ year += 300;
+ /* Add one day for each 400 years */
+ days += year / 400;
+ } else {
+ /*
+ * 1972 is the closest later year after 1970.
+ * Include the current year, so subtract 2.
+ */
+ year -= 2;
+ /* Subtract one day for each 4 years */
+ days += year / 4;
+ /* 2000 is the closest later year divisible by 100 */
+ year -= 28;
+ /* Add one day for each 100 years */
+ days -= year / 100;
+ /* 2000 is also the closest later year divisible by 400 */
+ /* Subtract one day for each 400 years */
+ days += year / 400;
+ }
+
+ month_lengths = days_per_month_table[is_leapyear(dts->year)];
+ month = dts->month - 1;
+
+ /* Add the months */
+ for (i = 0; i < month; ++i) {
+ days += month_lengths[i];
+ }
+
+ /* Add the days */
+ days += dts->day - 1;
+
+ return days;
}
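To make the leap-year adjustments above concrete, here is a brute-force reference (a sketch, far slower than the closed form, and only valid for dates on or after the 1970 epoch) that counts days by iterating over years and months; both approaches should agree, e.g. 2000-03-01 is day 11017.

#include <stdio.h>

static const int month_len[2][12] = {
    {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31},
    {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}};

static int leap(long y) {
  return (y % 4 == 0) && (y % 100 != 0 || y % 400 == 0);
}

// Days from 1970-01-01 to year-month-day, counted one year/month at a time.
static long days_since_epoch(long year, int month, int day) {
  long days = 0;
  for (long y = 1970; y < year; ++y)
    days += leap(y) ? 366 : 365;
  for (int m = 1; m < month; ++m)
    days += month_len[leap(year)][m - 1];
  return days + day - 1;
}

int main(void) {
  printf("%ld\n", days_since_epoch(2000, 3, 1)); // 11017
  return 0;
}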
/*
@@ -154,62 +194,61 @@ npy_int64 get_datetimestruct_days(const npy_datetimestruct *dts) {
* and returns the year.
*/
static npy_int64 days_to_yearsdays(npy_int64 *days_) {
- const npy_int64 days_per_400years = (400 * 365 + 100 - 4 + 1);
- /* Adjust so it's relative to the year 2000 (divisible by 400) */
- npy_int64 days = (*days_) - (365 * 30 + 7);
- npy_int64 year;
-
- /* Break down the 400 year cycle to get the year and day within the year */
- if (days >= 0) {
- year = 400 * (days / days_per_400years);
- days = days % days_per_400years;
- } else {
- year = 400 * ((days - (days_per_400years - 1)) / days_per_400years);
- days = days % days_per_400years;
- if (days < 0) {
- days += days_per_400years;
- }
- }
-
- /* Work out the year/day within the 400 year cycle */
- if (days >= 366) {
- year += 100 * ((days - 1) / (100 * 365 + 25 - 1));
- days = (days - 1) % (100 * 365 + 25 - 1);
- if (days >= 365) {
- year += 4 * ((days + 1) / (4 * 365 + 1));
- days = (days + 1) % (4 * 365 + 1);
- if (days >= 366) {
- year += (days - 1) / 365;
- days = (days - 1) % 365;
- }
- }
- }
-
- *days_ = days;
- return year + 2000;
-}
+ const npy_int64 days_per_400years = (400 * 365 + 100 - 4 + 1);
+ /* Adjust so it's relative to the year 2000 (divisible by 400) */
+ npy_int64 days = (*days_) - (365 * 30 + 7);
+ npy_int64 year;
+
+ /* Break down the 400 year cycle to get the year and day within the year */
+ if (days >= 0) {
+ year = 400 * (days / days_per_400years);
+ days = days % days_per_400years;
+ } else {
+ year = 400 * ((days - (days_per_400years - 1)) / days_per_400years);
+ days = days % days_per_400years;
+ if (days < 0) {
+ days += days_per_400years;
+ }
+ }
+
+ /* Work out the year/day within the 400 year cycle */
+ if (days >= 366) {
+ year += 100 * ((days - 1) / (100 * 365 + 25 - 1));
+ days = (days - 1) % (100 * 365 + 25 - 1);
+ if (days >= 365) {
+ year += 4 * ((days + 1) / (4 * 365 + 1));
+ days = (days + 1) % (4 * 365 + 1);
+ if (days >= 366) {
+ year += (days - 1) / 365;
+ days = (days - 1) % 365;
+ }
+ }
+ }
+ *days_ = days;
+ return year + 2000;
+}
/*
* Fills in the year, month, day in 'dts' based on the days
* offset from 1970.
*/
static void set_datetimestruct_days(npy_int64 days, npy_datetimestruct *dts) {
- const int *month_lengths;
- int i;
+ const int *month_lengths;
+ int i;
- dts->year = days_to_yearsdays(&days);
- month_lengths = days_per_month_table[is_leapyear(dts->year)];
+ dts->year = days_to_yearsdays(&days);
+ month_lengths = days_per_month_table[is_leapyear(dts->year)];
- for (i = 0; i < 12; ++i) {
- if (days < month_lengths[i]) {
- dts->month = i + 1;
- dts->day = days + 1;
- return;
- } else {
- days -= month_lengths[i];
- }
+ for (i = 0; i < 12; ++i) {
+ if (days < month_lengths[i]) {
+ dts->month = i + 1;
+ dts->day = (npy_int32)days + 1;
+ return;
+ } else {
+ days -= month_lengths[i];
}
+ }
}
/*
@@ -217,186 +256,271 @@ static void set_datetimestruct_days(npy_int64 days, npy_datetimestruct *dts) {
*/
int cmp_npy_datetimestruct(const npy_datetimestruct *a,
const npy_datetimestruct *b) {
- if (a->year > b->year) {
- return 1;
- } else if (a->year < b->year) {
- return -1;
+ if (a->year > b->year) {
+ return 1;
+ } else if (a->year < b->year) {
+ return -1;
+ }
+
+ if (a->month > b->month) {
+ return 1;
+ } else if (a->month < b->month) {
+ return -1;
+ }
+
+ if (a->day > b->day) {
+ return 1;
+ } else if (a->day < b->day) {
+ return -1;
+ }
+
+ if (a->hour > b->hour) {
+ return 1;
+ } else if (a->hour < b->hour) {
+ return -1;
+ }
+
+ if (a->min > b->min) {
+ return 1;
+ } else if (a->min < b->min) {
+ return -1;
+ }
+
+ if (a->sec > b->sec) {
+ return 1;
+ } else if (a->sec < b->sec) {
+ return -1;
+ }
+
+ if (a->us > b->us) {
+ return 1;
+ } else if (a->us < b->us) {
+ return -1;
+ }
+
+ if (a->ps > b->ps) {
+ return 1;
+ } else if (a->ps < b->ps) {
+ return -1;
+ }
+
+ if (a->as > b->as) {
+ return 1;
+ } else if (a->as < b->as) {
+ return -1;
+ }
+
+ return 0;
+}
+/*
+ * Returns the offset from utc of the timezone as a timedelta.
+ * The caller is responsible for ensuring that the tzinfo
+ * attribute exists on the datetime object.
+ *
+ * If the passed object is timezone naive, Py_None is returned.
+ * If extraction of the offset fails, NULL is returned.
+ *
+ * NOTE: This function is not vendored from numpy.
+ */
+PyObject *extract_utc_offset(PyObject *obj) {
+ PyObject *tmp = PyObject_GetAttrString(obj, "tzinfo");
+ if (tmp == NULL) {
+ return NULL;
+ }
+ if (tmp != Py_None) {
+ PyObject *offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj);
+ if (offset == NULL) {
+ Py_DECREF(tmp);
+ return NULL;
}
+ return offset;
+ }
+ return tmp;
+}
- if (a->month > b->month) {
- return 1;
- } else if (a->month < b->month) {
- return -1;
- }
+static inline int scaleYearToEpoch(int64_t year, int64_t *result) {
+ return checked_int64_sub(year, 1970, result);
+}
- if (a->day > b->day) {
- return 1;
- } else if (a->day < b->day) {
- return -1;
- }
+static inline int scaleYearsToMonths(int64_t years, int64_t *result) {
+ return checked_int64_mul(years, 12, result);
+}
- if (a->hour > b->hour) {
- return 1;
- } else if (a->hour < b->hour) {
- return -1;
+static inline int scaleDaysToWeeks(int64_t days, int64_t *result) {
+ if (days >= 0) {
+ *result = days / 7;
+ return 0;
+ } else {
+ int res;
+ int64_t checked_days;
+ if ((res = checked_int64_sub(days, 6, &checked_days))) {
+ return res;
}
- if (a->min > b->min) {
- return 1;
- } else if (a->min < b->min) {
- return -1;
- }
+ *result = checked_days / 7;
+ return 0;
+ }
+}
- if (a->sec > b->sec) {
- return 1;
- } else if (a->sec < b->sec) {
- return -1;
- }
+static inline int scaleDaysToHours(int64_t days, int64_t *result) {
+ return checked_int64_mul(days, 24, result);
+}
- if (a->us > b->us) {
- return 1;
- } else if (a->us < b->us) {
- return -1;
- }
+static inline int scaleHoursToMinutes(int64_t hours, int64_t *result) {
+ return checked_int64_mul(hours, 60, result);
+}
- if (a->ps > b->ps) {
- return 1;
- } else if (a->ps < b->ps) {
- return -1;
- }
+static inline int scaleMinutesToSeconds(int64_t minutes, int64_t *result) {
+ return checked_int64_mul(minutes, 60, result);
+}
- if (a->as > b->as) {
- return 1;
- } else if (a->as < b->as) {
- return -1;
- }
+static inline int scaleSecondsToMilliseconds(int64_t seconds, int64_t *result) {
+ return checked_int64_mul(seconds, 1000, result);
+}
- return 0;
+static inline int scaleSecondsToMicroseconds(int64_t seconds, int64_t *result) {
+ return checked_int64_mul(seconds, 1000000, result);
}
-/*
-* Returns the offset from utc of the timezone as a timedelta.
-* The caller is responsible for ensuring that the tzinfo
-* attribute exists on the datetime object.
-*
-* If the passed object is timezone naive, Py_None is returned.
-* If extraction of the offset fails, NULL is returned.
-*
-* NOTE: This function is not vendored from numpy.
-*/
-PyObject *extract_utc_offset(PyObject *obj) {
- PyObject *tmp = PyObject_GetAttrString(obj, "tzinfo");
- if (tmp == NULL) {
- return NULL;
- }
- if (tmp != Py_None) {
- PyObject *offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj);
- if (offset == NULL) {
- Py_DECREF(tmp);
- return NULL;
- }
- return offset;
- }
- return tmp;
+
+static inline int scaleMicrosecondsToNanoseconds(int64_t microseconds,
+ int64_t *result) {
+ return checked_int64_mul(microseconds, 1000, result);
+}
+
+static inline int scaleMicrosecondsToPicoseconds(int64_t microseconds,
+ int64_t *result) {
+ return checked_int64_mul(microseconds, 1000000, result);
+}
+
+static inline int64_t scalePicosecondsToFemtoseconds(int64_t picoseconds,
+ int64_t *result) {
+ return checked_int64_mul(picoseconds, 1000, result);
+}
+
+static inline int64_t scalePicosecondsToAttoseconds(int64_t picoseconds,
+ int64_t *result) {
+ return checked_int64_mul(picoseconds, 1000000, result);
}
/*
* Converts a datetime from a datetimestruct to a datetime based
- * on a metadata unit. The date is assumed to be valid.
+ * on a metadata unit. Returns -1 and sets PyErr on error.
*/
npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base,
const npy_datetimestruct *dts) {
- npy_datetime ret;
+ if ((base == NPY_FR_Y) || (base == NPY_FR_M)) {
+ int64_t years;
+ PD_CHECK_OVERFLOW(scaleYearToEpoch(dts->year, &years));
if (base == NPY_FR_Y) {
- /* Truncate to the year */
- ret = dts->year - 1970;
- } else if (base == NPY_FR_M) {
- /* Truncate to the month */
- ret = 12 * (dts->year - 1970) + (dts->month - 1);
- } else {
- /* Otherwise calculate the number of days to start */
- npy_int64 days = get_datetimestruct_days(dts);
-
- switch (base) {
- case NPY_FR_W:
- /* Truncate to weeks */
- if (days >= 0) {
- ret = days / 7;
- } else {
- ret = (days - 6) / 7;
- }
- break;
- case NPY_FR_D:
- ret = days;
- break;
- case NPY_FR_h:
- ret = days * 24 + dts->hour;
- break;
- case NPY_FR_m:
- ret = (days * 24 + dts->hour) * 60 + dts->min;
- break;
- case NPY_FR_s:
- ret = ((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec;
- break;
- case NPY_FR_ms:
- ret = (((days * 24 + dts->hour) * 60 + dts->min) * 60 +
- dts->sec) *
- 1000 +
- dts->us / 1000;
- break;
- case NPY_FR_us:
- ret = (((days * 24 + dts->hour) * 60 + dts->min) * 60 +
- dts->sec) *
- 1000000 +
- dts->us;
- break;
- case NPY_FR_ns:
- ret = ((((days * 24 + dts->hour) * 60 + dts->min) * 60 +
- dts->sec) *
- 1000000 +
- dts->us) *
- 1000 +
- dts->ps / 1000;
- break;
- case NPY_FR_ps:
- ret = ((((days * 24 + dts->hour) * 60 + dts->min) * 60 +
- dts->sec) *
- 1000000 +
- dts->us) *
- 1000000 +
- dts->ps;
- break;
- case NPY_FR_fs:
- /* only 2.6 hours */
- ret = (((((days * 24 + dts->hour) * 60 + dts->min) * 60 +
- dts->sec) *
- 1000000 +
- dts->us) *
- 1000000 +
- dts->ps) *
- 1000 +
- dts->as / 1000;
- break;
- case NPY_FR_as:
- /* only 9.2 secs */
- ret = (((((days * 24 + dts->hour) * 60 + dts->min) * 60 +
- dts->sec) *
- 1000000 +
- dts->us) *
- 1000000 +
- dts->ps) *
- 1000000 +
- dts->as;
- break;
- default:
- /* Something got corrupted */
- PyErr_SetString(
- PyExc_ValueError,
- "NumPy datetime metadata with corrupt unit value");
- return -1;
- }
- }
- return ret;
+ return years;
+ }
+
+ int64_t months;
+ PD_CHECK_OVERFLOW(scaleYearsToMonths(years, &months));
+
+ int64_t months_adder;
+ PD_CHECK_OVERFLOW(checked_int64_sub(dts->month, 1, &months_adder));
+ PD_CHECK_OVERFLOW(checked_int64_add(months, months_adder, &months));
+
+ if (base == NPY_FR_M) {
+ return months;
+ }
+ }
+
+ const int64_t days = get_datetimestruct_days(dts);
+ if (base == NPY_FR_D) {
+ return days;
+ }
+
+ if (base == NPY_FR_W) {
+ int64_t weeks;
+ PD_CHECK_OVERFLOW(scaleDaysToWeeks(days, &weeks));
+ return weeks;
+ }
+
+ int64_t hours;
+ PD_CHECK_OVERFLOW(scaleDaysToHours(days, &hours));
+ PD_CHECK_OVERFLOW(checked_int64_add(hours, dts->hour, &hours));
+
+ if (base == NPY_FR_h) {
+ return hours;
+ }
+
+ int64_t minutes;
+ PD_CHECK_OVERFLOW(scaleHoursToMinutes(hours, &minutes));
+ PD_CHECK_OVERFLOW(checked_int64_add(minutes, dts->min, &minutes));
+
+ if (base == NPY_FR_m) {
+ return minutes;
+ }
+
+ int64_t seconds;
+ PD_CHECK_OVERFLOW(scaleMinutesToSeconds(minutes, &seconds));
+ PD_CHECK_OVERFLOW(checked_int64_add(seconds, dts->sec, &seconds));
+
+ if (base == NPY_FR_s) {
+ return seconds;
+ }
+
+ if (base == NPY_FR_ms) {
+ int64_t milliseconds;
+ PD_CHECK_OVERFLOW(scaleSecondsToMilliseconds(seconds, &milliseconds));
+ PD_CHECK_OVERFLOW(
+ checked_int64_add(milliseconds, dts->us / 1000, &milliseconds));
+
+ return milliseconds;
+ }
+
+ int64_t microseconds;
+  PD_CHECK_OVERFLOW(scaleSecondsToMicroseconds(seconds, &microseconds));
+  PD_CHECK_OVERFLOW(checked_int64_add(microseconds, dts->us, &microseconds));
+
+ if (base == NPY_FR_us) {
+ return microseconds;
+ }
+
+ if (base == NPY_FR_ns) {
+ int64_t nanoseconds;
+ PD_CHECK_OVERFLOW(
+ scaleMicrosecondsToNanoseconds(microseconds, &nanoseconds));
+ PD_CHECK_OVERFLOW(
+ checked_int64_add(nanoseconds, dts->ps / 1000, &nanoseconds));
+
+ return nanoseconds;
+ }
+
+ int64_t picoseconds;
+ PD_CHECK_OVERFLOW(scaleMicrosecondsToPicoseconds(microseconds, &picoseconds));
+ PD_CHECK_OVERFLOW(checked_int64_add(picoseconds, dts->ps, &picoseconds));
+
+ if (base == NPY_FR_ps) {
+ return picoseconds;
+ }
+
+ if (base == NPY_FR_fs) {
+ int64_t femtoseconds;
+ PD_CHECK_OVERFLOW(
+ scalePicosecondsToFemtoseconds(picoseconds, &femtoseconds));
+ PD_CHECK_OVERFLOW(
+ checked_int64_add(femtoseconds, dts->as / 1000, &femtoseconds));
+ return femtoseconds;
+ }
+
+ if (base == NPY_FR_as) {
+ int64_t attoseconds;
+ PD_CHECK_OVERFLOW(scalePicosecondsToAttoseconds(picoseconds, &attoseconds));
+ PD_CHECK_OVERFLOW(checked_int64_add(attoseconds, dts->as, &attoseconds));
+ return attoseconds;
+ }
+
+ /* Something got corrupted */
+ PyGILState_STATE gstate = PyGILState_Ensure();
+ PyErr_SetString(PyExc_ValueError,
+ "NumPy datetime metadata with corrupt unit value");
+ PyGILState_Release(gstate);
+
+ return -1;
}
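The restructured function above threads every scale-and-add step through the overflow macros instead of one big multiplication chain. A standalone sketch of the tail of that chain (seconds to nanoseconds), using the GCC/Clang builtins directly; seconds_to_ns is an illustrative name, not pandas API:

#include <stdint.h>
#include <stdio.h>

// Given an epoch offset in seconds plus sub-second fields, produce
// nanoseconds, failing on overflow the same way PD_CHECK_OVERFLOW does.
static int seconds_to_ns(int64_t seconds, int32_t us, int32_t ps, int64_t *out) {
  int64_t micro, nano;
  if (__builtin_mul_overflow(seconds, (int64_t)1000000, &micro) ||
      __builtin_add_overflow(micro, (int64_t)us, &micro) ||
      __builtin_mul_overflow(micro, (int64_t)1000, &nano) ||
      __builtin_add_overflow(nano, (int64_t)(ps / 1000), &nano))
    return -1;
  *out = nano;
  return 0;
}

int main(void) {
  int64_t ns;
  if (seconds_to_ns(1, 500000, 0, &ns) == 0)
    printf("%lld\n", (long long)ns); // 1500000000
  if (seconds_to_ns(INT64_MAX / 1000, 0, 0, &ns) != 0)
    printf("overflow caught\n");
  return 0;
}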
/*
@@ -408,164 +532,162 @@ npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base,
* for subsequent calls to this command - it is able to deduce that `*d >= 0`.
*/
npy_int64 extract_unit(npy_datetime *d, npy_datetime unit) {
- assert(unit > 0);
- npy_int64 div = *d / unit;
- npy_int64 mod = *d % unit;
- if (mod < 0) {
- mod += unit;
- div -= 1;
- }
- assert(mod >= 0);
- *d = mod;
- return div;
+ assert(unit > 0);
+ npy_int64 div = *d / unit;
+ npy_int64 mod = *d % unit;
+ if (mod < 0) {
+ mod += unit;
+ div -= 1;
+ }
+ assert(mod >= 0);
+ *d = mod;
+ return div;
}
/*
* Converts a datetime based on the given metadata into a datetimestruct
*/
-void pandas_datetime_to_datetimestruct(npy_datetime dt,
- NPY_DATETIMEUNIT base,
+void pandas_datetime_to_datetimestruct(npy_datetime dt, NPY_DATETIMEUNIT base,
npy_datetimestruct *out) {
- npy_int64 perday;
-
- /* Initialize the output to all zeros */
- memset(out, 0, sizeof(npy_datetimestruct));
- out->year = 1970;
- out->month = 1;
- out->day = 1;
-
- /*
- * Note that care must be taken with the / and % operators
- * for negative values.
- */
- switch (base) {
- case NPY_FR_Y:
- out->year = 1970 + dt;
- break;
-
- case NPY_FR_M:
- out->year = 1970 + extract_unit(&dt, 12);
- out->month = dt + 1;
- break;
-
- case NPY_FR_W:
- /* A week is 7 days */
- set_datetimestruct_days(dt * 7, out);
- break;
-
- case NPY_FR_D:
- set_datetimestruct_days(dt, out);
- break;
-
- case NPY_FR_h:
- perday = 24LL;
-
- set_datetimestruct_days(extract_unit(&dt, perday), out);
- out->hour = dt;
- break;
-
- case NPY_FR_m:
- perday = 24LL * 60;
-
- set_datetimestruct_days(extract_unit(&dt, perday), out);
- out->hour = (int)extract_unit(&dt, 60);
- out->min = (int)dt;
- break;
-
- case NPY_FR_s:
- perday = 24LL * 60 * 60;
-
- set_datetimestruct_days(extract_unit(&dt, perday), out);
- out->hour = (int)extract_unit(&dt, 60 * 60);
- out->min = (int)extract_unit(&dt, 60);
- out->sec = (int)dt;
- break;
-
- case NPY_FR_ms:
- perday = 24LL * 60 * 60 * 1000;
-
- set_datetimestruct_days(extract_unit(&dt, perday), out);
- out->hour = (int)extract_unit(&dt, 1000LL * 60 * 60);
- out->min = (int)extract_unit(&dt, 1000LL * 60);
- out->sec = (int)extract_unit(&dt, 1000LL);
- out->us = (int)(dt * 1000);
- break;
-
- case NPY_FR_us:
- perday = 24LL * 60LL * 60LL * 1000LL * 1000LL;
-
- set_datetimestruct_days(extract_unit(&dt, perday), out);
- out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 60 * 60);
- out->min = (int)extract_unit(&dt, 1000LL * 1000 * 60);
- out->sec = (int)extract_unit(&dt, 1000LL * 1000);
- out->us = (int)dt;
- break;
-
- case NPY_FR_ns:
- perday = 24LL * 60LL * 60LL * 1000LL * 1000LL * 1000LL;
-
- set_datetimestruct_days(extract_unit(&dt, perday), out);
- out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60);
- out->min = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60);
- out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000);
- out->us = (int)extract_unit(&dt, 1000LL);
- out->ps = (int)(dt * 1000);
- break;
-
- case NPY_FR_ps:
- perday = 24LL * 60 * 60 * 1000 * 1000 * 1000 * 1000;
-
- set_datetimestruct_days(extract_unit(&dt, perday), out);
- out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60);
- out->min = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60);
- out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000);
- out->us = (int)extract_unit(&dt, 1000LL);
- out->ps = (int)(dt * 1000);
- break;
-
- case NPY_FR_fs:
- /* entire range is only +- 2.6 hours */
- out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 *
- 1000 * 60 * 60);
- if (out->hour < 0) {
- out->year = 1969;
- out->month = 12;
- out->day = 31;
- out->hour += 24;
- assert(out->hour >= 0);
- }
- out->min = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 *
- 1000 * 60);
- out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 *
- 1000);
- out->us = (int)extract_unit(&dt, 1000LL * 1000 * 1000);
- out->ps = (int)extract_unit(&dt, 1000LL);
- out->as = (int)(dt * 1000);
- break;
-
- case NPY_FR_as:
- /* entire range is only +- 9.2 seconds */
- out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 *
- 1000 * 1000);
- if (out->sec < 0) {
- out->year = 1969;
- out->month = 12;
- out->day = 31;
- out->hour = 23;
- out->min = 59;
- out->sec += 60;
- assert(out->sec >= 0);
- }
- out->us = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000);
- out->ps = (int)extract_unit(&dt, 1000LL * 1000);
- out->as = (int)dt;
- break;
-
- default:
- PyErr_SetString(PyExc_RuntimeError,
- "NumPy datetime metadata is corrupted with invalid "
- "base unit");
+ npy_int64 perday;
+
+ /* Initialize the output to all zeros */
+ memset(out, 0, sizeof(npy_datetimestruct));
+ out->year = 1970;
+ out->month = 1;
+ out->day = 1;
+
+ /*
+ * Note that care must be taken with the / and % operators
+ * for negative values.
+ */
+ switch (base) {
+ case NPY_FR_Y:
+ out->year = 1970 + dt;
+ break;
+
+ case NPY_FR_M:
+ out->year = 1970 + extract_unit(&dt, 12);
+ out->month = (npy_int32)dt + 1;
+ break;
+
+ case NPY_FR_W:
+ /* A week is 7 days */
+ set_datetimestruct_days(dt * 7, out);
+ break;
+
+ case NPY_FR_D:
+ set_datetimestruct_days(dt, out);
+ break;
+
+ case NPY_FR_h:
+ perday = 24LL;
+
+ set_datetimestruct_days(extract_unit(&dt, perday), out);
+ out->hour = (npy_int32)dt;
+ break;
+
+ case NPY_FR_m:
+ perday = 24LL * 60;
+
+ set_datetimestruct_days(extract_unit(&dt, perday), out);
+ out->hour = (npy_int32)extract_unit(&dt, 60);
+ out->min = (npy_int32)dt;
+ break;
+
+ case NPY_FR_s:
+ perday = 24LL * 60 * 60;
+
+ set_datetimestruct_days(extract_unit(&dt, perday), out);
+ out->hour = (npy_int32)extract_unit(&dt, 60 * 60);
+ out->min = (npy_int32)extract_unit(&dt, 60);
+ out->sec = (npy_int32)dt;
+ break;
+
+ case NPY_FR_ms:
+ perday = 24LL * 60 * 60 * 1000;
+
+ set_datetimestruct_days(extract_unit(&dt, perday), out);
+ out->hour = (npy_int32)extract_unit(&dt, 1000LL * 60 * 60);
+ out->min = (npy_int32)extract_unit(&dt, 1000LL * 60);
+ out->sec = (npy_int32)extract_unit(&dt, 1000LL);
+ out->us = (npy_int32)(dt * 1000);
+ break;
+
+ case NPY_FR_us:
+ perday = 24LL * 60LL * 60LL * 1000LL * 1000LL;
+
+ set_datetimestruct_days(extract_unit(&dt, perday), out);
+ out->hour = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 60 * 60);
+ out->min = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 60);
+ out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000);
+ out->us = (npy_int32)dt;
+ break;
+
+ case NPY_FR_ns:
+ perday = 24LL * 60LL * 60LL * 1000LL * 1000LL * 1000LL;
+
+ set_datetimestruct_days(extract_unit(&dt, perday), out);
+ out->hour = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60);
+ out->min = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 60);
+ out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000);
+ out->us = (npy_int32)extract_unit(&dt, 1000LL);
+ out->ps = (npy_int32)(dt * 1000);
+ break;
+
+ case NPY_FR_ps:
+ perday = 24LL * 60 * 60 * 1000 * 1000 * 1000 * 1000;
+
+ set_datetimestruct_days(extract_unit(&dt, perday), out);
+ out->hour = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60);
+ out->min = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 60);
+ out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000);
+ out->us = (npy_int32)extract_unit(&dt, 1000LL);
+ out->ps = (npy_int32)(dt * 1000);
+ break;
+
+ case NPY_FR_fs:
+ /* entire range is only +- 2.6 hours */
+ out->hour = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 *
+ 1000 * 60 * 60);
+ if (out->hour < 0) {
+ out->year = 1969;
+ out->month = 12;
+ out->day = 31;
+ out->hour += 24;
+ assert(out->hour >= 0);
}
+ out->min =
+ (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 1000 * 60);
+ out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 1000);
+ out->us = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000);
+ out->ps = (npy_int32)extract_unit(&dt, 1000LL);
+ out->as = (npy_int32)(dt * 1000);
+ break;
+
+ case NPY_FR_as:
+ /* entire range is only +- 9.2 seconds */
+ out->sec =
+ (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 1000 * 1000);
+ if (out->sec < 0) {
+ out->year = 1969;
+ out->month = 12;
+ out->day = 31;
+ out->hour = 23;
+ out->min = 59;
+ out->sec += 60;
+ assert(out->sec >= 0);
+ }
+ out->us = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000);
+ out->ps = (npy_int32)extract_unit(&dt, 1000LL * 1000);
+ out->as = (npy_int32)dt;
+ break;
+
+ default:
+ PyErr_SetString(PyExc_RuntimeError,
+ "NumPy datetime metadata is corrupted with invalid "
+ "base unit");
+ }
}
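Both extract_unit and the decomposition above rely on floored division so that negative epoch values land in the preceding day/hour/minute rather than truncating toward zero. A standalone sketch of that pattern, decomposing 1 ns before the epoch:

#include <stdio.h>

// Floored divmod: splits *d into a quotient and a remainder in [0, unit).
static long long extract(long long *d, long long unit) {
  long long div = *d / unit;
  long long mod = *d % unit;
  if (mod < 0) { // C truncates toward zero, so fix up negative remainders
    mod += unit;
    div -= 1;
  }
  *d = mod;
  return div;
}

int main(void) {
  long long dt = -1; // 1 ns before 1970-01-01, ns resolution
  const long long days = extract(&dt, 86400LL * 1000000000);
  const long long hour = extract(&dt, 1000000000LL * 60 * 60);
  const long long min = extract(&dt, 1000000000LL * 60);
  const long long sec = extract(&dt, 1000000000LL);
  // Prints: day -1, 23:59:59 + 999999999 ns (i.e. 1969-12-31 23:59:59.999999999)
  printf("day %lld, %02lld:%02lld:%02lld + %lld ns\n", days, hour, min, sec, dt);
  return 0;
}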
/*
@@ -577,363 +699,358 @@ void pandas_datetime_to_datetimestruct(npy_datetime dt,
void pandas_timedelta_to_timedeltastruct(npy_timedelta td,
NPY_DATETIMEUNIT base,
pandas_timedeltastruct *out) {
- npy_int64 frac;
- npy_int64 sfrac;
- npy_int64 ifrac;
- int sign;
- npy_int64 per_day;
- npy_int64 per_sec;
-
- /* Initialize the output to all zeros */
- memset(out, 0, sizeof(pandas_timedeltastruct));
-
- switch (base) {
- case NPY_FR_ns:
-
- per_day = 86400000000000LL;
- per_sec = 1000LL * 1000LL * 1000LL;
-
- // put frac in seconds
- if (td < 0 && td % per_sec != 0)
- frac = td / per_sec - 1;
- else
- frac = td / per_sec;
-
- if (frac < 0) {
- sign = -1;
-
- // even fraction
- if ((-frac % 86400LL) != 0) {
- out->days = -frac / 86400LL + 1;
- frac += 86400LL * out->days;
- } else {
- frac = -frac;
- }
- } else {
- sign = 1;
- out->days = 0;
- }
-
- if (frac >= 86400) {
- out->days += frac / 86400LL;
- frac -= out->days * 86400LL;
- }
-
- if (frac >= 3600) {
- out->hrs = frac / 3600LL;
- frac -= out->hrs * 3600LL;
- } else {
- out->hrs = 0;
- }
-
- if (frac >= 60) {
- out->min = frac / 60LL;
- frac -= out->min * 60LL;
- } else {
- out->min = 0;
- }
-
- if (frac >= 0) {
- out->sec = frac;
- frac -= out->sec;
- } else {
- out->sec = 0;
- }
-
- sfrac = (out->hrs * 3600LL + out->min * 60LL
- + out->sec) * per_sec;
-
- if (sign < 0)
- out->days = -out->days;
-
- ifrac = td - (out->days * per_day + sfrac);
-
- if (ifrac != 0) {
- out->ms = ifrac / (1000LL * 1000LL);
- ifrac -= out->ms * 1000LL * 1000LL;
- out->us = ifrac / 1000LL;
- ifrac -= out->us * 1000LL;
- out->ns = ifrac;
- } else {
- out->ms = 0;
- out->us = 0;
- out->ns = 0;
- }
- break;
-
- case NPY_FR_us:
-
- per_day = 86400000000LL;
- per_sec = 1000LL * 1000LL;
-
- // put frac in seconds
- if (td < 0 && td % per_sec != 0)
- frac = td / per_sec - 1;
- else
- frac = td / per_sec;
-
- if (frac < 0) {
- sign = -1;
-
- // even fraction
- if ((-frac % 86400LL) != 0) {
- out->days = -frac / 86400LL + 1;
- frac += 86400LL * out->days;
- } else {
- frac = -frac;
- }
- } else {
- sign = 1;
- out->days = 0;
- }
-
- if (frac >= 86400) {
- out->days += frac / 86400LL;
- frac -= out->days * 86400LL;
- }
-
- if (frac >= 3600) {
- out->hrs = frac / 3600LL;
- frac -= out->hrs * 3600LL;
- } else {
- out->hrs = 0;
- }
-
- if (frac >= 60) {
- out->min = frac / 60LL;
- frac -= out->min * 60LL;
- } else {
- out->min = 0;
- }
-
- if (frac >= 0) {
- out->sec = frac;
- frac -= out->sec;
- } else {
- out->sec = 0;
- }
-
- sfrac = (out->hrs * 3600LL + out->min * 60LL
- + out->sec) * per_sec;
-
- if (sign < 0)
- out->days = -out->days;
-
- ifrac = td - (out->days * per_day + sfrac);
-
- if (ifrac != 0) {
- out->ms = ifrac / 1000LL;
- ifrac -= out->ms * 1000LL;
- out->us = ifrac / 1L;
- ifrac -= out->us * 1L;
- out->ns = ifrac;
- } else {
- out->ms = 0;
- out->us = 0;
- out->ns = 0;
- }
- break;
-
- case NPY_FR_ms:
-
- per_day = 86400000LL;
- per_sec = 1000LL;
-
- // put frac in seconds
- if (td < 0 && td % per_sec != 0)
- frac = td / per_sec - 1;
- else
- frac = td / per_sec;
-
- if (frac < 0) {
- sign = -1;
-
- // even fraction
- if ((-frac % 86400LL) != 0) {
- out->days = -frac / 86400LL + 1;
- frac += 86400LL * out->days;
- } else {
- frac = -frac;
- }
- } else {
- sign = 1;
- out->days = 0;
- }
-
- if (frac >= 86400) {
- out->days += frac / 86400LL;
- frac -= out->days * 86400LL;
- }
-
- if (frac >= 3600) {
- out->hrs = frac / 3600LL;
- frac -= out->hrs * 3600LL;
- } else {
- out->hrs = 0;
- }
-
- if (frac >= 60) {
- out->min = frac / 60LL;
- frac -= out->min * 60LL;
- } else {
- out->min = 0;
- }
-
- if (frac >= 0) {
- out->sec = frac;
- frac -= out->sec;
- } else {
- out->sec = 0;
- }
-
- sfrac = (out->hrs * 3600LL + out->min * 60LL
- + out->sec) * per_sec;
-
- if (sign < 0)
- out->days = -out->days;
-
- ifrac = td - (out->days * per_day + sfrac);
-
- if (ifrac != 0) {
- out->ms = ifrac;
- out->us = 0;
- out->ns = 0;
- } else {
- out->ms = 0;
- out->us = 0;
- out->ns = 0;
- }
- break;
-
- case NPY_FR_s:
- // special case where we can simplify many expressions bc per_sec=1
-
- per_day = 86400LL;
- per_sec = 1L;
-
- // put frac in seconds
- if (td < 0 && td % per_sec != 0)
- frac = td / per_sec - 1;
- else
- frac = td / per_sec;
-
- if (frac < 0) {
- sign = -1;
-
- // even fraction
- if ((-frac % 86400LL) != 0) {
- out->days = -frac / 86400LL + 1;
- frac += 86400LL * out->days;
- } else {
- frac = -frac;
- }
- } else {
- sign = 1;
- out->days = 0;
- }
-
- if (frac >= 86400) {
- out->days += frac / 86400LL;
- frac -= out->days * 86400LL;
- }
-
- if (frac >= 3600) {
- out->hrs = frac / 3600LL;
- frac -= out->hrs * 3600LL;
- } else {
- out->hrs = 0;
- }
-
- if (frac >= 60) {
- out->min = frac / 60LL;
- frac -= out->min * 60LL;
- } else {
- out->min = 0;
- }
-
- if (frac >= 0) {
- out->sec = frac;
- frac -= out->sec;
- } else {
- out->sec = 0;
- }
-
- sfrac = (out->hrs * 3600LL + out->min * 60LL
- + out->sec) * per_sec;
-
- if (sign < 0)
- out->days = -out->days;
-
- ifrac = td - (out->days * per_day + sfrac);
-
- if (ifrac != 0) {
- out->ms = 0;
- out->us = 0;
- out->ns = 0;
- } else {
- out->ms = 0;
- out->us = 0;
- out->ns = 0;
- }
- break;
-
- case NPY_FR_m:
-
- out->days = td / 1440LL;
- td -= out->days * 1440LL;
- out->hrs = td / 60LL;
- td -= out->hrs * 60LL;
- out->min = td;
-
- out->sec = 0;
- out->ms = 0;
- out->us = 0;
- out->ns = 0;
- break;
-
- case NPY_FR_h:
- out->days = td / 24LL;
- td -= out->days * 24LL;
- out->hrs = td;
-
- out->min = 0;
- out->sec = 0;
- out->ms = 0;
- out->us = 0;
- out->ns = 0;
- break;
-
- case NPY_FR_D:
- out->days = td;
- out->hrs = 0;
- out->min = 0;
- out->sec = 0;
- out->ms = 0;
- out->us = 0;
- out->ns = 0;
- break;
-
- case NPY_FR_W:
- out->days = 7 * td;
- out->hrs = 0;
- out->min = 0;
- out->sec = 0;
- out->ms = 0;
- out->us = 0;
- out->ns = 0;
- break;
-
- default:
- PyErr_SetString(PyExc_RuntimeError,
- "NumPy timedelta metadata is corrupted with "
- "invalid base unit");
- }
-
- out->seconds = out->hrs * 3600 + out->min * 60 + out->sec;
- out->microseconds = out->ms * 1000 + out->us;
- out->nanoseconds = out->ns;
-}
+ npy_int64 frac;
+ npy_int64 sfrac;
+ npy_int64 ifrac;
+ int sign;
+ npy_int64 per_day;
+ npy_int64 per_sec;
+
+ /* Initialize the output to all zeros */
+ memset(out, 0, sizeof(pandas_timedeltastruct));
+
+ switch (base) {
+ case NPY_FR_ns:
+
+ per_day = 86400000000000LL;
+ per_sec = 1000LL * 1000LL * 1000LL;
+
+ // put frac in seconds
+ if (td < 0 && td % per_sec != 0)
+ frac = td / per_sec - 1;
+ else
+ frac = td / per_sec;
+
+ if (frac < 0) {
+ sign = -1;
+
+ // even fraction
+ if ((-frac % 86400LL) != 0) {
+ out->days = -frac / 86400LL + 1;
+ frac += 86400LL * out->days;
+ } else {
+ frac = -frac;
+ }
+ } else {
+ sign = 1;
+ out->days = 0;
+ }
+ if (frac >= 86400) {
+ out->days += frac / 86400LL;
+ frac -= out->days * 86400LL;
+ }
+
+ if (frac >= 3600) {
+ out->hrs = (npy_int32)(frac / 3600LL);
+ frac -= out->hrs * 3600LL;
+ } else {
+ out->hrs = 0;
+ }
+
+ if (frac >= 60) {
+ out->min = (npy_int32)(frac / 60LL);
+ frac -= out->min * 60LL;
+ } else {
+ out->min = 0;
+ }
+
+ if (frac >= 0) {
+ out->sec = (npy_int32)frac;
+ frac -= out->sec;
+ } else {
+ out->sec = 0;
+ }
+
+ sfrac = (out->hrs * 3600LL + out->min * 60LL + out->sec) * per_sec;
+
+ if (sign < 0)
+ out->days = -out->days;
+
+ ifrac = td - (out->days * per_day + sfrac);
+
+ if (ifrac != 0) {
+ out->ms = (npy_int32)(ifrac / (1000LL * 1000LL));
+ ifrac -= out->ms * 1000LL * 1000LL;
+ out->us = (npy_int32)(ifrac / 1000LL);
+ ifrac -= out->us * 1000LL;
+ out->ns = (npy_int32)ifrac;
+ } else {
+ out->ms = 0;
+ out->us = 0;
+ out->ns = 0;
+ }
+ break;
+
+ case NPY_FR_us:
+
+ per_day = 86400000000LL;
+ per_sec = 1000LL * 1000LL;
+
+ // put frac in seconds
+ if (td < 0 && td % per_sec != 0)
+ frac = td / per_sec - 1;
+ else
+ frac = td / per_sec;
+
+ if (frac < 0) {
+ sign = -1;
+
+ // even fraction
+ if ((-frac % 86400LL) != 0) {
+ out->days = -frac / 86400LL + 1;
+ frac += 86400LL * out->days;
+ } else {
+ frac = -frac;
+ }
+ } else {
+ sign = 1;
+ out->days = 0;
+ }
+
+ if (frac >= 86400) {
+ out->days += frac / 86400LL;
+ frac -= out->days * 86400LL;
+ }
+
+ if (frac >= 3600) {
+ out->hrs = (npy_int32)(frac / 3600LL);
+ frac -= out->hrs * 3600LL;
+ } else {
+ out->hrs = 0;
+ }
+
+ if (frac >= 60) {
+ out->min = (npy_int32)(frac / 60LL);
+ frac -= out->min * 60LL;
+ } else {
+ out->min = 0;
+ }
+
+ if (frac >= 0) {
+ out->sec = (npy_int32)frac;
+ frac -= out->sec;
+ } else {
+ out->sec = 0;
+ }
+
+ sfrac = (out->hrs * 3600LL + out->min * 60LL + out->sec) * per_sec;
+
+ if (sign < 0)
+ out->days = -out->days;
+
+ ifrac = td - (out->days * per_day + sfrac);
+
+ if (ifrac != 0) {
+ out->ms = (npy_int32)(ifrac / 1000LL);
+ ifrac -= out->ms * 1000LL;
+ out->us = (npy_int32)(ifrac / 1L);
+ ifrac -= out->us * 1L;
+ out->ns = (npy_int32)ifrac;
+ } else {
+ out->ms = 0;
+ out->us = 0;
+ out->ns = 0;
+ }
+ break;
+
+ case NPY_FR_ms:
+
+ per_day = 86400000LL;
+ per_sec = 1000LL;
+
+ // put frac in seconds
+ if (td < 0 && td % per_sec != 0)
+ frac = td / per_sec - 1;
+ else
+ frac = td / per_sec;
+
+ if (frac < 0) {
+ sign = -1;
+
+ // even fraction
+ if ((-frac % 86400LL) != 0) {
+ out->days = -frac / 86400LL + 1;
+ frac += 86400LL * out->days;
+ } else {
+ frac = -frac;
+ }
+ } else {
+ sign = 1;
+ out->days = 0;
+ }
+
+ if (frac >= 86400) {
+ out->days += frac / 86400LL;
+ frac -= out->days * 86400LL;
+ }
+
+ if (frac >= 3600) {
+ out->hrs = (npy_int32)(frac / 3600LL);
+ frac -= out->hrs * 3600LL;
+ } else {
+ out->hrs = 0;
+ }
+
+ if (frac >= 60) {
+ out->min = (npy_int32)(frac / 60LL);
+ frac -= out->min * 60LL;
+ } else {
+ out->min = 0;
+ }
+
+ if (frac >= 0) {
+ out->sec = (npy_int32)frac;
+ frac -= out->sec;
+ } else {
+ out->sec = 0;
+ }
+
+ sfrac = (out->hrs * 3600LL + out->min * 60LL + out->sec) * per_sec;
+
+ if (sign < 0)
+ out->days = -out->days;
+
+ ifrac = td - (out->days * per_day + sfrac);
+
+ if (ifrac != 0) {
+ out->ms = (npy_int32)ifrac;
+ out->us = 0;
+ out->ns = 0;
+ } else {
+ out->ms = 0;
+ out->us = 0;
+ out->ns = 0;
+ }
+ break;
+
+ case NPY_FR_s:
+ // special case where we can simplify many expressions bc per_sec=1
+
+ per_day = 86400LL;
+ per_sec = 1L;
+
+ // put frac in seconds
+ if (td < 0 && td % per_sec != 0)
+ frac = td / per_sec - 1;
+ else
+ frac = td / per_sec;
+
+ if (frac < 0) {
+ sign = -1;
+
+ // even fraction
+ if ((-frac % 86400LL) != 0) {
+ out->days = -frac / 86400LL + 1;
+ frac += 86400LL * out->days;
+ } else {
+ frac = -frac;
+ }
+ } else {
+ sign = 1;
+ out->days = 0;
+ }
+
+ if (frac >= 86400) {
+ out->days += frac / 86400LL;
+ frac -= out->days * 86400LL;
+ }
+
+ if (frac >= 3600) {
+ out->hrs = (npy_int32)(frac / 3600LL);
+ frac -= out->hrs * 3600LL;
+ } else {
+ out->hrs = 0;
+ }
+
+ if (frac >= 60) {
+ out->min = (npy_int32)(frac / 60LL);
+ frac -= out->min * 60LL;
+ } else {
+ out->min = 0;
+ }
+
+ if (frac >= 0) {
+ out->sec = (npy_int32)frac;
+ frac -= out->sec;
+ } else {
+ out->sec = 0;
+ }
+
+ sfrac = (out->hrs * 3600LL + out->min * 60LL + out->sec) * per_sec;
+
+ if (sign < 0)
+ out->days = -out->days;
+
+ ifrac = td - (out->days * per_day + sfrac);
+
+ if (ifrac != 0) {
+ out->ms = 0;
+ out->us = 0;
+ out->ns = 0;
+ } else {
+ out->ms = 0;
+ out->us = 0;
+ out->ns = 0;
+ }
+ break;
+
+ case NPY_FR_m:
+
+ out->days = td / 1440LL;
+ td -= out->days * 1440LL;
+ out->hrs = (npy_int32)(td / 60LL);
+ td -= out->hrs * 60LL;
+ out->min = (npy_int32)td;
+
+ out->sec = 0;
+ out->ms = 0;
+ out->us = 0;
+ out->ns = 0;
+ break;
+
+ case NPY_FR_h:
+ out->days = td / 24LL;
+ td -= out->days * 24LL;
+ out->hrs = (npy_int32)td;
+
+ out->min = 0;
+ out->sec = 0;
+ out->ms = 0;
+ out->us = 0;
+ out->ns = 0;
+ break;
+
+ case NPY_FR_D:
+ out->days = td;
+ out->hrs = 0;
+ out->min = 0;
+ out->sec = 0;
+ out->ms = 0;
+ out->us = 0;
+ out->ns = 0;
+ break;
+
+ case NPY_FR_W:
+ out->days = 7 * td;
+ out->hrs = 0;
+ out->min = 0;
+ out->sec = 0;
+ out->ms = 0;
+ out->us = 0;
+ out->ns = 0;
+ break;
+
+ default:
+ PyErr_SetString(PyExc_RuntimeError,
+ "NumPy timedelta metadata is corrupted with "
+ "invalid base unit");
+ }
+
+ out->seconds = out->hrs * 3600 + out->min * 60 + out->sec;
+ out->microseconds = out->ms * 1000 + out->us;
+ out->nanoseconds = out->ns;
+}
/*
* This function returns a pointer to the DateTimeMetaData
@@ -943,5 +1060,5 @@ void pandas_timedelta_to_timedeltastruct(npy_timedelta td,
*/
PyArray_DatetimeMetaData
get_datetime_metadata_from_dtype(PyArray_Descr *dtype) {
- return (((PyArray_DatetimeDTypeMetaData *)dtype->c_metadata)->meta);
+ return (((PyArray_DatetimeDTypeMetaData *)dtype->c_metadata)->meta);
}
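The reformatted pandas_timedelta_to_timedeltastruct above keeps its original convention: the sign lives entirely in the days field, while hours through nanoseconds are always non-negative. The following standalone sketch reproduces that floor-division arithmetic for the nanosecond resolution only; it uses just the C standard library, and the name decompose_ns is illustrative rather than part of pandas.

#include <stdint.h>
#include <stdio.h>

/* Minimal sketch (not the vendored code): decompose a signed nanosecond
 * timedelta the way the reformatted NPY_FR_ns branch does -- whole days
 * carry the sign, the sub-day remainder stays non-negative. */
static void decompose_ns(int64_t td, int64_t *days, int *h, int *m, int *s,
                         int *ms, int *us, int *ns) {
  const int64_t per_sec = 1000000000LL;
  const int64_t per_day = 86400LL * per_sec;

  /* floor-divide into whole days so the remainder is always >= 0 */
  int64_t d = td / per_day;
  if (td % per_day < 0) {
    d -= 1;
  }
  int64_t rem = td - d * per_day; /* 0 <= rem < per_day */

  int64_t sec = rem / per_sec;
  int64_t frac = rem % per_sec;

  *days = d;
  *h = (int)(sec / 3600);
  *m = (int)((sec % 3600) / 60);
  *s = (int)(sec % 60);
  *ms = (int)(frac / 1000000);
  *us = (int)((frac / 1000) % 1000);
  *ns = (int)(frac % 1000);
}

int main(void) {
  int64_t days;
  int h, m, s, ms, us, ns;
  /* -1 ns becomes -1 days + 23:59:59.999999999, the same convention the
   * vendored routine produces */
  decompose_ns(-1, &days, &h, &m, &s, &ms, &us, &ns);
  printf("%lld days %02d:%02d:%02d.%03d%03d%03d\n", (long long)days, h, m, s,
         ms, us, ns);
  return 0;
}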
diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c
index 629d88ca6f589..a46f5bc467c5d 100644
--- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c
+++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c
@@ -19,25 +19,26 @@ This file implements string parsing and creation for NumPy datetime.
*/
+// LICENSES/NUMPY_LICENSE
+
#define PY_SSIZE_T_CLEAN
#define NO_IMPORT
#ifndef NPY_NO_DEPRECATED_API
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
-#endif // NPY_NO_DEPRECATED_API
+#endif // NPY_NO_DEPRECATED_API
#include
#include
-#include
-#include
#include
+#include
+#include "pandas/portable.h"
#include "pandas/vendored/numpy/datetime/np_datetime.h"
#include "pandas/vendored/numpy/datetime/np_datetime_strings.h"
-
/*
* Parses (almost) standard ISO 8601 date strings. The differences are:
*
@@ -68,22 +69,19 @@ This file implements string parsing and creation for NumPy datetime.
*/
typedef enum {
- COMPARISON_SUCCESS,
- COMPLETED_PARTIAL_MATCH,
- COMPARISON_ERROR
+ COMPARISON_SUCCESS,
+ COMPLETED_PARTIAL_MATCH,
+ COMPARISON_ERROR
} DatetimePartParseResult;
// This function will advance the pointer on format
// and decrement characters_remaining by n on success
// On failure will return COMPARISON_ERROR without incrementing
// If `format_requirement` is PARTIAL_MATCH, and the `format` string has
// been exhausted, then return COMPLETED_PARTIAL_MATCH.
-static DatetimePartParseResult compare_format(
- const char **format,
- int *characters_remaining,
- const char *compare_to,
- int n,
- const FormatRequirement format_requirement
-) {
+static DatetimePartParseResult
+compare_format(const char **format, int *characters_remaining,
+ const char *compare_to, int n,
+ const FormatRequirement format_requirement) {
if (format_requirement == INFER_FORMAT) {
return COMPARISON_SUCCESS;
}
@@ -111,636 +109,651 @@ static DatetimePartParseResult compare_format(
int parse_iso_8601_datetime(const char *str, int len, int want_exc,
npy_datetimestruct *out,
- NPY_DATETIMEUNIT *out_bestunit,
- int *out_local, int *out_tzoffset,
- const char* format, int format_len,
+ NPY_DATETIMEUNIT *out_bestunit, int *out_local,
+ int *out_tzoffset, const char *format,
+ int format_len,
FormatRequirement format_requirement) {
- if (len < 0 || format_len < 0)
- goto parse_error;
- int year_leap = 0;
- int i, numdigits;
- const char *substr;
- int sublen;
- NPY_DATETIMEUNIT bestunit = NPY_FR_GENERIC;
- DatetimePartParseResult comparison;
-
- /* If year-month-day are separated by a valid separator,
- * months/days without leading zeroes will be parsed
- * (though not iso8601). If the components aren't separated,
- * 4 (YYYY) or 8 (YYYYMMDD) digits are expected. 6 digits are
- * forbidden here (but parsed as YYMMDD elsewhere).
- */
- int has_ymd_sep = 0;
- char ymd_sep = '\0';
- char valid_ymd_sep[] = {'-', '.', '/', '\\', ' '};
- int valid_ymd_sep_len = sizeof(valid_ymd_sep);
-
- /* hour-minute-second may or may not separated by ':'. If not, then
- * each component must be 2 digits. */
- int has_hms_sep = 0;
- int hour_was_2_digits = 0;
-
- /* Initialize the output to all zeros */
- memset(out, 0, sizeof(npy_datetimestruct));
- out->month = 1;
- out->day = 1;
-
- substr = str;
- sublen = len;
-
- /* Skip leading whitespace */
- while (sublen > 0 && isspace(*substr)) {
- ++substr;
- --sublen;
- comparison = compare_format(&format, &format_len, " ", 1, format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
- }
- }
-
- /* Leading '-' sign for negative year */
- if (*substr == '-') {
- ++substr;
- --sublen;
- }
-
- if (sublen == 0) {
- goto parse_error;
- }
-
- /* PARSE THE YEAR (4 digits) */
- comparison = compare_format(&format, &format_len, "%Y", 2, format_requirement);
+ if (len < 0 || format_len < 0)
+ goto parse_error;
+ int year_leap = 0;
+ int i, numdigits;
+ const char *substr;
+ int sublen;
+ NPY_DATETIMEUNIT bestunit = NPY_FR_GENERIC;
+ DatetimePartParseResult comparison;
+
+ /* If year-month-day are separated by a valid separator,
+ * months/days without leading zeroes will be parsed
+ * (though not iso8601). If the components aren't separated,
+ * 4 (YYYY) or 8 (YYYYMMDD) digits are expected. 6 digits are
+ * forbidden here (but parsed as YYMMDD elsewhere).
+ */
+ int has_ymd_sep = 0;
+ char ymd_sep = '\0';
+ char valid_ymd_sep[] = {'-', '.', '/', '\\', ' '};
+ int valid_ymd_sep_len = sizeof(valid_ymd_sep);
+
+ /* hour-minute-second may or may not separated by ':'. If not, then
+ * each component must be 2 digits. */
+ int has_hms_sep = 0;
+ int hour_was_2_digits = 0;
+
+ /* Initialize the output to all zeros */
+ memset(out, 0, sizeof(npy_datetimestruct));
+ out->month = 1;
+ out->day = 1;
+
+ substr = str;
+ sublen = len;
+
+ /* Skip leading whitespace */
+ while (sublen > 0 && isspace(*substr)) {
+ ++substr;
+ --sublen;
+ comparison =
+ compare_format(&format, &format_len, " ", 1, format_requirement);
if (comparison == COMPARISON_ERROR) {
- goto parse_error;
+ goto parse_error;
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
+ goto finish;
}
+ }
- out->year = 0;
- if (sublen >= 4 && isdigit(substr[0]) && isdigit(substr[1]) &&
- isdigit(substr[2]) && isdigit(substr[3])) {
- out->year = 1000 * (substr[0] - '0') + 100 * (substr[1] - '0') +
- 10 * (substr[2] - '0') + (substr[3] - '0');
+ /* Leading '-' sign for negative year */
+ if (*substr == '-') {
+ ++substr;
+ --sublen;
+ }
- substr += 4;
- sublen -= 4;
- }
+ if (sublen == 0) {
+ goto parse_error;
+ }
- /* Negate the year if necessary */
- if (str[0] == '-') {
- out->year = -out->year;
- }
- /* Check whether it's a leap-year */
- year_leap = is_leapyear(out->year);
+ /* PARSE THE YEAR (4 digits) */
+ comparison =
+ compare_format(&format, &format_len, "%Y", 2, format_requirement);
+ if (comparison == COMPARISON_ERROR) {
+ goto parse_error;
+ } else if (comparison == COMPLETED_PARTIAL_MATCH) {
+ goto finish;
+ }
- /* Next character must be a separator, start of month, or end of string */
- if (sublen == 0) {
- if (out_local != NULL) {
- *out_local = 0;
- }
- if (format_len) {
- goto parse_error;
- }
- bestunit = NPY_FR_Y;
- goto finish;
- }
+ out->year = 0;
+ if (sublen >= 4 && isdigit(substr[0]) && isdigit(substr[1]) &&
+ isdigit(substr[2]) && isdigit(substr[3])) {
+ out->year = 1000 * (substr[0] - '0') + 100 * (substr[1] - '0') +
+ 10 * (substr[2] - '0') + (substr[3] - '0');
- if (!isdigit(*substr)) {
- for (i = 0; i < valid_ymd_sep_len; ++i) {
- if (*substr == valid_ymd_sep[i]) {
- break;
- }
- }
- if (i == valid_ymd_sep_len) {
- goto parse_error;
- }
- has_ymd_sep = 1;
- ymd_sep = valid_ymd_sep[i];
- ++substr;
- --sublen;
+ substr += 4;
+ sublen -= 4;
+ }
- comparison = compare_format(&format, &format_len, &ymd_sep, 1,
- format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
- }
- /* Cannot have trailing separator */
- if (sublen == 0 || !isdigit(*substr)) {
- goto parse_error;
- }
- }
+ /* Negate the year if necessary */
+ if (str[0] == '-') {
+ out->year = -out->year;
+ }
+ /* Check whether it's a leap-year */
+ year_leap = is_leapyear(out->year);
- /* PARSE THE MONTH */
- comparison = compare_format(&format, &format_len, "%m", 2, format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
+ /* Next character must be a separator, start of month, or end of string */
+ if (sublen == 0) {
+ if (out_local != NULL) {
+ *out_local = 0;
}
- /* First digit required */
- out->month = (*substr - '0');
- ++substr;
- --sublen;
- /* Second digit optional if there was a separator */
- if (isdigit(*substr)) {
- out->month = 10 * out->month + (*substr - '0');
- ++substr;
- --sublen;
- } else if (!has_ymd_sep) {
- goto parse_error;
- }
- if (out->month < 1 || out->month > 12) {
- if (want_exc) {
- PyErr_Format(PyExc_ValueError,
- "Month out of range in datetime string \"%s\"", str);
- }
- goto error;
+ if (format_len) {
+ goto parse_error;
}
+ bestunit = NPY_FR_Y;
+ goto finish;
+ }
- /* Next character must be the separator, start of day, or end of string */
- if (sublen == 0) {
- bestunit = NPY_FR_M;
- /* Forbid YYYYMM. Parsed instead as YYMMDD by someone else. */
- if (!has_ymd_sep) {
- goto parse_error;
- }
- if (format_len) {
- goto parse_error;
- }
- if (out_local != NULL) {
- *out_local = 0;
- }
- goto finish;
+ if (!isdigit(*substr)) {
+ for (i = 0; i < valid_ymd_sep_len; ++i) {
+ if (*substr == valid_ymd_sep[i]) {
+ break;
+ }
}
-
- if (has_ymd_sep) {
- /* Must have separator, but cannot be trailing */
- if (*substr != ymd_sep || sublen == 1) {
- goto parse_error;
- }
- ++substr;
- --sublen;
- comparison = compare_format(&format, &format_len, &ymd_sep, 1,
- format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
- }
+ if (i == valid_ymd_sep_len) {
+ goto parse_error;
}
+ has_ymd_sep = 1;
+ ymd_sep = valid_ymd_sep[i];
+ ++substr;
+ --sublen;
- /* PARSE THE DAY */
- comparison = compare_format(&format, &format_len, "%d", 2, format_requirement);
+ comparison =
+ compare_format(&format, &format_len, &ymd_sep, 1, format_requirement);
if (comparison == COMPARISON_ERROR) {
- goto parse_error;
+ goto parse_error;
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
+ goto finish;
}
- /* First digit required */
- if (!isdigit(*substr)) {
- goto parse_error;
+ /* Cannot have trailing separator */
+ if (sublen == 0 || !isdigit(*substr)) {
+ goto parse_error;
}
- out->day = (*substr - '0');
+ }
+
+ /* PARSE THE MONTH */
+ comparison =
+ compare_format(&format, &format_len, "%m", 2, format_requirement);
+ if (comparison == COMPARISON_ERROR) {
+ goto parse_error;
+ } else if (comparison == COMPLETED_PARTIAL_MATCH) {
+ goto finish;
+ }
+ /* First digit required */
+ out->month = (*substr - '0');
+ ++substr;
+ --sublen;
+ /* Second digit optional if there was a separator */
+ if (isdigit(*substr)) {
+ out->month = 10 * out->month + (*substr - '0');
++substr;
--sublen;
- /* Second digit optional if there was a separator */
- if (isdigit(*substr)) {
- out->day = 10 * out->day + (*substr - '0');
- ++substr;
- --sublen;
- } else if (!has_ymd_sep) {
- goto parse_error;
- }
- if (out->day < 1 ||
- out->day > days_per_month_table[year_leap][out->month - 1]) {
- if (want_exc) {
- PyErr_Format(PyExc_ValueError,
- "Day out of range in datetime string \"%s\"", str);
- }
- goto error;
+ } else if (!has_ymd_sep) {
+ goto parse_error;
+ }
+ if (out->month < 1 || out->month > 12) {
+ if (want_exc) {
+ PyErr_Format(PyExc_ValueError,
+ "Month out of range in datetime string \"%s\"", str);
}
+ goto error;
+ }
- /* Next character must be a 'T', ' ', or end of string */
- if (sublen == 0) {
- if (out_local != NULL) {
- *out_local = 0;
- }
- if (format_len) {
- goto parse_error;
- }
- bestunit = NPY_FR_D;
- goto finish;
+ /* Next character must be the separator, start of day, or end of string */
+ if (sublen == 0) {
+ bestunit = NPY_FR_M;
+ /* Forbid YYYYMM. Parsed instead as YYMMDD by someone else. */
+ if (!has_ymd_sep) {
+ goto parse_error;
}
+ if (format_len) {
+ goto parse_error;
+ }
+ if (out_local != NULL) {
+ *out_local = 0;
+ }
+ goto finish;
+ }
- if ((*substr != 'T' && *substr != ' ') || sublen == 1) {
- goto parse_error;
+ if (has_ymd_sep) {
+ /* Must have separator, but cannot be trailing */
+ if (*substr != ymd_sep || sublen == 1) {
+ goto parse_error;
}
- comparison = compare_format(&format, &format_len, substr, 1, format_requirement);
+ ++substr;
+ --sublen;
+ comparison =
+ compare_format(&format, &format_len, &ymd_sep, 1, format_requirement);
if (comparison == COMPARISON_ERROR) {
- goto parse_error;
+ goto parse_error;
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
+ goto finish;
}
+ }
+
+ /* PARSE THE DAY */
+ comparison =
+ compare_format(&format, &format_len, "%d", 2, format_requirement);
+ if (comparison == COMPARISON_ERROR) {
+ goto parse_error;
+ } else if (comparison == COMPLETED_PARTIAL_MATCH) {
+ goto finish;
+ }
+ /* First digit required */
+ if (!isdigit(*substr)) {
+ goto parse_error;
+ }
+ out->day = (*substr - '0');
+ ++substr;
+ --sublen;
+ /* Second digit optional if there was a separator */
+ if (isdigit(*substr)) {
+ out->day = 10 * out->day + (*substr - '0');
++substr;
--sublen;
+ } else if (!has_ymd_sep) {
+ goto parse_error;
+ }
+ if (out->day < 1 ||
+ out->day > days_per_month_table[year_leap][out->month - 1]) {
+ if (want_exc) {
+ PyErr_Format(PyExc_ValueError,
+ "Day out of range in datetime string \"%s\"", str);
+ }
+ goto error;
+ }
- /* PARSE THE HOURS */
- comparison = compare_format(&format, &format_len, "%H", 2, format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
+ /* Next character must be a 'T', ' ', or end of string */
+ if (sublen == 0) {
+ if (out_local != NULL) {
+ *out_local = 0;
}
- /* First digit required */
- if (!isdigit(*substr)) {
- goto parse_error;
+ if (format_len) {
+ goto parse_error;
}
- out->hour = (*substr - '0');
+ bestunit = NPY_FR_D;
+ goto finish;
+ }
+
+ if ((*substr != 'T' && *substr != ' ') || sublen == 1) {
+ goto parse_error;
+ }
+ comparison =
+ compare_format(&format, &format_len, substr, 1, format_requirement);
+ if (comparison == COMPARISON_ERROR) {
+ goto parse_error;
+ } else if (comparison == COMPLETED_PARTIAL_MATCH) {
+ goto finish;
+ }
+ ++substr;
+ --sublen;
+
+ /* PARSE THE HOURS */
+ comparison =
+ compare_format(&format, &format_len, "%H", 2, format_requirement);
+ if (comparison == COMPARISON_ERROR) {
+ goto parse_error;
+ } else if (comparison == COMPLETED_PARTIAL_MATCH) {
+ goto finish;
+ }
+ /* First digit required */
+ if (!isdigit(*substr)) {
+ goto parse_error;
+ }
+ out->hour = (*substr - '0');
+ bestunit = NPY_FR_h;
+ ++substr;
+ --sublen;
+ /* Second digit optional */
+ if (isdigit(*substr)) {
+ hour_was_2_digits = 1;
+ out->hour = 10 * out->hour + (*substr - '0');
++substr;
--sublen;
- /* Second digit optional */
- if (isdigit(*substr)) {
- hour_was_2_digits = 1;
- out->hour = 10 * out->hour + (*substr - '0');
- ++substr;
- --sublen;
- if (out->hour >= 24) {
- if (want_exc) {
- PyErr_Format(PyExc_ValueError,
- "Hours out of range in datetime string \"%s\"",
- str);
- }
- goto error;
- }
+ if (out->hour >= 24) {
+ if (want_exc) {
+ PyErr_Format(PyExc_ValueError,
+ "Hours out of range in datetime string \"%s\"", str);
+ }
+ goto error;
}
+ }
- /* Next character must be a ':' or the end of the string */
- if (sublen == 0) {
- if (!hour_was_2_digits) {
- goto parse_error;
- }
- if (format_len) {
- goto parse_error;
- }
- bestunit = NPY_FR_h;
- goto finish;
+ /* Next character must be a ':' or the end of the string */
+ if (sublen == 0) {
+ if (!hour_was_2_digits) {
+ goto parse_error;
}
-
- if (*substr == ':') {
- has_hms_sep = 1;
- ++substr;
- --sublen;
- /* Cannot have a trailing separator */
- if (sublen == 0 || !isdigit(*substr)) {
- goto parse_error;
- }
- comparison = compare_format(&format, &format_len, ":", 1, format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
- }
- } else if (!isdigit(*substr)) {
- if (!hour_was_2_digits) {
- goto parse_error;
- }
- goto parse_timezone;
+ if (format_len) {
+ goto parse_error;
}
+ bestunit = NPY_FR_h;
+ goto finish;
+ }
- /* PARSE THE MINUTES */
- comparison = compare_format(&format, &format_len, "%M", 2, format_requirement);
+ if (*substr == ':') {
+ has_hms_sep = 1;
+ ++substr;
+ --sublen;
+ /* Cannot have a trailing separator */
+ if (sublen == 0 || !isdigit(*substr)) {
+ goto parse_error;
+ }
+ comparison =
+ compare_format(&format, &format_len, ":", 1, format_requirement);
if (comparison == COMPARISON_ERROR) {
- goto parse_error;
+ goto parse_error;
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
+ goto finish;
}
- /* First digit required */
- out->min = (*substr - '0');
- ++substr;
- --sublen;
- /* Second digit optional if there was a separator */
- if (isdigit(*substr)) {
- out->min = 10 * out->min + (*substr - '0');
- ++substr;
- --sublen;
- if (out->min >= 60) {
- if (want_exc) {
- PyErr_Format(PyExc_ValueError,
- "Minutes out of range in datetime string \"%s\"",
- str);
- }
- goto error;
- }
- } else if (!has_hms_sep) {
- goto parse_error;
+ } else if (!isdigit(*substr)) {
+ if (!hour_was_2_digits) {
+ goto parse_error;
}
+ goto parse_timezone;
+ }
- if (sublen == 0) {
- bestunit = NPY_FR_m;
- if (format_len) {
- goto parse_error;
- }
- goto finish;
+ /* PARSE THE MINUTES */
+ comparison =
+ compare_format(&format, &format_len, "%M", 2, format_requirement);
+ if (comparison == COMPARISON_ERROR) {
+ goto parse_error;
+ } else if (comparison == COMPLETED_PARTIAL_MATCH) {
+ goto finish;
+ }
+ /* First digit required */
+ out->min = (*substr - '0');
+ bestunit = NPY_FR_m;
+ ++substr;
+ --sublen;
+ /* Second digit optional if there was a separator */
+ if (isdigit(*substr)) {
+ out->min = 10 * out->min + (*substr - '0');
+ ++substr;
+ --sublen;
+ if (out->min >= 60) {
+ if (want_exc) {
+ PyErr_Format(PyExc_ValueError,
+ "Minutes out of range in datetime string \"%s\"", str);
+ }
+ goto error;
}
+ } else if (!has_hms_sep) {
+ goto parse_error;
+ }
- /* If we make it through this condition block, then the next
- * character is a digit. */
- if (has_hms_sep && *substr == ':') {
- comparison = compare_format(&format, &format_len, ":", 1, format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
- }
- ++substr;
- --sublen;
- /* Cannot have a trailing ':' */
- if (sublen == 0 || !isdigit(*substr)) {
- goto parse_error;
- }
- } else if (!has_hms_sep && isdigit(*substr)) {
- } else {
- goto parse_timezone;
+ if (sublen == 0) {
+ bestunit = NPY_FR_m;
+ if (format_len) {
+ goto parse_error;
}
+ goto finish;
+ }
- /* PARSE THE SECONDS */
- comparison = compare_format(&format, &format_len, "%S", 2, format_requirement);
+ /* If we make it through this condition block, then the next
+ * character is a digit. */
+ if (has_hms_sep && *substr == ':') {
+ comparison =
+ compare_format(&format, &format_len, ":", 1, format_requirement);
if (comparison == COMPARISON_ERROR) {
- goto parse_error;
+ goto parse_error;
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
+ goto finish;
}
- /* First digit required */
- out->sec = (*substr - '0');
++substr;
--sublen;
- /* Second digit optional if there was a separator */
- if (isdigit(*substr)) {
- out->sec = 10 * out->sec + (*substr - '0');
- ++substr;
- --sublen;
- if (out->sec >= 60) {
- if (want_exc) {
- PyErr_Format(PyExc_ValueError,
- "Seconds out of range in datetime string \"%s\"",
- str);
- }
- goto error;
- }
- } else if (!has_hms_sep) {
- goto parse_error;
+ /* Cannot have a trailing ':' */
+ if (sublen == 0 || !isdigit(*substr)) {
+ goto parse_error;
}
+ } else if (!has_hms_sep && isdigit(*substr)) {
+ } else {
+ goto parse_timezone;
+ }
- /* Next character may be a '.' indicating fractional seconds */
- if (sublen > 0 && *substr == '.') {
- ++substr;
- --sublen;
- comparison = compare_format(&format, &format_len, ".", 1, format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
- }
- } else {
- bestunit = NPY_FR_s;
- goto parse_timezone;
+ /* PARSE THE SECONDS */
+ comparison =
+ compare_format(&format, &format_len, "%S", 2, format_requirement);
+ if (comparison == COMPARISON_ERROR) {
+ goto parse_error;
+ } else if (comparison == COMPLETED_PARTIAL_MATCH) {
+ goto finish;
+ }
+ /* First digit required */
+ out->sec = (*substr - '0');
+ ++substr;
+ --sublen;
+ /* Second digit optional if there was a separator */
+ if (isdigit(*substr)) {
+ out->sec = 10 * out->sec + (*substr - '0');
+ ++substr;
+ --sublen;
+ if (out->sec >= 60) {
+ if (want_exc) {
+ PyErr_Format(PyExc_ValueError,
+ "Seconds out of range in datetime string \"%s\"", str);
+ }
+ goto error;
}
+ } else if (!has_hms_sep) {
+ goto parse_error;
+ }
- /* PARSE THE MICROSECONDS (0 to 6 digits) */
- comparison = compare_format(&format, &format_len, "%f", 2, format_requirement);
+ /* Next character may be a '.' indicating fractional seconds */
+ if (sublen > 0 && *substr == '.') {
+ ++substr;
+ --sublen;
+ comparison =
+ compare_format(&format, &format_len, ".", 1, format_requirement);
if (comparison == COMPARISON_ERROR) {
- goto parse_error;
+ goto parse_error;
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
- }
- numdigits = 0;
- for (i = 0; i < 6; ++i) {
- out->us *= 10;
- if (sublen > 0 && isdigit(*substr)) {
- out->us += (*substr - '0');
- ++substr;
- --sublen;
- ++numdigits;
- }
- }
-
- if (sublen == 0 || !isdigit(*substr)) {
- if (numdigits > 3) {
- bestunit = NPY_FR_us;
- } else {
- bestunit = NPY_FR_ms;
- }
- goto parse_timezone;
+ goto finish;
}
+ } else {
+ bestunit = NPY_FR_s;
+ goto parse_timezone;
+ }
- /* PARSE THE PICOSECONDS (0 to 6 digits) */
- numdigits = 0;
- for (i = 0; i < 6; ++i) {
- out->ps *= 10;
- if (sublen > 0 && isdigit(*substr)) {
- out->ps += (*substr - '0');
- ++substr;
- --sublen;
- ++numdigits;
- }
+ /* PARSE THE MICROSECONDS (0 to 6 digits) */
+ comparison =
+ compare_format(&format, &format_len, "%f", 2, format_requirement);
+ if (comparison == COMPARISON_ERROR) {
+ goto parse_error;
+ } else if (comparison == COMPLETED_PARTIAL_MATCH) {
+ goto finish;
+ }
+ numdigits = 0;
+ for (i = 0; i < 6; ++i) {
+ out->us *= 10;
+ if (sublen > 0 && isdigit(*substr)) {
+ out->us += (*substr - '0');
+ ++substr;
+ --sublen;
+ ++numdigits;
}
+ }
- if (sublen == 0 || !isdigit(*substr)) {
- if (numdigits > 3) {
- bestunit = NPY_FR_ps;
- } else {
- bestunit = NPY_FR_ns;
- }
- goto parse_timezone;
+ if (sublen == 0 || !isdigit(*substr)) {
+ if (numdigits > 3) {
+ bestunit = NPY_FR_us;
+ } else {
+ bestunit = NPY_FR_ms;
}
+ goto parse_timezone;
+ }
- /* PARSE THE ATTOSECONDS (0 to 6 digits) */
- numdigits = 0;
- for (i = 0; i < 6; ++i) {
- out->as *= 10;
- if (sublen > 0 && isdigit(*substr)) {
- out->as += (*substr - '0');
- ++substr;
- --sublen;
- ++numdigits;
- }
+ /* PARSE THE PICOSECONDS (0 to 6 digits) */
+ numdigits = 0;
+ for (i = 0; i < 6; ++i) {
+ out->ps *= 10;
+ if (sublen > 0 && isdigit(*substr)) {
+ out->ps += (*substr - '0');
+ ++substr;
+ --sublen;
+ ++numdigits;
}
+ }
+ if (sublen == 0 || !isdigit(*substr)) {
if (numdigits > 3) {
- bestunit = NPY_FR_as;
+ bestunit = NPY_FR_ps;
} else {
- bestunit = NPY_FR_fs;
+ bestunit = NPY_FR_ns;
}
+ goto parse_timezone;
+ }
-parse_timezone:
- /* trim any whitespace between time/timezone */
- while (sublen > 0 && isspace(*substr)) {
- ++substr;
- --sublen;
- comparison = compare_format(&format, &format_len, " ", 1, format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
- }
+ /* PARSE THE ATTOSECONDS (0 to 6 digits) */
+ numdigits = 0;
+ for (i = 0; i < 6; ++i) {
+ out->as *= 10;
+ if (sublen > 0 && isdigit(*substr)) {
+ out->as += (*substr - '0');
+ ++substr;
+ --sublen;
+ ++numdigits;
}
+ }
- if (sublen == 0) {
- // Unlike NumPy, treating no time zone as naive
- if (format_len > 0) {
- goto parse_error;
- }
- goto finish;
- }
+ if (numdigits > 3) {
+ bestunit = NPY_FR_as;
+ } else {
+ bestunit = NPY_FR_fs;
+ }
- /* UTC specifier */
- if (*substr == 'Z') {
- comparison = compare_format(&format, &format_len, "%z", 2, format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
- }
- /* "Z" should be equivalent to tz offset "+00:00" */
- if (out_local != NULL) {
- *out_local = 1;
- }
+parse_timezone:
+ /* trim any whitespace between time/timezone */
+ while (sublen > 0 && isspace(*substr)) {
+ ++substr;
+ --sublen;
+ comparison =
+ compare_format(&format, &format_len, " ", 1, format_requirement);
+ if (comparison == COMPARISON_ERROR) {
+ goto parse_error;
+ } else if (comparison == COMPLETED_PARTIAL_MATCH) {
+ goto finish;
+ }
+ }
- if (out_tzoffset != NULL) {
- *out_tzoffset = 0;
- }
+ if (sublen == 0) {
+ // Unlike NumPy, treating no time zone as naive
+ if (format_len > 0) {
+ goto parse_error;
+ }
+ goto finish;
+ }
- if (sublen == 1) {
- if (format_len > 0) {
- goto parse_error;
- }
- goto finish;
- } else {
- ++substr;
- --sublen;
- }
- } else if (*substr == '-' || *substr == '+') {
- comparison = compare_format(&format, &format_len, "%z", 2, format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
- }
- /* Time zone offset */
- int offset_neg = 0, offset_hour = 0, offset_minute = 0;
+ /* UTC specifier */
+ if (*substr == 'Z') {
+ comparison =
+ compare_format(&format, &format_len, "%z", 2, format_requirement);
+ if (comparison == COMPARISON_ERROR) {
+ goto parse_error;
+ } else if (comparison == COMPLETED_PARTIAL_MATCH) {
+ goto finish;
+ }
+ /* "Z" should be equivalent to tz offset "+00:00" */
+ if (out_local != NULL) {
+ *out_local = 1;
+ }
- /*
- * Since "local" means local with respect to the current
- * machine, we say this is non-local.
- */
+ if (out_tzoffset != NULL) {
+ *out_tzoffset = 0;
+ }
- if (*substr == '-') {
- offset_neg = 1;
- }
- ++substr;
- --sublen;
+ if (sublen == 1) {
+ if (format_len > 0) {
+ goto parse_error;
+ }
+ goto finish;
+ } else {
+ ++substr;
+ --sublen;
+ }
+ } else if (*substr == '-' || *substr == '+') {
+ comparison =
+ compare_format(&format, &format_len, "%z", 2, format_requirement);
+ if (comparison == COMPARISON_ERROR) {
+ goto parse_error;
+ } else if (comparison == COMPLETED_PARTIAL_MATCH) {
+ goto finish;
+ }
+ /* Time zone offset */
+ int offset_neg = 0, offset_hour = 0, offset_minute = 0;
- /* The hours offset */
- if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
- offset_hour = 10 * (substr[0] - '0') + (substr[1] - '0');
- substr += 2;
- sublen -= 2;
- if (offset_hour >= 24) {
- if (want_exc) {
- PyErr_Format(PyExc_ValueError,
- "Timezone hours offset out of range "
- "in datetime string \"%s\"",
- str);
- }
- goto error;
- }
- } else if (sublen >= 1 && isdigit(substr[0])) {
- offset_hour = substr[0] - '0';
- ++substr;
- --sublen;
- } else {
- goto parse_error;
- }
+ /*
+ * Since "local" means local with respect to the current
+ * machine, we say this is non-local.
+ */
- /* The minutes offset is optional */
- if (sublen > 0) {
- /* Optional ':' */
- if (*substr == ':') {
- ++substr;
- --sublen;
- }
-
- /* The minutes offset (at the end of the string) */
- if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
- offset_minute = 10 * (substr[0] - '0') + (substr[1] - '0');
- substr += 2;
- sublen -= 2;
- if (offset_minute >= 60) {
- if (want_exc) {
- PyErr_Format(PyExc_ValueError,
- "Timezone minutes offset out of range "
- "in datetime string \"%s\"",
- str);
- }
- goto error;
- }
- } else if (sublen >= 1 && isdigit(substr[0])) {
- offset_minute = substr[0] - '0';
- ++substr;
- --sublen;
- } else {
- goto parse_error;
- }
- }
+ if (*substr == '-') {
+ offset_neg = 1;
+ }
+ ++substr;
+ --sublen;
- /* Apply the time zone offset */
- if (offset_neg) {
- offset_hour = -offset_hour;
- offset_minute = -offset_minute;
- }
- if (out_local != NULL) {
- *out_local = 1;
- // Unlike NumPy, do not change internal value to local time
- *out_tzoffset = 60 * offset_hour + offset_minute;
+ /* The hours offset */
+ if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
+ offset_hour = 10 * (substr[0] - '0') + (substr[1] - '0');
+ substr += 2;
+ sublen -= 2;
+ if (offset_hour >= 24) {
+ if (want_exc) {
+ PyErr_Format(PyExc_ValueError,
+ "Timezone hours offset out of range "
+ "in datetime string \"%s\"",
+ str);
}
+ goto error;
+ }
+ } else if (sublen >= 1 && isdigit(substr[0])) {
+ offset_hour = substr[0] - '0';
+ ++substr;
+ --sublen;
+ } else {
+ goto parse_error;
}
- /* Skip trailing whitespace */
- while (sublen > 0 && isspace(*substr)) {
+ /* The minutes offset is optional */
+ if (sublen > 0) {
+ /* Optional ':' */
+ if (*substr == ':') {
++substr;
--sublen;
- comparison = compare_format(&format, &format_len, " ", 1, format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
+ }
+
+ /* The minutes offset (at the end of the string) */
+ if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
+ offset_minute = 10 * (substr[0] - '0') + (substr[1] - '0');
+ substr += 2;
+ sublen -= 2;
+ if (offset_minute >= 60) {
+ if (want_exc) {
+ PyErr_Format(PyExc_ValueError,
+ "Timezone minutes offset out of range "
+ "in datetime string \"%s\"",
+ str);
+ }
+ goto error;
}
+ } else if (sublen >= 1 && isdigit(substr[0])) {
+ offset_minute = substr[0] - '0';
+ ++substr;
+ --sublen;
+ } else {
+ goto parse_error;
+ }
}
- if ((sublen != 0) || (format_len != 0)) {
- goto parse_error;
+ /* Apply the time zone offset */
+ if (offset_neg) {
+ offset_hour = -offset_hour;
+ offset_minute = -offset_minute;
+ }
+ if (out_local != NULL) {
+ *out_local = 1;
+ // Unlike NumPy, do not change internal value to local time
+ *out_tzoffset = 60 * offset_hour + offset_minute;
}
+ }
-finish:
- if (out_bestunit != NULL) {
- *out_bestunit = bestunit;
+ /* Skip trailing whitespace */
+ while (sublen > 0 && isspace(*substr)) {
+ ++substr;
+ --sublen;
+ comparison =
+ compare_format(&format, &format_len, " ", 1, format_requirement);
+ if (comparison == COMPARISON_ERROR) {
+ goto parse_error;
+ } else if (comparison == COMPLETED_PARTIAL_MATCH) {
+ goto finish;
}
- return 0;
+ }
+
+ if ((sublen != 0) || (format_len != 0)) {
+ goto parse_error;
+ }
+
+finish:
+ if (out_bestunit != NULL) {
+ *out_bestunit = bestunit;
+ }
+ return 0;
parse_error:
- if (want_exc) {
- PyErr_Format(PyExc_ValueError,
- "Error parsing datetime string \"%s\" at position %d", str,
- (int)(substr - str));
- }
- return -1;
+ if (want_exc) {
+ PyErr_Format(PyExc_ValueError,
+ "Error parsing datetime string \"%s\" at position %d", str,
+ (int)(substr - str));
+ }
+ return -1;
error:
- return -1;
+ return -1;
}
/*
@@ -748,56 +761,66 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
* objects with the given local and unit settings.
*/
int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base) {
- int len = 0;
-
- switch (base) {
- /* Generic units can only be used to represent NaT */
- /* return 4;*/
- case NPY_FR_as:
- len += 3; /* "###" */
- case NPY_FR_fs:
- len += 3; /* "###" */
- case NPY_FR_ps:
- len += 3; /* "###" */
- case NPY_FR_ns:
- len += 3; /* "###" */
- case NPY_FR_us:
- len += 3; /* "###" */
- case NPY_FR_ms:
- len += 4; /* ".###" */
- case NPY_FR_s:
- len += 3; /* ":##" */
- case NPY_FR_m:
- len += 3; /* ":##" */
- case NPY_FR_h:
- len += 3; /* "T##" */
- case NPY_FR_D:
- case NPY_FR_W:
- len += 3; /* "-##" */
- case NPY_FR_M:
- len += 3; /* "-##" */
- case NPY_FR_Y:
- len += 21; /* 64-bit year */
- break;
- default:
- len += 3; /* handle the now defunct NPY_FR_B */
- break;
- }
+ int len = 0;
+
+ switch (base) {
+ /* Generic units can only be used to represent NaT */
+ /* return 4;*/
+ case NPY_FR_as:
+ len += 3; /* "###" */
+ PD_FALLTHROUGH;
+ case NPY_FR_fs:
+ len += 3; /* "###" */
+ PD_FALLTHROUGH;
+ case NPY_FR_ps:
+ len += 3; /* "###" */
+ PD_FALLTHROUGH;
+ case NPY_FR_ns:
+ len += 3; /* "###" */
+ PD_FALLTHROUGH;
+ case NPY_FR_us:
+ len += 3; /* "###" */
+ PD_FALLTHROUGH;
+ case NPY_FR_ms:
+ len += 4; /* ".###" */
+ PD_FALLTHROUGH;
+ case NPY_FR_s:
+ len += 3; /* ":##" */
+ PD_FALLTHROUGH;
+ case NPY_FR_m:
+ len += 3; /* ":##" */
+ PD_FALLTHROUGH;
+ case NPY_FR_h:
+ len += 3; /* "T##" */
+ PD_FALLTHROUGH;
+ case NPY_FR_D:
+ case NPY_FR_W:
+ len += 3; /* "-##" */
+ PD_FALLTHROUGH;
+ case NPY_FR_M:
+ len += 3; /* "-##" */
+ PD_FALLTHROUGH;
+ case NPY_FR_Y:
+ len += 21; /* 64-bit year */
+ break;
+ default:
+ len += 3; /* handle the now defunct NPY_FR_B */
+ break;
+ }
- if (base >= NPY_FR_h) {
- if (local) {
- len += 5; /* "+####" or "-####" */
- } else {
- len += 1; /* "Z" */
- }
+ if (base >= NPY_FR_h) {
+ if (local) {
+ len += 5; /* "+####" or "-####" */
+ } else {
+ len += 1; /* "Z" */
}
+ }
- len += 1; /* NULL terminator */
+ len += 1; /* NULL terminator */
- return len;
+ return len;
}
-
/*
* Converts an npy_datetimestruct to an (almost) ISO 8601
* NULL-terminated string using timezone Z (UTC). If the string fits in
@@ -812,21 +835,21 @@ int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base) {
* Returns 0 on success, -1 on failure (for example if the output
* string was too short).
*/
-int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
+int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, size_t outlen,
int utc, NPY_DATETIMEUNIT base) {
- char *substr = outstr;
- int sublen = outlen;
- int tmplen;
-
- /*
- * Print weeks with the same precision as days.
- *
- * TODO: Could print weeks with YYYY-Www format if the week
- * epoch is a Monday.
- */
- if (base == NPY_FR_W) {
- base = NPY_FR_D;
- }
+ char *substr = outstr;
+ size_t sublen = outlen;
+ int tmplen;
+
+ /*
+ * Print weeks with the same precision as days.
+ *
+ * TODO: Could print weeks with YYYY-Www format if the week
+ * epoch is a Monday.
+ */
+ if (base == NPY_FR_W) {
+ base = NPY_FR_D;
+ }
/* YEAR */
/*
@@ -835,314 +858,309 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
* to have data all the way to the end of the buffer.
*/
#ifdef _WIN32
- tmplen = _snprintf(substr, sublen, "%04" NPY_INT64_FMT, dts->year);
+ tmplen = _snprintf(substr, sublen, "%04" NPY_INT64_FMT, dts->year);
#else
- tmplen = snprintf(substr, sublen, "%04" NPY_INT64_FMT, dts->year);
-#endif // _WIN32
- /* If it ran out of space or there isn't space for the NULL terminator */
- if (tmplen < 0 || tmplen > sublen) {
- goto string_too_short;
- }
- substr += tmplen;
- sublen -= tmplen;
+ tmplen = snprintf(substr, sublen, "%04" NPY_INT64_FMT, dts->year);
+#endif // _WIN32
+ /* If it ran out of space or there isn't space for the NULL terminator */
+ if (tmplen < 0 || (size_t)tmplen > sublen) {
+ goto string_too_short;
+ }
+ substr += tmplen;
+ sublen -= tmplen;
- /* Stop if the unit is years */
- if (base == NPY_FR_Y) {
- if (sublen > 0) {
- *substr = '\0';
- }
- return 0;
+ /* Stop if the unit is years */
+ if (base == NPY_FR_Y) {
+ if (sublen > 0) {
+ *substr = '\0';
}
+ return 0;
+ }
- /* MONTH */
- if (sublen < 1) {
- goto string_too_short;
- }
- substr[0] = '-';
- if (sublen < 2) {
- goto string_too_short;
- }
- substr[1] = (char)((dts->month / 10) + '0');
- if (sublen < 3) {
- goto string_too_short;
- }
- substr[2] = (char)((dts->month % 10) + '0');
- substr += 3;
- sublen -= 3;
-
- /* Stop if the unit is months */
- if (base == NPY_FR_M) {
- if (sublen > 0) {
- *substr = '\0';
- }
- return 0;
- }
+ /* MONTH */
+ if (sublen < 1) {
+ goto string_too_short;
+ }
+ substr[0] = '-';
+ if (sublen < 2) {
+ goto string_too_short;
+ }
+ substr[1] = (char)((dts->month / 10) + '0');
+ if (sublen < 3) {
+ goto string_too_short;
+ }
+ substr[2] = (char)((dts->month % 10) + '0');
+ substr += 3;
+ sublen -= 3;
- /* DAY */
- if (sublen < 1) {
- goto string_too_short;
- }
- substr[0] = '-';
- if (sublen < 2) {
- goto string_too_short;
- }
- substr[1] = (char)((dts->day / 10) + '0');
- if (sublen < 3) {
- goto string_too_short;
- }
- substr[2] = (char)((dts->day % 10) + '0');
- substr += 3;
- sublen -= 3;
-
- /* Stop if the unit is days */
- if (base == NPY_FR_D) {
- if (sublen > 0) {
- *substr = '\0';
- }
- return 0;
+ /* Stop if the unit is months */
+ if (base == NPY_FR_M) {
+ if (sublen > 0) {
+ *substr = '\0';
}
+ return 0;
+ }
- /* HOUR */
- if (sublen < 1) {
- goto string_too_short;
- }
- substr[0] = 'T';
- if (sublen < 2) {
- goto string_too_short;
- }
- substr[1] = (char)((dts->hour / 10) + '0');
- if (sublen < 3) {
- goto string_too_short;
- }
- substr[2] = (char)((dts->hour % 10) + '0');
- substr += 3;
- sublen -= 3;
+ /* DAY */
+ if (sublen < 1) {
+ goto string_too_short;
+ }
+ substr[0] = '-';
+ if (sublen < 2) {
+ goto string_too_short;
+ }
+ substr[1] = (char)((dts->day / 10) + '0');
+ if (sublen < 3) {
+ goto string_too_short;
+ }
+ substr[2] = (char)((dts->day % 10) + '0');
+ substr += 3;
+ sublen -= 3;
- /* Stop if the unit is hours */
- if (base == NPY_FR_h) {
- goto add_time_zone;
+ /* Stop if the unit is days */
+ if (base == NPY_FR_D) {
+ if (sublen > 0) {
+ *substr = '\0';
}
+ return 0;
+ }
- /* MINUTE */
- if (sublen < 1) {
- goto string_too_short;
- }
- substr[0] = ':';
- if (sublen < 2) {
- goto string_too_short;
- }
- substr[1] = (char)((dts->min / 10) + '0');
- if (sublen < 3) {
- goto string_too_short;
- }
- substr[2] = (char)((dts->min % 10) + '0');
- substr += 3;
- sublen -= 3;
+ /* HOUR */
+ if (sublen < 1) {
+ goto string_too_short;
+ }
+ substr[0] = 'T';
+ if (sublen < 2) {
+ goto string_too_short;
+ }
+ substr[1] = (char)((dts->hour / 10) + '0');
+ if (sublen < 3) {
+ goto string_too_short;
+ }
+ substr[2] = (char)((dts->hour % 10) + '0');
+ substr += 3;
+ sublen -= 3;
- /* Stop if the unit is minutes */
- if (base == NPY_FR_m) {
- goto add_time_zone;
- }
+ /* Stop if the unit is hours */
+ if (base == NPY_FR_h) {
+ goto add_time_zone;
+ }
- /* SECOND */
- if (sublen < 1) {
- goto string_too_short;
- }
- substr[0] = ':';
- if (sublen < 2) {
- goto string_too_short;
- }
- substr[1] = (char)((dts->sec / 10) + '0');
- if (sublen < 3) {
- goto string_too_short;
- }
- substr[2] = (char)((dts->sec % 10) + '0');
- substr += 3;
- sublen -= 3;
+ /* MINUTE */
+ if (sublen < 1) {
+ goto string_too_short;
+ }
+ substr[0] = ':';
+ if (sublen < 2) {
+ goto string_too_short;
+ }
+ substr[1] = (char)((dts->min / 10) + '0');
+ if (sublen < 3) {
+ goto string_too_short;
+ }
+ substr[2] = (char)((dts->min % 10) + '0');
+ substr += 3;
+ sublen -= 3;
- /* Stop if the unit is seconds */
- if (base == NPY_FR_s) {
- goto add_time_zone;
- }
+ /* Stop if the unit is minutes */
+ if (base == NPY_FR_m) {
+ goto add_time_zone;
+ }
- /* MILLISECOND */
- if (sublen < 1) {
- goto string_too_short;
- }
- substr[0] = '.';
- if (sublen < 2) {
- goto string_too_short;
- }
- substr[1] = (char)((dts->us / 100000) % 10 + '0');
- if (sublen < 3) {
- goto string_too_short;
- }
- substr[2] = (char)((dts->us / 10000) % 10 + '0');
- if (sublen < 4) {
- goto string_too_short;
- }
- substr[3] = (char)((dts->us / 1000) % 10 + '0');
- substr += 4;
- sublen -= 4;
+ /* SECOND */
+ if (sublen < 1) {
+ goto string_too_short;
+ }
+ substr[0] = ':';
+ if (sublen < 2) {
+ goto string_too_short;
+ }
+ substr[1] = (char)((dts->sec / 10) + '0');
+ if (sublen < 3) {
+ goto string_too_short;
+ }
+ substr[2] = (char)((dts->sec % 10) + '0');
+ substr += 3;
+ sublen -= 3;
- /* Stop if the unit is milliseconds */
- if (base == NPY_FR_ms) {
- goto add_time_zone;
- }
+ /* Stop if the unit is seconds */
+ if (base == NPY_FR_s) {
+ goto add_time_zone;
+ }
- /* MICROSECOND */
- if (sublen < 1) {
- goto string_too_short;
- }
- substr[0] = (char)((dts->us / 100) % 10 + '0');
- if (sublen < 2) {
- goto string_too_short;
- }
- substr[1] = (char)((dts->us / 10) % 10 + '0');
- if (sublen < 3) {
- goto string_too_short;
- }
- substr[2] = (char)(dts->us % 10 + '0');
- substr += 3;
- sublen -= 3;
+ /* MILLISECOND */
+ if (sublen < 1) {
+ goto string_too_short;
+ }
+ substr[0] = '.';
+ if (sublen < 2) {
+ goto string_too_short;
+ }
+ substr[1] = (char)((dts->us / 100000) % 10 + '0');
+ if (sublen < 3) {
+ goto string_too_short;
+ }
+ substr[2] = (char)((dts->us / 10000) % 10 + '0');
+ if (sublen < 4) {
+ goto string_too_short;
+ }
+ substr[3] = (char)((dts->us / 1000) % 10 + '0');
+ substr += 4;
+ sublen -= 4;
- /* Stop if the unit is microseconds */
- if (base == NPY_FR_us) {
- goto add_time_zone;
- }
+ /* Stop if the unit is milliseconds */
+ if (base == NPY_FR_ms) {
+ goto add_time_zone;
+ }
- /* NANOSECOND */
- if (sublen < 1) {
- goto string_too_short;
- }
- substr[0] = (char)((dts->ps / 100000) % 10 + '0');
- if (sublen < 2) {
- goto string_too_short;
- }
- substr[1] = (char)((dts->ps / 10000) % 10 + '0');
- if (sublen < 3) {
- goto string_too_short;
- }
- substr[2] = (char)((dts->ps / 1000) % 10 + '0');
- substr += 3;
- sublen -= 3;
+ /* MICROSECOND */
+ if (sublen < 1) {
+ goto string_too_short;
+ }
+ substr[0] = (char)((dts->us / 100) % 10 + '0');
+ if (sublen < 2) {
+ goto string_too_short;
+ }
+ substr[1] = (char)((dts->us / 10) % 10 + '0');
+ if (sublen < 3) {
+ goto string_too_short;
+ }
+ substr[2] = (char)(dts->us % 10 + '0');
+ substr += 3;
+ sublen -= 3;
- /* Stop if the unit is nanoseconds */
- if (base == NPY_FR_ns) {
- goto add_time_zone;
- }
+ /* Stop if the unit is microseconds */
+ if (base == NPY_FR_us) {
+ goto add_time_zone;
+ }
- /* PICOSECOND */
- if (sublen < 1) {
- goto string_too_short;
- }
- substr[0] = (char)((dts->ps / 100) % 10 + '0');
- if (sublen < 2) {
- goto string_too_short;
- }
- substr[1] = (char)((dts->ps / 10) % 10 + '0');
- if (sublen < 3) {
- goto string_too_short;
- }
- substr[2] = (char)(dts->ps % 10 + '0');
- substr += 3;
- sublen -= 3;
+ /* NANOSECOND */
+ if (sublen < 1) {
+ goto string_too_short;
+ }
+ substr[0] = (char)((dts->ps / 100000) % 10 + '0');
+ if (sublen < 2) {
+ goto string_too_short;
+ }
+ substr[1] = (char)((dts->ps / 10000) % 10 + '0');
+ if (sublen < 3) {
+ goto string_too_short;
+ }
+ substr[2] = (char)((dts->ps / 1000) % 10 + '0');
+ substr += 3;
+ sublen -= 3;
- /* Stop if the unit is picoseconds */
- if (base == NPY_FR_ps) {
- goto add_time_zone;
- }
+ /* Stop if the unit is nanoseconds */
+ if (base == NPY_FR_ns) {
+ goto add_time_zone;
+ }
- /* FEMTOSECOND */
- if (sublen < 1) {
- goto string_too_short;
- }
- substr[0] = (char)((dts->as / 100000) % 10 + '0');
- if (sublen < 2) {
- goto string_too_short;
- }
- substr[1] = (char)((dts->as / 10000) % 10 + '0');
- if (sublen < 3) {
- goto string_too_short;
- }
- substr[2] = (char)((dts->as / 1000) % 10 + '0');
- substr += 3;
- sublen -= 3;
+ /* PICOSECOND */
+ if (sublen < 1) {
+ goto string_too_short;
+ }
+ substr[0] = (char)((dts->ps / 100) % 10 + '0');
+ if (sublen < 2) {
+ goto string_too_short;
+ }
+ substr[1] = (char)((dts->ps / 10) % 10 + '0');
+ if (sublen < 3) {
+ goto string_too_short;
+ }
+ substr[2] = (char)(dts->ps % 10 + '0');
+ substr += 3;
+ sublen -= 3;
- /* Stop if the unit is femtoseconds */
- if (base == NPY_FR_fs) {
- goto add_time_zone;
- }
+ /* Stop if the unit is picoseconds */
+ if (base == NPY_FR_ps) {
+ goto add_time_zone;
+ }
- /* ATTOSECOND */
- if (sublen < 1) {
- goto string_too_short;
- }
- substr[0] = (char)((dts->as / 100) % 10 + '0');
- if (sublen < 2) {
- goto string_too_short;
- }
- substr[1] = (char)((dts->as / 10) % 10 + '0');
- if (sublen < 3) {
- goto string_too_short;
- }
- substr[2] = (char)(dts->as % 10 + '0');
- substr += 3;
- sublen -= 3;
+ /* FEMTOSECOND */
+ if (sublen < 1) {
+ goto string_too_short;
+ }
+ substr[0] = (char)((dts->as / 100000) % 10 + '0');
+ if (sublen < 2) {
+ goto string_too_short;
+ }
+ substr[1] = (char)((dts->as / 10000) % 10 + '0');
+ if (sublen < 3) {
+ goto string_too_short;
+ }
+ substr[2] = (char)((dts->as / 1000) % 10 + '0');
+ substr += 3;
+ sublen -= 3;
+
+ /* Stop if the unit is femtoseconds */
+ if (base == NPY_FR_fs) {
+ goto add_time_zone;
+ }
+
+ /* ATTOSECOND */
+ if (sublen < 1) {
+ goto string_too_short;
+ }
+ substr[0] = (char)((dts->as / 100) % 10 + '0');
+ if (sublen < 2) {
+ goto string_too_short;
+ }
+ substr[1] = (char)((dts->as / 10) % 10 + '0');
+ if (sublen < 3) {
+ goto string_too_short;
+ }
+ substr[2] = (char)(dts->as % 10 + '0');
+ substr += 3;
+ sublen -= 3;
add_time_zone:
- /* UTC "Zulu" time */
- if (utc) {
- if (sublen < 1) {
- goto string_too_short;
- }
- substr[0] = 'Z';
- substr += 1;
- sublen -= 1;
- }
- /* Add a NULL terminator, and return */
- if (sublen > 0) {
- substr[0] = '\0';
+ /* UTC "Zulu" time */
+ if (utc) {
+ if (sublen < 1) {
+ goto string_too_short;
}
+ substr[0] = 'Z';
+ substr += 1;
+ sublen -= 1;
+ }
+ /* Add a NULL terminator, and return */
+ if (sublen > 0) {
+ substr[0] = '\0';
+ }
- return 0;
+ return 0;
string_too_short:
- PyErr_Format(PyExc_RuntimeError,
- "The string provided for NumPy ISO datetime formatting "
- "was too short, with length %d",
- outlen);
- return -1;
+ PyErr_Format(PyExc_RuntimeError,
+ "The string provided for NumPy ISO datetime formatting "
+ "was too short, with length %d",
+ outlen);
+ return -1;
}
-
-int make_iso_8601_timedelta(pandas_timedeltastruct *tds,
- char *outstr, size_t *outlen) {
+int make_iso_8601_timedelta(pandas_timedeltastruct *tds, char *outstr,
+ size_t *outlen) {
*outlen = 0;
- *outlen += snprintf(outstr, 60, // NOLINT
- "P%" NPY_INT64_FMT
- "DT%" NPY_INT32_FMT
- "H%" NPY_INT32_FMT
- "M%" NPY_INT32_FMT,
- tds->days, tds->hrs, tds->min, tds->sec);
+ *outlen += snprintf(outstr, 60, // NOLINT
+ "P%" NPY_INT64_FMT "DT%" NPY_INT32_FMT "H%" NPY_INT32_FMT
+ "M%" NPY_INT32_FMT,
+ tds->days, tds->hrs, tds->min, tds->sec);
outstr += *outlen;
if (tds->ns != 0) {
- *outlen += snprintf(outstr, 12, // NOLINT
- ".%03" NPY_INT32_FMT
- "%03" NPY_INT32_FMT
- "%03" NPY_INT32_FMT
- "S", tds->ms, tds->us, tds->ns);
+ *outlen += snprintf(outstr, 12, // NOLINT
+ ".%03" NPY_INT32_FMT "%03" NPY_INT32_FMT
+ "%03" NPY_INT32_FMT "S",
+ tds->ms, tds->us, tds->ns);
} else if (tds->us != 0) {
- *outlen += snprintf(outstr, 9, // NOLINT
- ".%03" NPY_INT32_FMT
- "%03" NPY_INT32_FMT
- "S", tds->ms, tds->us);
+ *outlen += snprintf(outstr, 9, // NOLINT
+ ".%03" NPY_INT32_FMT "%03" NPY_INT32_FMT "S", tds->ms,
+ tds->us);
} else if (tds->ms != 0) {
- *outlen += snprintf(outstr, 6, // NOLINT
+ *outlen += snprintf(outstr, 6, // NOLINT
".%03" NPY_INT32_FMT "S", tds->ms);
} else {
- *outlen += snprintf(outstr, 2, // NOLINT
+ *outlen += snprintf(outstr, 2, // NOLINT
"%s", "S");
}
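
Editorial aside on make_iso_8601_timedelta as reformatted above: it emits an ISO 8601 duration of the form P<days>DT<hours>H<minutes>M<seconds>[.<fraction>]S, and the fractional part is only written down to the finest nonzero sub-second unit (nanoseconds, then microseconds, then milliseconds). A self-contained sketch of that layout; the struct and helper names here are illustrative assumptions, not the pandas definitions:

#include <stdio.h>

/* Illustrative stand-in for the timedelta fields used above. */
typedef struct {
  long long days;
  int hrs, min, sec, ms, us, ns;
} demo_timedelta;

/* Render the duration; the demo assumes out is large enough. */
static void demo_iso_duration(const demo_timedelta *td, char *out, size_t n) {
  int len = snprintf(out, n, "P%lldDT%dH%dM%d", td->days, td->hrs, td->min,
                     td->sec);
  if (td->ns != 0) {
    snprintf(out + len, n - (size_t)len, ".%03d%03d%03dS", td->ms, td->us,
             td->ns);
  } else if (td->us != 0) {
    snprintf(out + len, n - (size_t)len, ".%03d%03dS", td->ms, td->us);
  } else if (td->ms != 0) {
    snprintf(out + len, n - (size_t)len, ".%03dS", td->ms);
  } else {
    snprintf(out + len, n - (size_t)len, "S");
  }
}

int main(void) {
  demo_timedelta td = {1, 2, 30, 45, 123, 456, 789};
  char buf[64];
  demo_iso_duration(&td, buf, sizeof(buf));
  printf("%s\n", buf); /* P1DT2H30M45.123456789S */
  return 0;
}
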
diff --git a/pandas/_libs/src/vendored/ujson/lib/ultrajsondec.c b/pandas/_libs/src/vendored/ujson/lib/ultrajsondec.c
index 9ec12cb242728..bf389b4dce1d0 100644
--- a/pandas/_libs/src/vendored/ujson/lib/ultrajsondec.c
+++ b/pandas/_libs/src/vendored/ujson/lib/ultrajsondec.c
@@ -38,7 +38,9 @@ Numeric decoder derived from TCL library
* Copyright (c) 1994 Sun Microsystems, Inc.
*/
-#include
+// Licence at LICENSES/ULTRAJSON_LICENSE
+
+#include "pandas/vendored/ujson/lib/ultrajson.h"
#include
#include
#include
@@ -46,7 +48,6 @@ Numeric decoder derived from TCL library
#include
#include
#include
-#include "pandas/vendored/ujson/lib/ultrajson.h"
#ifndef TRUE
#define TRUE 1
@@ -57,15 +58,15 @@ Numeric decoder derived from TCL library
#endif
struct DecoderState {
- char *start;
- char *end;
- wchar_t *escStart;
- wchar_t *escEnd;
- int escHeap;
- int lastType;
- JSUINT32 objDepth;
- void *prv;
- JSONObjectDecoder *dec;
+ char *start;
+ char *end;
+ wchar_t *escStart;
+ wchar_t *escEnd;
+ int escHeap;
+ int lastType;
+ JSUINT32 objDepth;
+ void *prv;
+ JSONObjectDecoder *dec;
};
JSOBJ FASTCALL_MSVC decode_any(struct DecoderState *ds);
@@ -73,349 +74,372 @@ typedef JSOBJ (*PFN_DECODER)(struct DecoderState *ds);
static JSOBJ SetError(struct DecoderState *ds, int offset,
const char *message) {
- ds->dec->errorOffset = ds->start + offset;
- ds->dec->errorStr = (char *)message;
- return NULL;
+ ds->dec->errorOffset = ds->start + offset;
+ ds->dec->errorStr = (char *)message;
+ return NULL;
}
double createDouble(double intNeg, double intValue, double frcValue,
int frcDecimalCount) {
- static const double g_pow10[] = {1.0,
- 0.1,
- 0.01,
- 0.001,
- 0.0001,
- 0.00001,
- 0.000001,
- 0.0000001,
- 0.00000001,
- 0.000000001,
- 0.0000000001,
- 0.00000000001,
- 0.000000000001,
- 0.0000000000001,
- 0.00000000000001,
- 0.000000000000001};
- return (intValue + (frcValue * g_pow10[frcDecimalCount])) * intNeg;
+ static const double g_pow10[] = {1.0,
+ 0.1,
+ 0.01,
+ 0.001,
+ 0.0001,
+ 0.00001,
+ 0.000001,
+ 0.0000001,
+ 0.00000001,
+ 0.000000001,
+ 0.0000000001,
+ 0.00000000001,
+ 0.000000000001,
+ 0.0000000000001,
+ 0.00000000000001,
+ 0.000000000000001};
+ return (intValue + (frcValue * g_pow10[frcDecimalCount])) * intNeg;
}
JSOBJ FASTCALL_MSVC decodePreciseFloat(struct DecoderState *ds) {
- char *end;
- double value;
- errno = 0;
+ char *end;
+ double value;
+ errno = 0;
- value = strtod(ds->start, &end);
+ value = strtod(ds->start, &end);
- if (errno == ERANGE) {
- return SetError(ds, -1, "Range error when decoding numeric as double");
- }
+ if (errno == ERANGE) {
+ return SetError(ds, -1, "Range error when decoding numeric as double");
+ }
- ds->start = end;
- return ds->dec->newDouble(ds->prv, value);
+ ds->start = end;
+ return ds->dec->newDouble(ds->prv, value);
}
JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) {
- int intNeg = 1;
- JSUINT64 intValue;
- JSUINT64 prevIntValue;
- int chr;
- int decimalCount = 0;
- double frcValue = 0.0;
- double expNeg;
- double expValue;
- char *offset = ds->start;
-
- JSUINT64 overflowLimit = LLONG_MAX;
-
+ int intNeg = 1;
+ JSUINT64 intValue;
+ JSUINT64 prevIntValue;
+ int chr;
+ int decimalCount = 0;
+ double frcValue = 0.0;
+ double expNeg;
+ double expValue;
+ char *offset = ds->start;
+
+ JSUINT64 overflowLimit = LLONG_MAX;
+
+ if (*(offset) == 'I') {
+ goto DECODE_INF;
+ } else if (*(offset) == 'N') {
+ goto DECODE_NAN;
+ } else if (*(offset) == '-') {
+ offset++;
+ intNeg = -1;
+ overflowLimit = LLONG_MIN;
if (*(offset) == 'I') {
goto DECODE_INF;
- } else if (*(offset) == 'N') {
- goto DECODE_NAN;
- } else if (*(offset) == '-') {
- offset++;
- intNeg = -1;
- overflowLimit = LLONG_MIN;
- if (*(offset) == 'I') {
- goto DECODE_INF;
- }
+ }
+ }
+
+ // Scan integer part
+ intValue = 0;
+
+ while (1) {
+ chr = (int)(unsigned char)*(offset);
+
+ switch (chr) {
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9': {
+ // PERF: Don't do 64-bit arithmetic here unless we have to
+ prevIntValue = intValue;
+ intValue = intValue * 10ULL + (JSLONG)(chr - 48);
+
+ if (intNeg == 1 && prevIntValue > intValue) {
+ return SetError(ds, -1, "Value is too big!");
+ } else if (intNeg == -1 && intValue > overflowLimit) {
+ return SetError(ds, -1,
+ overflowLimit == LLONG_MAX ? "Value is too big!"
+ : "Value is too small");
+ }
+
+ offset++;
+ break;
+ }
+ case '.': {
+ offset++;
+ goto DECODE_FRACTION;
+ break;
+ }
+ case 'e':
+ case 'E': {
+ offset++;
+ goto DECODE_EXPONENT;
+ break;
}
- // Scan integer part
- intValue = 0;
-
- while (1) {
- chr = (int)(unsigned char)*(offset);
-
- switch (chr) {
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- case '8':
- case '9': {
- // PERF: Don't do 64-bit arithmetic here unless we have to
- prevIntValue = intValue;
- intValue = intValue * 10ULL + (JSLONG) (chr - 48);
-
- if (intNeg == 1 && prevIntValue > intValue) {
- return SetError(ds, -1, "Value is too big!");
- } else if (intNeg == -1 && intValue > overflowLimit) {
- return SetError(ds, -1, overflowLimit == LLONG_MAX ?
- "Value is too big!" : "Value is too small");
- }
-
- offset++;
- break;
- }
- case '.': {
- offset++;
- goto DECODE_FRACTION;
- break;
- }
- case 'e':
- case 'E': {
- offset++;
- goto DECODE_EXPONENT;
- break;
- }
-
- default: {
- goto BREAK_INT_LOOP;
- break;
- }
- }
+ default: {
+ goto BREAK_INT_LOOP;
+ break;
}
+ }
+ }
BREAK_INT_LOOP:
- ds->lastType = JT_INT;
- ds->start = offset;
+ ds->lastType = JT_INT;
+ ds->start = offset;
- if (intNeg == 1 && (intValue & 0x8000000000000000ULL) != 0)
- return ds->dec->newUnsignedLong(ds->prv, intValue);
- else if ((intValue >> 31))
- return ds->dec->newLong(ds->prv, (JSINT64)(intValue * (JSINT64)intNeg));
- else
- return ds->dec->newInt(ds->prv, (JSINT32)(intValue * intNeg));
+ if (intNeg == 1 && (intValue & 0x8000000000000000ULL) != 0)
+ return ds->dec->newUnsignedLong(ds->prv, intValue);
+ else if ((intValue >> 31))
+ return ds->dec->newLong(ds->prv, (JSINT64)(intValue * (JSINT64)intNeg));
+ else
+ return ds->dec->newInt(ds->prv, (JSINT32)(intValue * intNeg));
DECODE_FRACTION:
- if (ds->dec->preciseFloat) {
- return decodePreciseFloat(ds);
+ if (ds->dec->preciseFloat) {
+ return decodePreciseFloat(ds);
+ }
+
+ // Scan fraction part
+ frcValue = 0.0;
+ for (;;) {
+ chr = (int)(unsigned char)*(offset);
+
+ switch (chr) {
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9': {
+ if (decimalCount < JSON_DOUBLE_MAX_DECIMALS) {
+ frcValue = frcValue * 10.0 + (double)(chr - 48);
+ decimalCount++;
+ }
+ offset++;
+ break;
}
-
- // Scan fraction part
- frcValue = 0.0;
- for (;;) {
- chr = (int)(unsigned char)*(offset);
-
- switch (chr) {
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- case '8':
- case '9': {
- if (decimalCount < JSON_DOUBLE_MAX_DECIMALS) {
- frcValue = frcValue * 10.0 + (double)(chr - 48);
- decimalCount++;
- }
- offset++;
- break;
- }
- case 'e':
- case 'E': {
- offset++;
- goto DECODE_EXPONENT;
- break;
- }
- default: { goto BREAK_FRC_LOOP; }
- }
+ case 'e':
+ case 'E': {
+ offset++;
+ goto DECODE_EXPONENT;
+ break;
}
+ default: {
+ goto BREAK_FRC_LOOP;
+ }
+ }
+ }
BREAK_FRC_LOOP:
- // FIXME: Check for arithmetic overflow here
- ds->lastType = JT_DOUBLE;
- ds->start = offset;
- return ds->dec->newDouble(
- ds->prv,
- createDouble((double)intNeg, (double)intValue, frcValue, decimalCount));
+ // FIXME: Check for arithmetic overflow here
+ ds->lastType = JT_DOUBLE;
+ ds->start = offset;
+ return ds->dec->newDouble(
+ ds->prv,
+ createDouble((double)intNeg, (double)intValue, frcValue, decimalCount));
DECODE_EXPONENT:
- if (ds->dec->preciseFloat) {
- return decodePreciseFloat(ds);
- }
+ if (ds->dec->preciseFloat) {
+ return decodePreciseFloat(ds);
+ }
- expNeg = 1.0;
+ expNeg = 1.0;
- if (*(offset) == '-') {
- expNeg = -1.0;
- offset++;
- } else if (*(offset) == '+') {
- expNeg = +1.0;
- offset++;
+ if (*(offset) == '-') {
+ expNeg = -1.0;
+ offset++;
+ } else if (*(offset) == '+') {
+ expNeg = +1.0;
+ offset++;
+ }
+
+ expValue = 0.0;
+
+ for (;;) {
+ chr = (int)(unsigned char)*(offset);
+
+ switch (chr) {
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9': {
+ expValue = expValue * 10.0 + (double)(chr - 48);
+ offset++;
+ break;
}
-
- expValue = 0.0;
-
- for (;;) {
- chr = (int)(unsigned char)*(offset);
-
- switch (chr) {
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- case '8':
- case '9': {
- expValue = expValue * 10.0 + (double)(chr - 48);
- offset++;
- break;
- }
- default: { goto BREAK_EXP_LOOP; }
- }
+ default: {
+ goto BREAK_EXP_LOOP;
}
+ }
+ }
DECODE_NAN:
- offset++;
- if (*(offset++) != 'a') goto SET_NAN_ERROR;
- if (*(offset++) != 'N') goto SET_NAN_ERROR;
+ offset++;
+ if (*(offset++) != 'a')
+ goto SET_NAN_ERROR;
+ if (*(offset++) != 'N')
+ goto SET_NAN_ERROR;
- ds->lastType = JT_NULL;
- ds->start = offset;
- return ds->dec->newNull(ds->prv);
+ ds->lastType = JT_NULL;
+ ds->start = offset;
+ return ds->dec->newNull(ds->prv);
SET_NAN_ERROR:
- return SetError(ds, -1, "Unexpected character found when decoding 'NaN'");
+ return SetError(ds, -1, "Unexpected character found when decoding 'NaN'");
DECODE_INF:
- offset++;
- if (*(offset++) != 'n') goto SET_INF_ERROR;
- if (*(offset++) != 'f') goto SET_INF_ERROR;
- if (*(offset++) != 'i') goto SET_INF_ERROR;
- if (*(offset++) != 'n') goto SET_INF_ERROR;
- if (*(offset++) != 'i') goto SET_INF_ERROR;
- if (*(offset++) != 't') goto SET_INF_ERROR;
- if (*(offset++) != 'y') goto SET_INF_ERROR;
-
- ds->start = offset;
-
- if (intNeg == 1) {
- ds->lastType = JT_POS_INF;
- return ds->dec->newPosInf(ds->prv);
- } else {
- ds->lastType = JT_NEG_INF;
- return ds->dec->newNegInf(ds->prv);
- }
+ offset++;
+ if (*(offset++) != 'n')
+ goto SET_INF_ERROR;
+ if (*(offset++) != 'f')
+ goto SET_INF_ERROR;
+ if (*(offset++) != 'i')
+ goto SET_INF_ERROR;
+ if (*(offset++) != 'n')
+ goto SET_INF_ERROR;
+ if (*(offset++) != 'i')
+ goto SET_INF_ERROR;
+ if (*(offset++) != 't')
+ goto SET_INF_ERROR;
+ if (*(offset++) != 'y')
+ goto SET_INF_ERROR;
+
+ ds->start = offset;
+
+ if (intNeg == 1) {
+ ds->lastType = JT_POS_INF;
+ return ds->dec->newPosInf(ds->prv);
+ } else {
+ ds->lastType = JT_NEG_INF;
+ return ds->dec->newNegInf(ds->prv);
+ }
SET_INF_ERROR:
- if (intNeg == 1) {
- const char *msg = "Unexpected character found when decoding 'Infinity'";
- return SetError(ds, -1, msg);
- } else {
- const char *msg = "Unexpected character found when decoding '-Infinity'";
- return SetError(ds, -1, msg);
- }
-
+ if (intNeg == 1) {
+ const char *msg = "Unexpected character found when decoding 'Infinity'";
+ return SetError(ds, -1, msg);
+ } else {
+ const char *msg = "Unexpected character found when decoding '-Infinity'";
+ return SetError(ds, -1, msg);
+ }
BREAK_EXP_LOOP:
- // FIXME: Check for arithmetic overflow here
- ds->lastType = JT_DOUBLE;
- ds->start = offset;
- return ds->dec->newDouble(
- ds->prv,
- createDouble((double)intNeg, (double)intValue, frcValue, decimalCount) *
- pow(10.0, expValue * expNeg));
+ // FIXME: Check for arithmetic overflow here
+ ds->lastType = JT_DOUBLE;
+ ds->start = offset;
+ return ds->dec->newDouble(
+ ds->prv,
+ createDouble((double)intNeg, (double)intValue, frcValue, decimalCount) *
+ pow(10.0, expValue * expNeg));
}
JSOBJ FASTCALL_MSVC decode_true(struct DecoderState *ds) {
- char *offset = ds->start;
- offset++;
+ char *offset = ds->start;
+ offset++;
- if (*(offset++) != 'r') goto SETERROR;
- if (*(offset++) != 'u') goto SETERROR;
- if (*(offset++) != 'e') goto SETERROR;
+ if (*(offset++) != 'r')
+ goto SETERROR;
+ if (*(offset++) != 'u')
+ goto SETERROR;
+ if (*(offset++) != 'e')
+ goto SETERROR;
- ds->lastType = JT_TRUE;
- ds->start = offset;
- return ds->dec->newTrue(ds->prv);
+ ds->lastType = JT_TRUE;
+ ds->start = offset;
+ return ds->dec->newTrue(ds->prv);
SETERROR:
- return SetError(ds, -1, "Unexpected character found when decoding 'true'");
+ return SetError(ds, -1, "Unexpected character found when decoding 'true'");
}
JSOBJ FASTCALL_MSVC decode_false(struct DecoderState *ds) {
- char *offset = ds->start;
- offset++;
-
- if (*(offset++) != 'a') goto SETERROR;
- if (*(offset++) != 'l') goto SETERROR;
- if (*(offset++) != 's') goto SETERROR;
- if (*(offset++) != 'e') goto SETERROR;
-
- ds->lastType = JT_FALSE;
- ds->start = offset;
- return ds->dec->newFalse(ds->prv);
+ char *offset = ds->start;
+ offset++;
+
+ if (*(offset++) != 'a')
+ goto SETERROR;
+ if (*(offset++) != 'l')
+ goto SETERROR;
+ if (*(offset++) != 's')
+ goto SETERROR;
+ if (*(offset++) != 'e')
+ goto SETERROR;
+
+ ds->lastType = JT_FALSE;
+ ds->start = offset;
+ return ds->dec->newFalse(ds->prv);
SETERROR:
- return SetError(ds, -1, "Unexpected character found when decoding 'false'");
+ return SetError(ds, -1, "Unexpected character found when decoding 'false'");
}
JSOBJ FASTCALL_MSVC decode_null(struct DecoderState *ds) {
- char *offset = ds->start;
- offset++;
+ char *offset = ds->start;
+ offset++;
- if (*(offset++) != 'u') goto SETERROR;
- if (*(offset++) != 'l') goto SETERROR;
- if (*(offset++) != 'l') goto SETERROR;
+ if (*(offset++) != 'u')
+ goto SETERROR;
+ if (*(offset++) != 'l')
+ goto SETERROR;
+ if (*(offset++) != 'l')
+ goto SETERROR;
- ds->lastType = JT_NULL;
- ds->start = offset;
- return ds->dec->newNull(ds->prv);
+ ds->lastType = JT_NULL;
+ ds->start = offset;
+ return ds->dec->newNull(ds->prv);
SETERROR:
- return SetError(ds, -1, "Unexpected character found when decoding 'null'");
+ return SetError(ds, -1, "Unexpected character found when decoding 'null'");
}
void FASTCALL_MSVC SkipWhitespace(struct DecoderState *ds) {
- char *offset;
-
- for (offset = ds->start; (ds->end - offset) > 0; offset++) {
- switch (*offset) {
- case ' ':
- case '\t':
- case '\r':
- case '\n':
- break;
-
- default:
- ds->start = offset;
- return;
- }
+ char *offset;
+
+ for (offset = ds->start; (ds->end - offset) > 0; offset++) {
+ switch (*offset) {
+ case ' ':
+ case '\t':
+ case '\r':
+ case '\n':
+ break;
+
+ default:
+ ds->start = offset;
+ return;
}
+ }
- if (offset == ds->end) {
- ds->start = ds->end;
- }
+ if (offset == ds->end) {
+ ds->start = ds->end;
+ }
}
enum DECODESTRINGSTATE {
- DS_ISNULL = 0x32,
- DS_ISQUOTE,
- DS_ISESCAPE,
- DS_UTFLENERROR,
+ DS_ISNULL = 0x32,
+ DS_ISQUOTE,
+ DS_ISESCAPE,
+ DS_UTFLENERROR,
};
static const JSUINT8 g_decoderLookup[256] = {
@@ -678,531 +702,520 @@ static const JSUINT8 g_decoderLookup[256] = {
};
JSOBJ FASTCALL_MSVC decode_string(struct DecoderState *ds) {
- JSUTF16 sur[2] = {0};
- int iSur = 0;
- int index;
- wchar_t *escOffset;
- wchar_t *escStart;
- size_t escLen = (ds->escEnd - ds->escStart);
- JSUINT8 *inputOffset;
- JSUINT8 oct;
- JSUTF32 ucs;
- ds->lastType = JT_INVALID;
- ds->start++;
-
- if ((size_t)(ds->end - ds->start) > escLen) {
- size_t newSize = (ds->end - ds->start);
-
- if (ds->escHeap) {
- if (newSize > (SIZE_MAX / sizeof(wchar_t))) {
- return SetError(ds, -1, "Could not reserve memory block");
- }
- escStart = (wchar_t *)ds->dec->realloc(ds->escStart,
- newSize * sizeof(wchar_t));
- if (!escStart) {
- ds->dec->free(ds->escStart);
- return SetError(ds, -1, "Could not reserve memory block");
- }
- ds->escStart = escStart;
- } else {
- wchar_t *oldStart = ds->escStart;
- if (newSize > (SIZE_MAX / sizeof(wchar_t))) {
- return SetError(ds, -1, "Could not reserve memory block");
- }
- ds->escStart =
- (wchar_t *)ds->dec->malloc(newSize * sizeof(wchar_t));
- if (!ds->escStart) {
- return SetError(ds, -1, "Could not reserve memory block");
- }
- ds->escHeap = 1;
- memcpy(ds->escStart, oldStart, escLen * sizeof(wchar_t));
- }
-
- ds->escEnd = ds->escStart + newSize;
+ JSUTF16 sur[2] = {0};
+ int iSur = 0;
+ int index;
+ wchar_t *escOffset;
+ wchar_t *escStart;
+ size_t escLen = (ds->escEnd - ds->escStart);
+ JSUINT8 *inputOffset;
+ JSUINT8 oct;
+ JSUTF32 ucs;
+ ds->lastType = JT_INVALID;
+ ds->start++;
+
+ if ((size_t)(ds->end - ds->start) > escLen) {
+ size_t newSize = (ds->end - ds->start);
+
+ if (ds->escHeap) {
+ if (newSize > (SIZE_MAX / sizeof(wchar_t))) {
+ return SetError(ds, -1, "Could not reserve memory block");
+ }
+ escStart =
+ (wchar_t *)ds->dec->realloc(ds->escStart, newSize * sizeof(wchar_t));
+ if (!escStart) {
+ ds->dec->free(ds->escStart);
+ return SetError(ds, -1, "Could not reserve memory block");
+ }
+ ds->escStart = escStart;
+ } else {
+ wchar_t *oldStart = ds->escStart;
+ if (newSize > (SIZE_MAX / sizeof(wchar_t))) {
+ return SetError(ds, -1, "Could not reserve memory block");
+ }
+ ds->escStart = (wchar_t *)ds->dec->malloc(newSize * sizeof(wchar_t));
+ if (!ds->escStart) {
+ return SetError(ds, -1, "Could not reserve memory block");
+ }
+ ds->escHeap = 1;
+ memcpy(ds->escStart, oldStart, escLen * sizeof(wchar_t));
}
- escOffset = ds->escStart;
- inputOffset = (JSUINT8 *)ds->start;
-
- for (;;) {
- switch (g_decoderLookup[(JSUINT8)(*inputOffset)]) {
- case DS_ISNULL: {
- return SetError(ds, -1,
- "Unmatched ''\"' when when decoding 'string'");
- }
- case DS_ISQUOTE: {
- ds->lastType = JT_UTF8;
- inputOffset++;
- ds->start += ((char *)inputOffset - (ds->start));
- return ds->dec->newString(ds->prv, ds->escStart, escOffset);
- }
- case DS_UTFLENERROR: {
- return SetError(
- ds, -1,
- "Invalid UTF-8 sequence length when decoding 'string'");
- }
- case DS_ISESCAPE:
- inputOffset++;
- switch (*inputOffset) {
- case '\\':
- *(escOffset++) = L'\\';
- inputOffset++;
- continue;
- case '\"':
- *(escOffset++) = L'\"';
- inputOffset++;
- continue;
- case '/':
- *(escOffset++) = L'/';
- inputOffset++;
- continue;
- case 'b':
- *(escOffset++) = L'\b';
- inputOffset++;
- continue;
- case 'f':
- *(escOffset++) = L'\f';
- inputOffset++;
- continue;
- case 'n':
- *(escOffset++) = L'\n';
- inputOffset++;
- continue;
- case 'r':
- *(escOffset++) = L'\r';
- inputOffset++;
- continue;
- case 't':
- *(escOffset++) = L'\t';
- inputOffset++;
- continue;
-
- case 'u': {
- int index;
- inputOffset++;
-
- for (index = 0; index < 4; index++) {
- switch (*inputOffset) {
- case '\0':
- return SetError(ds, -1,
- "Unterminated unicode "
- "escape sequence when "
- "decoding 'string'");
- default:
- return SetError(ds, -1,
- "Unexpected character in "
- "unicode escape sequence "
- "when decoding 'string'");
-
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- case '8':
- case '9':
- sur[iSur] = (sur[iSur] << 4) +
- (JSUTF16)(*inputOffset - '0');
- break;
-
- case 'a':
- case 'b':
- case 'c':
- case 'd':
- case 'e':
- case 'f':
- sur[iSur] = (sur[iSur] << 4) + 10 +
- (JSUTF16)(*inputOffset - 'a');
- break;
-
- case 'A':
- case 'B':
- case 'C':
- case 'D':
- case 'E':
- case 'F':
- sur[iSur] = (sur[iSur] << 4) + 10 +
- (JSUTF16)(*inputOffset - 'A');
- break;
- }
-
- inputOffset++;
- }
-
- if (iSur == 0) {
- if ((sur[iSur] & 0xfc00) == 0xd800) {
- // First of a surrogate pair, continue parsing
- iSur++;
- break;
- }
- (*escOffset++) = (wchar_t)sur[iSur];
- iSur = 0;
- } else {
- // Decode pair
- if ((sur[1] & 0xfc00) != 0xdc00) {
- return SetError(ds, -1,
- "Unpaired high surrogate when "
- "decoding 'string'");
- }
-#if WCHAR_MAX == 0xffff
- (*escOffset++) = (wchar_t)sur[0];
- (*escOffset++) = (wchar_t)sur[1];
-#else
- (*escOffset++) =
- (wchar_t)0x10000 +
- (((sur[0] - 0xd800) << 10) | (sur[1] - 0xdc00));
-#endif
- iSur = 0;
- }
- break;
- }
-
- case '\0':
- return SetError(ds, -1,
- "Unterminated escape sequence when "
- "decoding 'string'");
- default:
- return SetError(ds, -1,
- "Unrecognized escape sequence when "
- "decoding 'string'");
- }
- break;
-
- case 1: {
- *(escOffset++) = (wchar_t)(*inputOffset++);
- break;
- }
-
- case 2: {
- ucs = (*inputOffset++) & 0x1f;
- ucs <<= 6;
- if (((*inputOffset) & 0x80) != 0x80) {
- return SetError(ds, -1,
- "Invalid octet in UTF-8 sequence when "
- "decoding 'string'");
- }
- ucs |= (*inputOffset++) & 0x3f;
- if (ucs < 0x80)
- return SetError(ds, -1,
- "Overlong 2 byte UTF-8 sequence detected "
- "when decoding 'string'");
- *(escOffset++) = (wchar_t)ucs;
- break;
- }
-
- case 3: {
- JSUTF32 ucs = 0;
- ucs |= (*inputOffset++) & 0x0f;
-
- for (index = 0; index < 2; index++) {
- ucs <<= 6;
- oct = (*inputOffset++);
-
- if ((oct & 0x80) != 0x80) {
- return SetError(ds, -1,
- "Invalid octet in UTF-8 sequence when "
- "decoding 'string'");
- }
-
- ucs |= oct & 0x3f;
- }
-
- if (ucs < 0x800)
- return SetError(ds, -1,
- "Overlong 3 byte UTF-8 sequence detected "
- "when encoding string");
- *(escOffset++) = (wchar_t)ucs;
- break;
- }
-
- case 4: {
- JSUTF32 ucs = 0;
- ucs |= (*inputOffset++) & 0x07;
-
- for (index = 0; index < 3; index++) {
- ucs <<= 6;
- oct = (*inputOffset++);
-
- if ((oct & 0x80) != 0x80) {
- return SetError(ds, -1,
- "Invalid octet in UTF-8 sequence when "
- "decoding 'string'");
- }
-
- ucs |= oct & 0x3f;
- }
-
- if (ucs < 0x10000)
- return SetError(ds, -1,
- "Overlong 4 byte UTF-8 sequence detected "
- "when decoding 'string'");
+ ds->escEnd = ds->escStart + newSize;
+ }
+
+ escOffset = ds->escStart;
+ inputOffset = (JSUINT8 *)ds->start;
+
+ for (;;) {
+ switch (g_decoderLookup[(JSUINT8)(*inputOffset)]) {
+ case DS_ISNULL: {
+ return SetError(ds, -1, "Unmatched ''\"' when when decoding 'string'");
+ }
+ case DS_ISQUOTE: {
+ ds->lastType = JT_UTF8;
+ inputOffset++;
+ ds->start += ((char *)inputOffset - (ds->start));
+ return ds->dec->newString(ds->prv, ds->escStart, escOffset);
+ }
+ case DS_UTFLENERROR: {
+ return SetError(ds, -1,
+ "Invalid UTF-8 sequence length when decoding 'string'");
+ }
+ case DS_ISESCAPE:
+ inputOffset++;
+ switch (*inputOffset) {
+ case '\\':
+ *(escOffset++) = L'\\';
+ inputOffset++;
+ continue;
+ case '\"':
+ *(escOffset++) = L'\"';
+ inputOffset++;
+ continue;
+ case '/':
+ *(escOffset++) = L'/';
+ inputOffset++;
+ continue;
+ case 'b':
+ *(escOffset++) = L'\b';
+ inputOffset++;
+ continue;
+ case 'f':
+ *(escOffset++) = L'\f';
+ inputOffset++;
+ continue;
+ case 'n':
+ *(escOffset++) = L'\n';
+ inputOffset++;
+ continue;
+ case 'r':
+ *(escOffset++) = L'\r';
+ inputOffset++;
+ continue;
+ case 't':
+ *(escOffset++) = L'\t';
+ inputOffset++;
+ continue;
+
+ case 'u': {
+ int index;
+ inputOffset++;
+
+ for (index = 0; index < 4; index++) {
+ switch (*inputOffset) {
+ case '\0':
+ return SetError(ds, -1,
+ "Unterminated unicode "
+ "escape sequence when "
+ "decoding 'string'");
+ default:
+ return SetError(ds, -1,
+ "Unexpected character in "
+ "unicode escape sequence "
+ "when decoding 'string'");
+
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ sur[iSur] = (sur[iSur] << 4) + (JSUTF16)(*inputOffset - '0');
+ break;
+
+ case 'a':
+ case 'b':
+ case 'c':
+ case 'd':
+ case 'e':
+ case 'f':
+ sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16)(*inputOffset - 'a');
+ break;
+
+ case 'A':
+ case 'B':
+ case 'C':
+ case 'D':
+ case 'E':
+ case 'F':
+ sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16)(*inputOffset - 'A');
+ break;
+ }
+
+ inputOffset++;
+ }
+ if (iSur == 0) {
+ if ((sur[iSur] & 0xfc00) == 0xd800) {
+ // First of a surrogate pair, continue parsing
+ iSur++;
+ break;
+ }
+ (*escOffset++) = (wchar_t)sur[iSur];
+ iSur = 0;
+ } else {
+ // Decode pair
+ if ((sur[1] & 0xfc00) != 0xdc00) {
+ return SetError(ds, -1,
+ "Unpaired high surrogate when "
+ "decoding 'string'");
+ }
#if WCHAR_MAX == 0xffff
- if (ucs >= 0x10000) {
- ucs -= 0x10000;
- *(escOffset++) = (wchar_t)(ucs >> 10) + 0xd800;
- *(escOffset++) = (wchar_t)(ucs & 0x3ff) + 0xdc00;
- } else {
- *(escOffset++) = (wchar_t)ucs;
- }
+ (*escOffset++) = (wchar_t)sur[0];
+ (*escOffset++) = (wchar_t)sur[1];
#else
- *(escOffset++) = (wchar_t)ucs;
+ (*escOffset++) = (wchar_t)0x10000 +
+ (((sur[0] - 0xd800) << 10) | (sur[1] - 0xdc00));
#endif
- break;
- }
+ iSur = 0;
}
+ break;
+ }
+
+ case '\0':
+ return SetError(ds, -1,
+ "Unterminated escape sequence when "
+ "decoding 'string'");
+ default:
+ return SetError(ds, -1,
+ "Unrecognized escape sequence when "
+ "decoding 'string'");
+ }
+ break;
+
+ case 1: {
+ *(escOffset++) = (wchar_t)(*inputOffset++);
+ break;
}
-}
-JSOBJ FASTCALL_MSVC decode_array(struct DecoderState *ds) {
- JSOBJ itemValue;
- JSOBJ newObj;
- int len;
- ds->objDepth++;
- if (ds->objDepth > JSON_MAX_OBJECT_DEPTH) {
- return SetError(ds, -1, "Reached object decoding depth limit");
+ case 2: {
+ ucs = (*inputOffset++) & 0x1f;
+ ucs <<= 6;
+ if (((*inputOffset) & 0x80) != 0x80) {
+ return SetError(ds, -1,
+ "Invalid octet in UTF-8 sequence when "
+ "decoding 'string'");
+ }
+ ucs |= (*inputOffset++) & 0x3f;
+ if (ucs < 0x80)
+ return SetError(ds, -1,
+ "Overlong 2 byte UTF-8 sequence detected "
+ "when decoding 'string'");
+ *(escOffset++) = (wchar_t)ucs;
+ break;
}
- newObj = ds->dec->newArray(ds->prv, ds->dec);
- len = 0;
+ case 3: {
+ JSUTF32 ucs = 0;
+ ucs |= (*inputOffset++) & 0x0f;
- ds->lastType = JT_INVALID;
- ds->start++;
-
- for (;;) {
- SkipWhitespace(ds);
-
- if ((*ds->start) == ']') {
- ds->objDepth--;
- if (len == 0) {
- ds->start++;
- return ds->dec->endArray(ds->prv, newObj);
- }
-
- ds->dec->releaseObject(ds->prv, newObj, ds->dec);
- return SetError(
- ds, -1,
- "Unexpected character found when decoding array value (1)");
+ for (index = 0; index < 2; index++) {
+ ucs <<= 6;
+ oct = (*inputOffset++);
+
+ if ((oct & 0x80) != 0x80) {
+ return SetError(ds, -1,
+ "Invalid octet in UTF-8 sequence when "
+ "decoding 'string'");
}
- itemValue = decode_any(ds);
+ ucs |= oct & 0x3f;
+ }
- if (itemValue == NULL) {
- ds->dec->releaseObject(ds->prv, newObj, ds->dec);
- return NULL;
- }
+ if (ucs < 0x800)
+ return SetError(ds, -1,
+ "Overlong 3 byte UTF-8 sequence detected "
+ "when encoding string");
+ *(escOffset++) = (wchar_t)ucs;
+ break;
+ }
- if (!ds->dec->arrayAddItem(ds->prv, newObj, itemValue)) {
- ds->dec->releaseObject(ds->prv, newObj, ds->dec);
- return NULL;
- }
+ case 4: {
+ JSUTF32 ucs = 0;
+ ucs |= (*inputOffset++) & 0x07;
+
+ for (index = 0; index < 3; index++) {
+ ucs <<= 6;
+ oct = (*inputOffset++);
- SkipWhitespace(ds);
-
- switch (*(ds->start++)) {
- case ']': {
- ds->objDepth--;
- return ds->dec->endArray(ds->prv, newObj);
- }
- case ',':
- break;
-
- default:
- ds->dec->releaseObject(ds->prv, newObj, ds->dec);
- return SetError(
- ds, -1,
- "Unexpected character found when decoding array value (2)");
+ if ((oct & 0x80) != 0x80) {
+ return SetError(ds, -1,
+ "Invalid octet in UTF-8 sequence when "
+ "decoding 'string'");
}
- len++;
+ ucs |= oct & 0x3f;
+ }
+
+ if (ucs < 0x10000)
+ return SetError(ds, -1,
+ "Overlong 4 byte UTF-8 sequence detected "
+ "when decoding 'string'");
+
+#if WCHAR_MAX == 0xffff
+ if (ucs >= 0x10000) {
+ ucs -= 0x10000;
+ *(escOffset++) = (wchar_t)(ucs >> 10) + 0xd800;
+ *(escOffset++) = (wchar_t)(ucs & 0x3ff) + 0xdc00;
+ } else {
+ *(escOffset++) = (wchar_t)ucs;
+ }
+#else
+ *(escOffset++) = (wchar_t)ucs;
+#endif
+ break;
}
+ }
+ }
}
-JSOBJ FASTCALL_MSVC decode_object(struct DecoderState *ds) {
- JSOBJ itemName;
- JSOBJ itemValue;
- JSOBJ newObj;
+JSOBJ FASTCALL_MSVC decode_array(struct DecoderState *ds) {
+ JSOBJ itemValue;
+ JSOBJ newObj;
+ int len;
+ ds->objDepth++;
+ if (ds->objDepth > JSON_MAX_OBJECT_DEPTH) {
+ return SetError(ds, -1, "Reached object decoding depth limit");
+ }
+
+ newObj = ds->dec->newArray(ds->prv, ds->dec);
+ len = 0;
+
+ ds->lastType = JT_INVALID;
+ ds->start++;
+
+ for (;;) {
+ SkipWhitespace(ds);
+
+ if ((*ds->start) == ']') {
+ ds->objDepth--;
+ if (len == 0) {
+ ds->start++;
+ return ds->dec->endArray(ds->prv, newObj);
+ }
+
+ ds->dec->releaseObject(ds->prv, newObj, ds->dec);
+ return SetError(
+ ds, -1, "Unexpected character found when decoding array value (1)");
+ }
- ds->objDepth++;
- if (ds->objDepth > JSON_MAX_OBJECT_DEPTH) {
- return SetError(ds, -1, "Reached object decoding depth limit");
+ itemValue = decode_any(ds);
+
+ if (itemValue == NULL) {
+ ds->dec->releaseObject(ds->prv, newObj, ds->dec);
+ return NULL;
}
- newObj = ds->dec->newObject(ds->prv, ds->dec);
+ if (!ds->dec->arrayAddItem(ds->prv, newObj, itemValue)) {
+ ds->dec->releaseObject(ds->prv, newObj, ds->dec);
+ return NULL;
+ }
- ds->start++;
+ SkipWhitespace(ds);
- for (;;) {
- SkipWhitespace(ds);
+ switch (*(ds->start++)) {
+ case ']': {
+ ds->objDepth--;
+ return ds->dec->endArray(ds->prv, newObj);
+ }
+ case ',':
+ break;
- if ((*ds->start) == '}') {
- ds->objDepth--;
- ds->start++;
- return ds->dec->endObject(ds->prv, newObj);
- }
+ default:
+ ds->dec->releaseObject(ds->prv, newObj, ds->dec);
+ return SetError(
+ ds, -1, "Unexpected character found when decoding array value (2)");
+ }
- ds->lastType = JT_INVALID;
- itemName = decode_any(ds);
+ len++;
+ }
+}
- if (itemName == NULL) {
- ds->dec->releaseObject(ds->prv, newObj, ds->dec);
- return NULL;
- }
+JSOBJ FASTCALL_MSVC decode_object(struct DecoderState *ds) {
+ JSOBJ itemName;
+ JSOBJ itemValue;
+ JSOBJ newObj;
- if (ds->lastType != JT_UTF8) {
- ds->dec->releaseObject(ds->prv, newObj, ds->dec);
- ds->dec->releaseObject(ds->prv, itemName, ds->dec);
- return SetError(
- ds, -1,
- "Key name of object must be 'string' when decoding 'object'");
- }
+ ds->objDepth++;
+ if (ds->objDepth > JSON_MAX_OBJECT_DEPTH) {
+ return SetError(ds, -1, "Reached object decoding depth limit");
+ }
- SkipWhitespace(ds);
+ newObj = ds->dec->newObject(ds->prv, ds->dec);
- if (*(ds->start++) != ':') {
- ds->dec->releaseObject(ds->prv, newObj, ds->dec);
- ds->dec->releaseObject(ds->prv, itemName, ds->dec);
- return SetError(ds, -1, "No ':' found when decoding object value");
- }
+ ds->start++;
- SkipWhitespace(ds);
+ for (;;) {
+ SkipWhitespace(ds);
- itemValue = decode_any(ds);
+ if ((*ds->start) == '}') {
+ ds->objDepth--;
+ ds->start++;
+ return ds->dec->endObject(ds->prv, newObj);
+ }
- if (itemValue == NULL) {
- ds->dec->releaseObject(ds->prv, newObj, ds->dec);
- ds->dec->releaseObject(ds->prv, itemName, ds->dec);
- return NULL;
- }
+ ds->lastType = JT_INVALID;
+ itemName = decode_any(ds);
- if (!ds->dec->objectAddKey(ds->prv, newObj, itemName, itemValue)) {
- ds->dec->releaseObject(ds->prv, newObj, ds->dec);
- ds->dec->releaseObject(ds->prv, itemName, ds->dec);
- ds->dec->releaseObject(ds->prv, itemValue, ds->dec);
- return NULL;
- }
+ if (itemName == NULL) {
+ ds->dec->releaseObject(ds->prv, newObj, ds->dec);
+ return NULL;
+ }
- SkipWhitespace(ds);
-
- switch (*(ds->start++)) {
- case '}': {
- ds->objDepth--;
- return ds->dec->endObject(ds->prv, newObj);
- }
- case ',':
- break;
-
- default:
- ds->dec->releaseObject(ds->prv, newObj, ds->dec);
- return SetError(
- ds, -1,
- "Unexpected character found when decoding object value");
- }
+ if (ds->lastType != JT_UTF8) {
+ ds->dec->releaseObject(ds->prv, newObj, ds->dec);
+ ds->dec->releaseObject(ds->prv, itemName, ds->dec);
+ return SetError(
+ ds, -1, "Key name of object must be 'string' when decoding 'object'");
}
-}
-JSOBJ FASTCALL_MSVC decode_any(struct DecoderState *ds) {
- for (;;) {
- switch (*ds->start) {
- case '\"':
- return decode_string(ds);
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- case '8':
- case '9':
- case 'I':
- case 'N':
- case '-':
- return decode_numeric(ds);
-
- case '[':
- return decode_array(ds);
- case '{':
- return decode_object(ds);
- case 't':
- return decode_true(ds);
- case 'f':
- return decode_false(ds);
- case 'n':
- return decode_null(ds);
-
- case ' ':
- case '\t':
- case '\r':
- case '\n':
- // White space
- ds->start++;
- break;
-
- default:
- return SetError(ds, -1, "Expected object or value");
- }
+ SkipWhitespace(ds);
+
+ if (*(ds->start++) != ':') {
+ ds->dec->releaseObject(ds->prv, newObj, ds->dec);
+ ds->dec->releaseObject(ds->prv, itemName, ds->dec);
+ return SetError(ds, -1, "No ':' found when decoding object value");
}
-}
-JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer,
- size_t cbBuffer) {
- /*
- FIXME: Base the size of escBuffer of that of cbBuffer so that the unicode
- escaping doesn't run into the wall each time */
- char *locale;
- struct DecoderState ds;
- wchar_t escBuffer[(JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t))];
- JSOBJ ret;
-
- ds.start = (char *)buffer;
- ds.end = ds.start + cbBuffer;
-
- ds.escStart = escBuffer;
- ds.escEnd = ds.escStart + (JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t));
- ds.escHeap = 0;
- ds.prv = dec->prv;
- ds.dec = dec;
- ds.dec->errorStr = NULL;
- ds.dec->errorOffset = NULL;
- ds.objDepth = 0;
-
- ds.dec = dec;
-
- locale = setlocale(LC_NUMERIC, NULL);
- if (!locale) {
- return SetError(&ds, -1, "setlocale call failed");
+ SkipWhitespace(ds);
+
+ itemValue = decode_any(ds);
+
+ if (itemValue == NULL) {
+ ds->dec->releaseObject(ds->prv, newObj, ds->dec);
+ ds->dec->releaseObject(ds->prv, itemName, ds->dec);
+ return NULL;
}
- if (strcmp(locale, "C")) {
- size_t len = strlen(locale) + 1;
- char *saved_locale = malloc(len);
- if (saved_locale == NULL) {
- return SetError(&ds, -1, "Could not reserve memory block");
- }
- memcpy(saved_locale, locale, len);
- setlocale(LC_NUMERIC, "C");
- ret = decode_any(&ds);
- setlocale(LC_NUMERIC, saved_locale);
- free(saved_locale);
- } else {
- ret = decode_any(&ds);
+ if (!ds->dec->objectAddKey(ds->prv, newObj, itemName, itemValue)) {
+ ds->dec->releaseObject(ds->prv, newObj, ds->dec);
+ ds->dec->releaseObject(ds->prv, itemName, ds->dec);
+ ds->dec->releaseObject(ds->prv, itemValue, ds->dec);
+ return NULL;
}
- if (ds.escHeap) {
- dec->free(ds.escStart);
+ SkipWhitespace(ds);
+
+ switch (*(ds->start++)) {
+ case '}': {
+ ds->objDepth--;
+ return ds->dec->endObject(ds->prv, newObj);
}
+ case ',':
+ break;
- SkipWhitespace(&ds);
+ default:
+ ds->dec->releaseObject(ds->prv, newObj, ds->dec);
+ return SetError(ds, -1,
+ "Unexpected character found when decoding object value");
+ }
+ }
+}
- if (ds.start != ds.end && ret) {
- dec->releaseObject(ds.prv, ret, ds.dec);
- return SetError(&ds, -1, "Trailing data");
+JSOBJ FASTCALL_MSVC decode_any(struct DecoderState *ds) {
+ for (;;) {
+ switch (*ds->start) {
+ case '\"':
+ return decode_string(ds);
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ case 'I':
+ case 'N':
+ case '-':
+ return decode_numeric(ds);
+
+ case '[':
+ return decode_array(ds);
+ case '{':
+ return decode_object(ds);
+ case 't':
+ return decode_true(ds);
+ case 'f':
+ return decode_false(ds);
+ case 'n':
+ return decode_null(ds);
+
+ case ' ':
+ case '\t':
+ case '\r':
+ case '\n':
+ // White space
+ ds->start++;
+ break;
+
+ default:
+ return SetError(ds, -1, "Expected object or value");
}
+ }
+}
- return ret;
+JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer,
+ size_t cbBuffer) {
+ /*
+ FIXME: Base the size of escBuffer of that of cbBuffer so that the unicode
+ escaping doesn't run into the wall each time */
+ char *locale;
+ struct DecoderState ds;
+ wchar_t escBuffer[(JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t))];
+ JSOBJ ret;
+
+ ds.start = (char *)buffer;
+ ds.end = ds.start + cbBuffer;
+
+ ds.escStart = escBuffer;
+ ds.escEnd = ds.escStart + (JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t));
+ ds.escHeap = 0;
+ ds.prv = dec->prv;
+ ds.dec = dec;
+ ds.dec->errorStr = NULL;
+ ds.dec->errorOffset = NULL;
+ ds.objDepth = 0;
+
+ ds.dec = dec;
+
+ locale = setlocale(LC_NUMERIC, NULL);
+ if (!locale) {
+ return SetError(&ds, -1, "setlocale call failed");
+ }
+
+ if (strcmp(locale, "C")) {
+ size_t len = strlen(locale) + 1;
+ char *saved_locale = malloc(len);
+ if (saved_locale == NULL) {
+ return SetError(&ds, -1, "Could not reserve memory block");
+ }
+ memcpy(saved_locale, locale, len);
+ setlocale(LC_NUMERIC, "C");
+ ret = decode_any(&ds);
+ setlocale(LC_NUMERIC, saved_locale);
+ free(saved_locale);
+ } else {
+ ret = decode_any(&ds);
+ }
+
+ if (ds.escHeap) {
+ dec->free(ds.escStart);
+ }
+
+ SkipWhitespace(&ds);
+
+ if (ds.start != ds.end && ret) {
+ dec->releaseObject(ds.prv, ret, ds.dec);
+ return SetError(&ds, -1, "Trailing data");
+ }
+
+ return ret;
}
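
Editorial aside on JSON_DecodeObject as rewritten above: strtod (used by decodePreciseFloat) is sensitive to LC_NUMERIC, so the decoder saves the current numeric locale, forces "C" while parsing, and restores the saved locale afterwards, copying the setlocale result first because the returned buffer may be overwritten. A standalone sketch of that pattern, not the vendored code itself:

#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static double parse_double_c_locale(const char *text) {
  double value;
  char *locale = setlocale(LC_NUMERIC, NULL);
  if (locale != NULL && strcmp(locale, "C") != 0) {
    /* Copy the locale name: setlocale may reuse its internal buffer. */
    size_t len = strlen(locale) + 1;
    char *saved = malloc(len);
    if (saved == NULL) {
      return 0.0; /* demo only; a real caller would surface the error */
    }
    memcpy(saved, locale, len);
    setlocale(LC_NUMERIC, "C");
    value = strtod(text, NULL);
    setlocale(LC_NUMERIC, saved);
    free(saved);
  } else {
    value = strtod(text, NULL);
  }
  return value;
}

int main(void) {
  printf("%g\n", parse_double_c_locale("2.5")); /* 2.5 regardless of locale */
  return 0;
}
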
diff --git a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c
index 726676799af65..c8d8b5ab6bd6e 100644
--- a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c
+++ b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c
@@ -38,14 +38,16 @@ Numeric decoder derived from TCL library
* Copyright (c) 1994 Sun Microsystems, Inc.
*/
-#include
-#include
+// Licence at LICENSES/ULTRAJSON_LICENSE
+
+#include "pandas/portable.h"
+#include "pandas/vendored/ujson/lib/ultrajson.h"
#include
#include
+#include
#include
#include
#include
-#include "pandas/vendored/ujson/lib/ultrajson.h"
#ifndef TRUE
#define TRUE 1
@@ -69,7 +71,7 @@ or UTF-16 surrogate pairs
The extra 2 bytes are for the quotes around the string
*/
-#define RESERVE_STRING(_len) (2 + ((_len)*6))
+#define RESERVE_STRING(_len) (2 + ((_len) * 6))
static const double g_pow10[] = {1,
10,
@@ -356,8 +358,8 @@ static const JSUINT8 g_asciiOutputTable[256] = {
1};
static void SetError(JSOBJ obj, JSONObjectEncoder *enc, const char *message) {
- enc->errorMsg = message;
- enc->errorObj = obj;
+ enc->errorMsg = message;
+ enc->errorObj = obj;
}
/*
@@ -365,371 +367,364 @@ FIXME: Keep track of how big these get across several encoder calls and try to
make an estimate
That way we won't run our head into the wall each call */
void Buffer_Realloc(JSONObjectEncoder *enc, size_t cbNeeded) {
- size_t curSize = enc->end - enc->start;
- size_t newSize = curSize * 2;
- size_t offset = enc->offset - enc->start;
+ size_t curSize = enc->end - enc->start;
+ size_t newSize = curSize * 2;
+ size_t offset = enc->offset - enc->start;
- while (newSize < curSize + cbNeeded) {
- newSize *= 2;
- }
+ while (newSize < curSize + cbNeeded) {
+ newSize *= 2;
+ }
- if (enc->heap) {
- enc->start = (char *)enc->realloc(enc->start, newSize);
- if (!enc->start) {
- SetError(NULL, enc, "Could not reserve memory block");
- return;
- }
- } else {
- char *oldStart = enc->start;
- enc->heap = 1;
- enc->start = (char *)enc->malloc(newSize);
- if (!enc->start) {
- SetError(NULL, enc, "Could not reserve memory block");
- return;
- }
- memcpy(enc->start, oldStart, offset);
+ if (enc->heap) {
+ enc->start = (char *)enc->realloc(enc->start, newSize);
+ if (!enc->start) {
+ SetError(NULL, enc, "Could not reserve memory block");
+ return;
+ }
+ } else {
+ char *oldStart = enc->start;
+ enc->heap = 1;
+ enc->start = (char *)enc->malloc(newSize);
+ if (!enc->start) {
+ SetError(NULL, enc, "Could not reserve memory block");
+ return;
}
- enc->offset = enc->start + offset;
- enc->end = enc->start + newSize;
+ memcpy(enc->start, oldStart, offset);
+ }
+ enc->offset = enc->start + offset;
+ enc->end = enc->start + newSize;
}
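
Editorial aside: Buffer_Realloc above grows the encoder's output buffer geometrically, doubling until the pending write fits, and moves a stack-allocated buffer to the heap the first time it has to grow. A reduced sketch of just the sizing rule:

#include <stdio.h>

static size_t next_buffer_size(size_t cur_size, size_t needed) {
  size_t new_size = cur_size * 2;
  while (new_size < cur_size + needed) {
    new_size *= 2;
  }
  return new_size;
}

int main(void) {
  /* 64 KiB buffer, 200 KB still to write -> grow to 512 KiB. */
  printf("%zu\n", next_buffer_size(65536, 200000));
  return 0;
}
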
INLINE_PREFIX void FASTCALL_MSVC
Buffer_AppendShortHexUnchecked(char *outputOffset, unsigned short value) {
- *(outputOffset++) = g_hexChars[(value & 0xf000) >> 12];
- *(outputOffset++) = g_hexChars[(value & 0x0f00) >> 8];
- *(outputOffset++) = g_hexChars[(value & 0x00f0) >> 4];
- *(outputOffset++) = g_hexChars[(value & 0x000f) >> 0];
+ *(outputOffset++) = g_hexChars[(value & 0xf000) >> 12];
+ *(outputOffset++) = g_hexChars[(value & 0x0f00) >> 8];
+ *(outputOffset++) = g_hexChars[(value & 0x00f0) >> 4];
+ *(outputOffset++) = g_hexChars[(value & 0x000f) >> 0];
}
int Buffer_EscapeStringUnvalidated(JSONObjectEncoder *enc, const char *io,
const char *end) {
- char *of = (char *)enc->offset;
-
- for (;;) {
- switch (*io) {
- case 0x00: {
- if (io < end) {
- *(of++) = '\\';
- *(of++) = 'u';
- *(of++) = '0';
- *(of++) = '0';
- *(of++) = '0';
- *(of++) = '0';
- break;
- } else {
- enc->offset += (of - enc->offset);
- return TRUE;
- }
- }
- case '\"':
- (*of++) = '\\';
- (*of++) = '\"';
- break;
- case '\\':
- (*of++) = '\\';
- (*of++) = '\\';
- break;
- case '/':
- (*of++) = '\\';
- (*of++) = '/';
- break;
- case '\b':
- (*of++) = '\\';
- (*of++) = 'b';
- break;
- case '\f':
- (*of++) = '\\';
- (*of++) = 'f';
- break;
- case '\n':
- (*of++) = '\\';
- (*of++) = 'n';
- break;
- case '\r':
- (*of++) = '\\';
- (*of++) = 'r';
- break;
- case '\t':
- (*of++) = '\\';
- (*of++) = 't';
- break;
-
- case 0x26: // '/'
- case 0x3c: // '<'
- case 0x3e: // '>'
- {
- if (enc->encodeHTMLChars) {
- // Fall through to \u00XX case below.
- } else {
- // Same as default case below.
- (*of++) = (*io);
- break;
- }
- }
- case 0x01:
- case 0x02:
- case 0x03:
- case 0x04:
- case 0x05:
- case 0x06:
- case 0x07:
- case 0x0b:
- case 0x0e:
- case 0x0f:
- case 0x10:
- case 0x11:
- case 0x12:
- case 0x13:
- case 0x14:
- case 0x15:
- case 0x16:
- case 0x17:
- case 0x18:
- case 0x19:
- case 0x1a:
- case 0x1b:
- case 0x1c:
- case 0x1d:
- case 0x1e:
- case 0x1f: {
- *(of++) = '\\';
- *(of++) = 'u';
- *(of++) = '0';
- *(of++) = '0';
- *(of++) = g_hexChars[(unsigned char)(((*io) & 0xf0) >> 4)];
- *(of++) = g_hexChars[(unsigned char)((*io) & 0x0f)];
- break;
- }
- default:
- (*of++) = (*io);
- break;
- }
- io++;
+ char *of = (char *)enc->offset;
+
+ for (;;) {
+ switch (*io) {
+ case 0x00: {
+ if (io < end) {
+ *(of++) = '\\';
+ *(of++) = 'u';
+ *(of++) = '0';
+ *(of++) = '0';
+ *(of++) = '0';
+ *(of++) = '0';
+ break;
+ } else {
+ enc->offset += (of - enc->offset);
+ return TRUE;
+ }
}
+ case '\"':
+ (*of++) = '\\';
+ (*of++) = '\"';
+ break;
+ case '\\':
+ (*of++) = '\\';
+ (*of++) = '\\';
+ break;
+ case '/':
+ (*of++) = '\\';
+ (*of++) = '/';
+ break;
+ case '\b':
+ (*of++) = '\\';
+ (*of++) = 'b';
+ break;
+ case '\f':
+ (*of++) = '\\';
+ (*of++) = 'f';
+ break;
+ case '\n':
+ (*of++) = '\\';
+ (*of++) = 'n';
+ break;
+ case '\r':
+ (*of++) = '\\';
+ (*of++) = 'r';
+ break;
+ case '\t':
+ (*of++) = '\\';
+ (*of++) = 't';
+ break;
+
+ case 0x26: // '/'
+ case 0x3c: // '<'
+ case 0x3e: // '>'
+ {
+ if (enc->encodeHTMLChars) {
+ // Fall through to \u00XX case below.
+ PD_FALLTHROUGH;
+ } else {
+ // Same as default case below.
+ (*of++) = (*io);
+ break;
+ }
+ }
+ case 0x01:
+ case 0x02:
+ case 0x03:
+ case 0x04:
+ case 0x05:
+ case 0x06:
+ case 0x07:
+ case 0x0b:
+ case 0x0e:
+ case 0x0f:
+ case 0x10:
+ case 0x11:
+ case 0x12:
+ case 0x13:
+ case 0x14:
+ case 0x15:
+ case 0x16:
+ case 0x17:
+ case 0x18:
+ case 0x19:
+ case 0x1a:
+ case 0x1b:
+ case 0x1c:
+ case 0x1d:
+ case 0x1e:
+ case 0x1f: {
+ *(of++) = '\\';
+ *(of++) = 'u';
+ *(of++) = '0';
+ *(of++) = '0';
+ *(of++) = g_hexChars[(unsigned char)(((*io) & 0xf0) >> 4)];
+ *(of++) = g_hexChars[(unsigned char)((*io) & 0x0f)];
+ break;
+ }
+ default:
+ (*of++) = (*io);
+ break;
+ }
+ io++;
+ }
}
int Buffer_EscapeStringValidated(JSOBJ obj, JSONObjectEncoder *enc,
const char *io, const char *end) {
- JSUTF32 ucs;
- char *of = (char *)enc->offset;
-
- for (;;) {
- JSUINT8 utflen = g_asciiOutputTable[(unsigned char)*io];
-
- switch (utflen) {
- case 0: {
- if (io < end) {
- *(of++) = '\\';
- *(of++) = 'u';
- *(of++) = '0';
- *(of++) = '0';
- *(of++) = '0';
- *(of++) = '0';
- io++;
- continue;
- } else {
- enc->offset += (of - enc->offset);
- return TRUE;
- }
- }
-
- case 1: {
- *(of++) = (*io++);
- continue;
- }
-
- case 2: {
- JSUTF32 in;
- JSUTF16 in16;
-
- if (end - io < 1) {
- enc->offset += (of - enc->offset);
- SetError(
- obj, enc,
- "Unterminated UTF-8 sequence when encoding string");
- return FALSE;
- }
-
- memcpy(&in16, io, sizeof(JSUTF16));
- in = (JSUTF32)in16;
+ JSUTF32 ucs;
+ char *of = (char *)enc->offset;
+
+ for (;;) {
+ JSUINT8 utflen = g_asciiOutputTable[(unsigned char)*io];
+
+ switch (utflen) {
+ case 0: {
+ if (io < end) {
+ *(of++) = '\\';
+ *(of++) = 'u';
+ *(of++) = '0';
+ *(of++) = '0';
+ *(of++) = '0';
+ *(of++) = '0';
+ io++;
+ continue;
+ } else {
+ enc->offset += (of - enc->offset);
+ return TRUE;
+ }
+ }
+
+ case 1: {
+ *(of++) = (*io++);
+ continue;
+ }
+
+ case 2: {
+ JSUTF32 in;
+ JSUTF16 in16;
+
+ if (end - io < 1) {
+ enc->offset += (of - enc->offset);
+ SetError(obj, enc, "Unterminated UTF-8 sequence when encoding string");
+ return FALSE;
+ }
+
+ memcpy(&in16, io, sizeof(JSUTF16));
+ in = (JSUTF32)in16;
#ifdef __LITTLE_ENDIAN__
- ucs = ((in & 0x1f) << 6) | ((in >> 8) & 0x3f);
+ ucs = ((in & 0x1f) << 6) | ((in >> 8) & 0x3f);
#else
- ucs = ((in & 0x1f00) >> 2) | (in & 0x3f);
+ ucs = ((in & 0x1f00) >> 2) | (in & 0x3f);
#endif
- if (ucs < 0x80) {
- enc->offset += (of - enc->offset);
- SetError(obj, enc,
- "Overlong 2 byte UTF-8 sequence detected when "
- "encoding string");
- return FALSE;
- }
-
- io += 2;
- break;
- }
-
- case 3: {
- JSUTF32 in;
- JSUTF16 in16;
- JSUINT8 in8;
-
- if (end - io < 2) {
- enc->offset += (of - enc->offset);
- SetError(
- obj, enc,
- "Unterminated UTF-8 sequence when encoding string");
- return FALSE;
- }
-
- memcpy(&in16, io, sizeof(JSUTF16));
- memcpy(&in8, io + 2, sizeof(JSUINT8));
+ if (ucs < 0x80) {
+ enc->offset += (of - enc->offset);
+ SetError(obj, enc,
+ "Overlong 2 byte UTF-8 sequence detected when "
+ "encoding string");
+ return FALSE;
+ }
+
+ io += 2;
+ break;
+ }
+
+ case 3: {
+ JSUTF32 in;
+ JSUTF16 in16;
+ JSUINT8 in8;
+
+ if (end - io < 2) {
+ enc->offset += (of - enc->offset);
+ SetError(obj, enc, "Unterminated UTF-8 sequence when encoding string");
+ return FALSE;
+ }
+
+ memcpy(&in16, io, sizeof(JSUTF16));
+ memcpy(&in8, io + 2, sizeof(JSUINT8));
#ifdef __LITTLE_ENDIAN__
- in = (JSUTF32)in16;
- in |= in8 << 16;
- ucs = ((in & 0x0f) << 12) | ((in & 0x3f00) >> 2) |
- ((in & 0x3f0000) >> 16);
+ in = (JSUTF32)in16;
+ in |= in8 << 16;
+ ucs =
+ ((in & 0x0f) << 12) | ((in & 0x3f00) >> 2) | ((in & 0x3f0000) >> 16);
#else
- in = in16 << 8;
- in |= in8;
- ucs =
- ((in & 0x0f0000) >> 4) | ((in & 0x3f00) >> 2) | (in & 0x3f);
+ in = in16 << 8;
+ in |= in8;
+ ucs = ((in & 0x0f0000) >> 4) | ((in & 0x3f00) >> 2) | (in & 0x3f);
#endif
- if (ucs < 0x800) {
- enc->offset += (of - enc->offset);
- SetError(obj, enc,
- "Overlong 3 byte UTF-8 sequence detected when "
- "encoding string");
- return FALSE;
- }
-
- io += 3;
- break;
- }
- case 4: {
- JSUTF32 in;
-
- if (end - io < 3) {
- enc->offset += (of - enc->offset);
- SetError(
- obj, enc,
- "Unterminated UTF-8 sequence when encoding string");
- return FALSE;
- }
-
- memcpy(&in, io, sizeof(JSUTF32));
+ if (ucs < 0x800) {
+ enc->offset += (of - enc->offset);
+ SetError(obj, enc,
+ "Overlong 3 byte UTF-8 sequence detected when "
+ "encoding string");
+ return FALSE;
+ }
+
+ io += 3;
+ break;
+ }
+ case 4: {
+ JSUTF32 in;
+
+ if (end - io < 3) {
+ enc->offset += (of - enc->offset);
+ SetError(obj, enc, "Unterminated UTF-8 sequence when encoding string");
+ return FALSE;
+ }
+
+ memcpy(&in, io, sizeof(JSUTF32));
#ifdef __LITTLE_ENDIAN__
- ucs = ((in & 0x07) << 18) | ((in & 0x3f00) << 4) |
- ((in & 0x3f0000) >> 10) | ((in & 0x3f000000) >> 24);
+ ucs = ((in & 0x07) << 18) | ((in & 0x3f00) << 4) |
+ ((in & 0x3f0000) >> 10) | ((in & 0x3f000000) >> 24);
#else
- ucs = ((in & 0x07000000) >> 6) | ((in & 0x3f0000) >> 4) |
- ((in & 0x3f00) >> 2) | (in & 0x3f);
+ ucs = ((in & 0x07000000) >> 6) | ((in & 0x3f0000) >> 4) |
+ ((in & 0x3f00) >> 2) | (in & 0x3f);
#endif
- if (ucs < 0x10000) {
- enc->offset += (of - enc->offset);
- SetError(obj, enc,
- "Overlong 4 byte UTF-8 sequence detected when "
- "encoding string");
- return FALSE;
- }
-
- io += 4;
- break;
- }
-
- case 5:
- case 6: {
- enc->offset += (of - enc->offset);
- SetError(
- obj, enc,
- "Unsupported UTF-8 sequence length when encoding string");
- return FALSE;
- }
-
- case 29: {
- if (enc->encodeHTMLChars) {
- // Fall through to \u00XX case 30 below.
- } else {
- // Same as case 1 above.
- *(of++) = (*io++);
- continue;
- }
- }
-
- case 30: {
- // \uXXXX encode
- *(of++) = '\\';
- *(of++) = 'u';
- *(of++) = '0';
- *(of++) = '0';
- *(of++) = g_hexChars[(unsigned char)(((*io) & 0xf0) >> 4)];
- *(of++) = g_hexChars[(unsigned char)((*io) & 0x0f)];
- io++;
- continue;
- }
- case 10:
- case 12:
- case 14:
- case 16:
- case 18:
- case 20:
- case 22:
- case 24: {
- *(of++) = *((char *)(g_escapeChars + utflen + 0));
- *(of++) = *((char *)(g_escapeChars + utflen + 1));
- io++;
- continue;
- }
- // This can never happen, it's here to make L4 VC++ happy
- default: {
- ucs = 0;
- break;
- }
- }
-
- /*
- If the character is a UTF8 sequence of length > 1 we end up here */
- if (ucs >= 0x10000) {
- ucs -= 0x10000;
- *(of++) = '\\';
- *(of++) = 'u';
- Buffer_AppendShortHexUnchecked(
- of, (unsigned short)(ucs >> 10) + 0xd800);
- of += 4;
-
- *(of++) = '\\';
- *(of++) = 'u';
- Buffer_AppendShortHexUnchecked(
- of, (unsigned short)(ucs & 0x3ff) + 0xdc00);
- of += 4;
- } else {
- *(of++) = '\\';
- *(of++) = 'u';
- Buffer_AppendShortHexUnchecked(of, (unsigned short)ucs);
- of += 4;
- }
+ if (ucs < 0x10000) {
+ enc->offset += (of - enc->offset);
+ SetError(obj, enc,
+ "Overlong 4 byte UTF-8 sequence detected when "
+ "encoding string");
+ return FALSE;
+ }
+
+ io += 4;
+ break;
}
+
+ case 5:
+ case 6: {
+ enc->offset += (of - enc->offset);
+ SetError(obj, enc,
+ "Unsupported UTF-8 sequence length when encoding string");
+ return FALSE;
+ }
+
+ case 29: {
+ if (enc->encodeHTMLChars) {
+ // Fall through to \u00XX case 30 below.
+ PD_FALLTHROUGH;
+ } else {
+ // Same as case 1 above.
+ *(of++) = (*io++);
+ continue;
+ }
+ }
+
+ case 30: {
+ // \uXXXX encode
+ *(of++) = '\\';
+ *(of++) = 'u';
+ *(of++) = '0';
+ *(of++) = '0';
+ *(of++) = g_hexChars[(unsigned char)(((*io) & 0xf0) >> 4)];
+ *(of++) = g_hexChars[(unsigned char)((*io) & 0x0f)];
+ io++;
+ continue;
+ }
+ case 10:
+ case 12:
+ case 14:
+ case 16:
+ case 18:
+ case 20:
+ case 22:
+ case 24: {
+ *(of++) = *((char *)(g_escapeChars + utflen + 0));
+ *(of++) = *((char *)(g_escapeChars + utflen + 1));
+ io++;
+ continue;
+ }
+ // This can never happen, it's here to make L4 VC++ happy
+ default: {
+ ucs = 0;
+ break;
+ }
+ }
+
+ /*
+ If the character is a UTF8 sequence of length > 1 we end up here */
+ if (ucs >= 0x10000) {
+ ucs -= 0x10000;
+ *(of++) = '\\';
+ *(of++) = 'u';
+ Buffer_AppendShortHexUnchecked(of, (unsigned short)(ucs >> 10) + 0xd800);
+ of += 4;
+
+ *(of++) = '\\';
+ *(of++) = 'u';
+ Buffer_AppendShortHexUnchecked(of,
+ (unsigned short)(ucs & 0x3ff) + 0xdc00);
+ of += 4;
+ } else {
+ *(of++) = '\\';
+ *(of++) = 'u';
+ Buffer_AppendShortHexUnchecked(of, (unsigned short)ucs);
+ of += 4;
+ }
+ }
}
-#define Buffer_Reserve(__enc, __len) \
- if ( (size_t) ((__enc)->end - (__enc)->offset) < (size_t) (__len)) \
- { \
- Buffer_Realloc((__enc), (__len));\
- } \
+#define Buffer_Reserve(__enc, __len) \
+ if ((size_t)((__enc)->end - (__enc)->offset) < (size_t)(__len)) { \
+ Buffer_Realloc((__enc), (__len)); \
+ }
#define Buffer_AppendCharUnchecked(__enc, __chr) *((__enc)->offset++) = __chr;
-INLINE_PREFIX void FASTCALL_MSVC strreverse(char *begin,
- char *end) {
- char aux;
- while (end > begin) aux = *end, *end-- = *begin, *begin++ = aux;
+INLINE_PREFIX void FASTCALL_MSVC strreverse(char *begin, char *end) {
+ char aux;
+ while (end > begin)
+ aux = *end, *end-- = *begin, *begin++ = aux;
}
void Buffer_AppendIndentNewlineUnchecked(JSONObjectEncoder *enc) {
- if (enc->indent > 0) Buffer_AppendCharUnchecked(enc, '\n');
+ if (enc->indent > 0)
+ Buffer_AppendCharUnchecked(enc, '\n');
}
// This function could be refactored to only accept enc as an argument,
@@ -744,167 +739,174 @@ void Buffer_AppendIndentUnchecked(JSONObjectEncoder *enc, JSINT32 value) {
}
void Buffer_AppendIntUnchecked(JSONObjectEncoder *enc, JSINT32 value) {
- char *wstr;
- JSUINT32 uvalue = (value < 0) ? -value : value;
- wstr = enc->offset;
-
- // Conversion. Number is reversed.
- do {
- *wstr++ = (char)(48 + (uvalue % 10));
- } while (uvalue /= 10);
- if (value < 0) *wstr++ = '-';
-
- // Reverse string
- strreverse(enc->offset, wstr - 1);
- enc->offset += (wstr - (enc->offset));
+ char *wstr;
+ JSUINT32 uvalue = (value < 0) ? -value : value;
+ wstr = enc->offset;
+
+ // Conversion. Number is reversed.
+ do {
+ *wstr++ = (char)(48 + (uvalue % 10));
+ } while (uvalue /= 10);
+ if (value < 0)
+ *wstr++ = '-';
+
+ // Reverse string
+ strreverse(enc->offset, wstr - 1);
+ enc->offset += (wstr - (enc->offset));
}
void Buffer_AppendLongUnchecked(JSONObjectEncoder *enc, JSINT64 value) {
- char *wstr;
- JSUINT64 uvalue = (value < 0) ? -value : value;
+ char *wstr;
+ JSUINT64 uvalue;
+ if (value == INT64_MIN) {
+ uvalue = INT64_MAX + UINT64_C(1);
+ } else {
+ uvalue = (value < 0) ? -value : value;
+ }
- wstr = enc->offset;
- // Conversion. Number is reversed.
+ wstr = enc->offset;
+ // Conversion. Number is reversed.
- do {
- *wstr++ = (char)(48 + (uvalue % 10ULL));
- } while (uvalue /= 10ULL);
- if (value < 0) *wstr++ = '-';
+ do {
+ *wstr++ = (char)(48 + (uvalue % 10ULL));
+ } while (uvalue /= 10ULL);
+ if (value < 0)
+ *wstr++ = '-';
- // Reverse string
- strreverse(enc->offset, wstr - 1);
- enc->offset += (wstr - (enc->offset));
+ // Reverse string
+ strreverse(enc->offset, wstr - 1);
+ enc->offset += (wstr - (enc->offset));
}
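
Editorial aside on the behavioral change just above: negating INT64_MIN is undefined behavior in C, so the reformatted Buffer_AppendLongUnchecked now special-cases it and builds the magnitude as an unsigned value. A minimal sketch of the same idea:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Magnitude of a signed 64-bit value without the UB of negating INT64_MIN. */
static uint64_t abs_u64(int64_t value) {
  if (value == INT64_MIN) {
    return (uint64_t)INT64_MAX + 1u; /* 2^63 is not representable as int64_t */
  }
  return (uint64_t)(value < 0 ? -value : value);
}

int main(void) {
  printf("%" PRIu64 "\n", abs_u64(INT64_MIN)); /* 9223372036854775808 */
  printf("%" PRIu64 "\n", abs_u64(-42));       /* 42 */
  return 0;
}
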
int Buffer_AppendDoubleUnchecked(JSOBJ obj, JSONObjectEncoder *enc,
double value) {
- /* if input is beyond the thresholds, revert to exponential */
- const double thres_max = (double)1e16 - 1;
- const double thres_min = (double)1e-15;
- char precision_str[20];
- int count;
- double diff = 0.0;
- char *str = enc->offset;
- char *wstr = str;
- unsigned long long whole;
- double tmp;
- unsigned long long frac;
- int neg;
- double pow10;
-
- if (value == HUGE_VAL || value == -HUGE_VAL) {
- SetError(obj, enc, "Invalid Inf value when encoding double");
- return FALSE;
- }
+ /* if input is beyond the thresholds, revert to exponential */
+ const double thres_max = (double)1e16 - 1;
+ const double thres_min = (double)1e-15;
+ char precision_str[20];
+ int count;
+ double diff = 0.0;
+ char *str = enc->offset;
+ char *wstr = str;
+ unsigned long long whole;
+ double tmp;
+ unsigned long long frac;
+ int neg;
+ double pow10;
+
+ if (value == HUGE_VAL || value == -HUGE_VAL) {
+ SetError(obj, enc, "Invalid Inf value when encoding double");
+ return FALSE;
+ }
- if (!(value == value)) {
- SetError(obj, enc, "Invalid Nan value when encoding double");
- return FALSE;
- }
+ if (!(value == value)) {
+ SetError(obj, enc, "Invalid Nan value when encoding double");
+ return FALSE;
+ }
- /* we'll work in positive values and deal with the
- negative sign issue later */
- neg = 0;
- if (value < 0) {
- neg = 1;
- value = -value;
- }
+ /* we'll work in positive values and deal with the
+ negative sign issue later */
+ neg = 0;
+ if (value < 0) {
+ neg = 1;
+ value = -value;
+ }
- /*
- for very large or small numbers switch back to native sprintf for
- exponentials. anyone want to write code to replace this? */
- if (value > thres_max || (value != 0.0 && fabs(value) < thres_min)) {
- precision_str[0] = '%';
- precision_str[1] = '.';
+ /*
+ for very large or small numbers switch back to native sprintf for
+ exponentials. anyone want to write code to replace this? */
+ if (value > thres_max || (value != 0.0 && fabs(value) < thres_min)) {
+ precision_str[0] = '%';
+ precision_str[1] = '.';
#if defined(_WIN32) && defined(_MSC_VER)
- sprintf_s(precision_str + 2, sizeof(precision_str) - 2, "%ug",
- enc->doublePrecision);
- enc->offset += sprintf_s(str, enc->end - enc->offset, precision_str,
- neg ? -value : value);
+ sprintf_s(precision_str + 2, sizeof(precision_str) - 2, "%ug",
+ enc->doublePrecision);
+ enc->offset += sprintf_s(str, enc->end - enc->offset, precision_str,
+ neg ? -value : value);
#else
- snprintf(precision_str + 2, sizeof(precision_str) - 2, "%ug",
- enc->doublePrecision);
- enc->offset += snprintf(str, enc->end - enc->offset, precision_str,
- neg ? -value : value);
+ snprintf(precision_str + 2, sizeof(precision_str) - 2, "%ug",
+ enc->doublePrecision);
+ enc->offset += snprintf(str, enc->end - enc->offset, precision_str,
+ neg ? -value : value);
#endif
- return TRUE;
- }
+ return TRUE;
+ }
- pow10 = g_pow10[enc->doublePrecision];
+ pow10 = g_pow10[enc->doublePrecision];
- whole = (unsigned long long)value;
- tmp = (value - whole) * pow10;
- frac = (unsigned long long)(tmp);
- diff = tmp - frac;
+ whole = (unsigned long long)value;
+ tmp = (value - whole) * pow10;
+ frac = (unsigned long long)(tmp);
+ diff = tmp - frac;
+
+ if (diff > 0.5) {
+ ++frac;
+ } else if (diff == 0.5 && ((frac == 0) || (frac & 1))) {
+ /* if halfway, round up if odd, OR
+ if last digit is 0. That last part is strange */
+ ++frac;
+ }
+
+ // handle rollover, e.g.
+ // case 0.99 with prec 1 is 1.0 and case 0.95 with prec 1 is 1.0 as well
+ if (frac >= pow10) {
+ frac = 0;
+ ++whole;
+ }
+
+ if (enc->doublePrecision == 0) {
+ diff = value - whole;
if (diff > 0.5) {
- ++frac;
- } else if (diff == 0.5 && ((frac == 0) || (frac & 1))) {
- /* if halfway, round up if odd, OR
- if last digit is 0. That last part is strange */
- ++frac;
+ /* greater than 0.5, round up, e.g. 1.6 -> 2 */
+ ++whole;
+ } else if (diff == 0.5 && (whole & 1)) {
+ /* exactly 0.5 and ODD, then round up */
+ /* 1.5 -> 2, but 2.5 -> 2 */
+ ++whole;
}
- // handle rollover, e.g.
- // case 0.99 with prec 1 is 1.0 and case 0.95 with prec is 1.0 as well
- if (frac >= pow10) {
- frac = 0;
- ++whole;
+ // vvvvvvvvvvvvvvvvvvv Diff from modp_dto2
+ } else if (frac) {
+ count = enc->doublePrecision;
+ // now do fractional part, as an unsigned number
+ // we know it is not 0 but we can have leading zeros, these
+ // should be removed
+ while (!(frac % 10)) {
+ --count;
+ frac /= 10;
}
+ //^^^^^^^^^^^^^^^^^^^ Diff from modp_dto2
- if (enc->doublePrecision == 0) {
- diff = value - whole;
-
- if (diff > 0.5) {
- /* greater than 0.5, round up, e.g. 1.6 -> 2 */
- ++whole;
- } else if (diff == 0.5 && (whole & 1)) {
- /* exactly 0.5 and ODD, then round up */
- /* 1.5 -> 2, but 2.5 -> 2 */
- ++whole;
- }
-
- // vvvvvvvvvvvvvvvvvvv Diff from modp_dto2
- } else if (frac) {
- count = enc->doublePrecision;
- // now do fractional part, as an unsigned number
- // we know it is not 0 but we can have leading zeros, these
- // should be removed
- while (!(frac % 10)) {
- --count;
- frac /= 10;
- }
- //^^^^^^^^^^^^^^^^^^^ Diff from modp_dto2
-
- // now do fractional part, as an unsigned number
- do {
- --count;
- *wstr++ = (char)(48 + (frac % 10));
- } while (frac /= 10);
- // add extra 0s
- while (count-- > 0) {
- *wstr++ = '0';
- }
- // add decimal
- *wstr++ = '.';
- } else {
- *wstr++ = '0';
- *wstr++ = '.';
+ // now do fractional part, as an unsigned number
+ do {
+ --count;
+ *wstr++ = (char)(48 + (frac % 10));
+ } while (frac /= 10);
+ // add extra 0s
+ while (count-- > 0) {
+ *wstr++ = '0';
}
+ // add decimal
+ *wstr++ = '.';
+ } else {
+ *wstr++ = '0';
+ *wstr++ = '.';
+ }
- // Do whole part. Take care of sign
- // conversion. Number is reversed.
- do {
- *wstr++ = (char)(48 + (whole % 10));
- } while (whole /= 10);
+ // Do whole part. Take care of sign
+ // conversion. Number is reversed.
+ do {
+ *wstr++ = (char)(48 + (whole % 10));
+ } while (whole /= 10);
- if (neg) {
- *wstr++ = '-';
- }
- strreverse(str, wstr - 1);
- enc->offset += (wstr - (enc->offset));
+ if (neg) {
+ *wstr++ = '-';
+ }
+ strreverse(str, wstr - 1);
+ enc->offset += (wstr - (enc->offset));
- return TRUE;
+ return TRUE;
}
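
The reordered rounding block above is easier to follow with a concrete case. A small self-contained walk-through, assuming value = 0.99 and doublePrecision = 1 (names mirror the encoder's locals; padding of multi-digit fractions is omitted):

    #include <stdio.h>

    int main(void) {
      const double value = 0.99;
      const double pow10 = 10.0;                              // 10^doublePrecision
      unsigned long long whole = (unsigned long long)value;   // 0
      double tmp = (value - whole) * pow10;                    // ~9.9
      unsigned long long frac = (unsigned long long)tmp;       // 9
      double diff = tmp - frac;                                // ~0.9

      if (diff > 0.5) {
        ++frac;                                                // 10
      }
      if (frac >= pow10) {   // rollover: 0.99 at precision 1 becomes "1.0"
        frac = 0;
        ++whole;
      }
      printf("%llu.%llu\n", whole, frac);                      // prints 1.0
      return 0;
    }
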
/*
@@ -917,291 +919,287 @@ Perhaps implement recursion detection */
void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name,
size_t cbName) {
- const char *value;
- char *objName;
- int count;
- JSOBJ iterObj;
- size_t szlen;
- JSONTypeContext tc;
- tc.encoder = enc;
-
- if (enc->level > enc->recursionMax) {
- SetError(obj, enc, "Maximum recursion level reached");
- return;
- }
+ const char *value;
+ char *objName;
+ int count;
+ JSOBJ iterObj;
+ size_t szlen;
+ JSONTypeContext tc;
+ tc.encoder = enc;
+
+ if (enc->level > enc->recursionMax) {
+ SetError(obj, enc, "Maximum recursion level reached");
+ return;
+ }
- /*
- This reservation must hold
+ /*
+ This reservation must hold
- length of _name as encoded worst case +
- maxLength of double to string OR maxLength of JSLONG to string
- */
+ length of _name as encoded worst case +
+ maxLength of double to string OR maxLength of JSLONG to string
+ */
- Buffer_Reserve(enc, 256 + RESERVE_STRING(cbName));
- if (enc->errorMsg) {
- return;
- }
+ Buffer_Reserve(enc, 256 + RESERVE_STRING(cbName));
+ if (enc->errorMsg) {
+ return;
+ }
- if (name) {
- Buffer_AppendCharUnchecked(enc, '\"');
+ if (name) {
+ Buffer_AppendCharUnchecked(enc, '\"');
- if (enc->forceASCII) {
- if (!Buffer_EscapeStringValidated(obj, enc, name, name + cbName)) {
- return;
- }
- } else {
- if (!Buffer_EscapeStringUnvalidated(enc, name, name + cbName)) {
- return;
- }
- }
+ if (enc->forceASCII) {
+ if (!Buffer_EscapeStringValidated(obj, enc, name, name + cbName)) {
+ return;
+ }
+ } else {
+ if (!Buffer_EscapeStringUnvalidated(enc, name, name + cbName)) {
+ return;
+ }
+ }
- Buffer_AppendCharUnchecked(enc, '\"');
+ Buffer_AppendCharUnchecked(enc, '\"');
- Buffer_AppendCharUnchecked(enc, ':');
+ Buffer_AppendCharUnchecked(enc, ':');
#ifndef JSON_NO_EXTRA_WHITESPACE
- Buffer_AppendCharUnchecked(enc, ' ');
+ Buffer_AppendCharUnchecked(enc, ' ');
#endif
- }
+ }
- enc->beginTypeContext(obj, &tc);
+ enc->beginTypeContext(obj, &tc);
- switch (tc.type) {
- case JT_INVALID: {
- return;
- }
+ switch (tc.type) {
+ case JT_INVALID: {
+ return;
+ }
- case JT_ARRAY: {
- count = 0;
- enc->iterBegin(obj, &tc);
+ case JT_ARRAY: {
+ count = 0;
+ enc->iterBegin(obj, &tc);
- Buffer_AppendCharUnchecked(enc, '[');
- Buffer_AppendIndentNewlineUnchecked(enc);
+ Buffer_AppendCharUnchecked(enc, '[');
+ Buffer_AppendIndentNewlineUnchecked(enc);
- while (enc->iterNext(obj, &tc)) {
- if (count > 0) {
- Buffer_AppendCharUnchecked(enc, ',');
+ while (enc->iterNext(obj, &tc)) {
+ if (count > 0) {
+ Buffer_AppendCharUnchecked(enc, ',');
#ifndef JSON_NO_EXTRA_WHITESPACE
- Buffer_AppendCharUnchecked(buffer, ' ');
+ Buffer_AppendCharUnchecked(buffer, ' ');
#endif
- Buffer_AppendIndentNewlineUnchecked(enc);
- }
-
- iterObj = enc->iterGetValue(obj, &tc);
-
- enc->level++;
- Buffer_AppendIndentUnchecked(enc, enc->level);
- encode(iterObj, enc, NULL, 0);
- count++;
- }
-
- enc->iterEnd(obj, &tc);
- Buffer_AppendIndentNewlineUnchecked(enc);
- Buffer_AppendIndentUnchecked(enc, enc->level);
- Buffer_AppendCharUnchecked(enc, ']');
- break;
- }
-
- case JT_OBJECT: {
- count = 0;
- enc->iterBegin(obj, &tc);
-
- Buffer_AppendCharUnchecked(enc, '{');
- Buffer_AppendIndentNewlineUnchecked(enc);
-
- while (enc->iterNext(obj, &tc)) {
- if (count > 0) {
- Buffer_AppendCharUnchecked(enc, ',');
+ Buffer_AppendIndentNewlineUnchecked(enc);
+ }
+
+ iterObj = enc->iterGetValue(obj, &tc);
+
+ enc->level++;
+ Buffer_AppendIndentUnchecked(enc, enc->level);
+ encode(iterObj, enc, NULL, 0);
+ count++;
+ }
+
+ enc->iterEnd(obj, &tc);
+ Buffer_AppendIndentNewlineUnchecked(enc);
+ Buffer_AppendIndentUnchecked(enc, enc->level);
+ Buffer_AppendCharUnchecked(enc, ']');
+ break;
+ }
+
+ case JT_OBJECT: {
+ count = 0;
+ enc->iterBegin(obj, &tc);
+
+ Buffer_AppendCharUnchecked(enc, '{');
+ Buffer_AppendIndentNewlineUnchecked(enc);
+
+ while (enc->iterNext(obj, &tc)) {
+ if (count > 0) {
+ Buffer_AppendCharUnchecked(enc, ',');
#ifndef JSON_NO_EXTRA_WHITESPACE
- Buffer_AppendCharUnchecked(enc, ' ');
+ Buffer_AppendCharUnchecked(enc, ' ');
#endif
- Buffer_AppendIndentNewlineUnchecked(enc);
- }
-
- iterObj = enc->iterGetValue(obj, &tc);
- objName = enc->iterGetName(obj, &tc, &szlen);
-
- enc->level++;
- Buffer_AppendIndentUnchecked(enc, enc->level);
- encode(iterObj, enc, objName, szlen);
- count++;
- }
-
- enc->iterEnd(obj, &tc);
- Buffer_AppendIndentNewlineUnchecked(enc);
- Buffer_AppendIndentUnchecked(enc, enc->level);
- Buffer_AppendCharUnchecked(enc, '}');
- break;
- }
-
- case JT_LONG: {
- Buffer_AppendLongUnchecked(enc, enc->getLongValue(obj, &tc));
- break;
- }
-
- case JT_INT: {
- Buffer_AppendIntUnchecked(enc, enc->getIntValue(obj, &tc));
- break;
- }
-
- case JT_TRUE: {
- Buffer_AppendCharUnchecked(enc, 't');
- Buffer_AppendCharUnchecked(enc, 'r');
- Buffer_AppendCharUnchecked(enc, 'u');
- Buffer_AppendCharUnchecked(enc, 'e');
- break;
- }
-
- case JT_FALSE: {
- Buffer_AppendCharUnchecked(enc, 'f');
- Buffer_AppendCharUnchecked(enc, 'a');
- Buffer_AppendCharUnchecked(enc, 'l');
- Buffer_AppendCharUnchecked(enc, 's');
- Buffer_AppendCharUnchecked(enc, 'e');
- break;
- }
-
- case JT_NULL: {
- Buffer_AppendCharUnchecked(enc, 'n');
- Buffer_AppendCharUnchecked(enc, 'u');
- Buffer_AppendCharUnchecked(enc, 'l');
- Buffer_AppendCharUnchecked(enc, 'l');
- break;
- }
-
- case JT_DOUBLE: {
- if (!Buffer_AppendDoubleUnchecked(obj, enc,
- enc->getDoubleValue(obj, &tc))) {
- enc->endTypeContext(obj, &tc);
- enc->level--;
- return;
- }
- break;
- }
-
- case JT_UTF8: {
- value = enc->getStringValue(obj, &tc, &szlen);
- if (enc->errorMsg) {
- enc->endTypeContext(obj, &tc);
- return;
- }
- Buffer_Reserve(enc, RESERVE_STRING(szlen));
- Buffer_AppendCharUnchecked(enc, '\"');
-
- if (enc->forceASCII) {
- if (!Buffer_EscapeStringValidated(obj, enc, value,
- value + szlen)) {
- enc->endTypeContext(obj, &tc);
- enc->level--;
- return;
- }
- } else {
- if (!Buffer_EscapeStringUnvalidated(enc, value,
- value + szlen)) {
- enc->endTypeContext(obj, &tc);
- enc->level--;
- return;
- }
- }
-
- Buffer_AppendCharUnchecked(enc, '\"');
- break;
- }
-
- case JT_BIGNUM: {
- value = enc->getBigNumStringValue(obj, &tc, &szlen);
-
- Buffer_Reserve(enc, RESERVE_STRING(szlen));
- if (enc->errorMsg) {
- enc->endTypeContext(obj, &tc);
- return;
- }
-
- if (enc->forceASCII) {
- if (!Buffer_EscapeStringValidated(obj, enc, value,
- value + szlen)) {
- enc->endTypeContext(obj, &tc);
- enc->level--;
- return;
- }
- } else {
- if (!Buffer_EscapeStringUnvalidated(enc, value,
- value + szlen)) {
- enc->endTypeContext(obj, &tc);
- enc->level--;
- return;
- }
- }
-
- break;
- }
+ Buffer_AppendIndentNewlineUnchecked(enc);
+ }
+
+ iterObj = enc->iterGetValue(obj, &tc);
+ objName = enc->iterGetName(obj, &tc, &szlen);
+
+ enc->level++;
+ Buffer_AppendIndentUnchecked(enc, enc->level);
+ encode(iterObj, enc, objName, szlen);
+ count++;
}
- enc->endTypeContext(obj, &tc);
- enc->level--;
-}
+ enc->iterEnd(obj, &tc);
+ Buffer_AppendIndentNewlineUnchecked(enc);
+ Buffer_AppendIndentUnchecked(enc, enc->level);
+ Buffer_AppendCharUnchecked(enc, '}');
+ break;
+ }
-char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer,
- size_t _cbBuffer) {
- char *locale;
- enc->malloc = enc->malloc ? enc->malloc : malloc;
- enc->free = enc->free ? enc->free : free;
- enc->realloc = enc->realloc ? enc->realloc : realloc;
- enc->errorMsg = NULL;
- enc->errorObj = NULL;
- enc->level = 0;
-
- if (enc->recursionMax < 1) {
- enc->recursionMax = JSON_MAX_RECURSION_DEPTH;
+ case JT_LONG: {
+ Buffer_AppendLongUnchecked(enc, enc->getLongValue(obj, &tc));
+ break;
+ }
+
+ case JT_INT: {
+ Buffer_AppendIntUnchecked(enc, enc->getIntValue(obj, &tc));
+ break;
+ }
+
+ case JT_TRUE: {
+ Buffer_AppendCharUnchecked(enc, 't');
+ Buffer_AppendCharUnchecked(enc, 'r');
+ Buffer_AppendCharUnchecked(enc, 'u');
+ Buffer_AppendCharUnchecked(enc, 'e');
+ break;
+ }
+
+ case JT_FALSE: {
+ Buffer_AppendCharUnchecked(enc, 'f');
+ Buffer_AppendCharUnchecked(enc, 'a');
+ Buffer_AppendCharUnchecked(enc, 'l');
+ Buffer_AppendCharUnchecked(enc, 's');
+ Buffer_AppendCharUnchecked(enc, 'e');
+ break;
+ }
+
+ case JT_NULL: {
+ Buffer_AppendCharUnchecked(enc, 'n');
+ Buffer_AppendCharUnchecked(enc, 'u');
+ Buffer_AppendCharUnchecked(enc, 'l');
+ Buffer_AppendCharUnchecked(enc, 'l');
+ break;
+ }
+
+ case JT_DOUBLE: {
+ if (!Buffer_AppendDoubleUnchecked(obj, enc,
+ enc->getDoubleValue(obj, &tc))) {
+ enc->endTypeContext(obj, &tc);
+ enc->level--;
+ return;
}
+ break;
+ }
- if (enc->doublePrecision < 0 ||
- enc->doublePrecision > JSON_DOUBLE_MAX_DECIMALS) {
- enc->doublePrecision = JSON_DOUBLE_MAX_DECIMALS;
+ case JT_UTF8: {
+ value = enc->getStringValue(obj, &tc, &szlen);
+ if (enc->errorMsg) {
+ enc->endTypeContext(obj, &tc);
+ return;
}
+ Buffer_Reserve(enc, RESERVE_STRING(szlen));
+ Buffer_AppendCharUnchecked(enc, '\"');
- if (_buffer == NULL) {
- _cbBuffer = 32768;
- enc->start = (char *)enc->malloc(_cbBuffer);
- if (!enc->start) {
- SetError(obj, enc, "Could not reserve memory block");
- return NULL;
- }
- enc->heap = 1;
+ if (enc->forceASCII) {
+ if (!Buffer_EscapeStringValidated(obj, enc, value, value + szlen)) {
+ enc->endTypeContext(obj, &tc);
+ enc->level--;
+ return;
+ }
} else {
- enc->start = _buffer;
- enc->heap = 0;
+ if (!Buffer_EscapeStringUnvalidated(enc, value, value + szlen)) {
+ enc->endTypeContext(obj, &tc);
+ enc->level--;
+ return;
+ }
}
- enc->end = enc->start + _cbBuffer;
- enc->offset = enc->start;
+ Buffer_AppendCharUnchecked(enc, '\"');
+ break;
+ }
- locale = setlocale(LC_NUMERIC, NULL);
- if (!locale) {
- SetError(NULL, enc, "setlocale call failed");
- return NULL;
+ case JT_BIGNUM: {
+ value = enc->getBigNumStringValue(obj, &tc, &szlen);
+
+ Buffer_Reserve(enc, RESERVE_STRING(szlen));
+ if (enc->errorMsg) {
+ enc->endTypeContext(obj, &tc);
+ return;
}
- if (strcmp(locale, "C")) {
- size_t len = strlen(locale) + 1;
- char *saved_locale = malloc(len);
- if (saved_locale == NULL) {
- SetError(NULL, enc, "Could not reserve memory block");
- return NULL;
- }
- memcpy(saved_locale, locale, len);
- setlocale(LC_NUMERIC, "C");
- encode(obj, enc, NULL, 0);
- setlocale(LC_NUMERIC, saved_locale);
- free(saved_locale);
+ if (enc->forceASCII) {
+ if (!Buffer_EscapeStringValidated(obj, enc, value, value + szlen)) {
+ enc->endTypeContext(obj, &tc);
+ enc->level--;
+ return;
+ }
} else {
- encode(obj, enc, NULL, 0);
+ if (!Buffer_EscapeStringUnvalidated(enc, value, value + szlen)) {
+ enc->endTypeContext(obj, &tc);
+ enc->level--;
+ return;
+ }
}
- Buffer_Reserve(enc, 1);
- if (enc->errorMsg) {
- return NULL;
+ break;
+ }
+ }
+
+ enc->endTypeContext(obj, &tc);
+ enc->level--;
+}
+
+char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer,
+ size_t _cbBuffer) {
+ char *locale;
+ enc->malloc = enc->malloc ? enc->malloc : malloc;
+ enc->free = enc->free ? enc->free : free;
+ enc->realloc = enc->realloc ? enc->realloc : realloc;
+ enc->errorMsg = NULL;
+ enc->errorObj = NULL;
+ enc->level = 0;
+
+ if (enc->recursionMax < 1) {
+ enc->recursionMax = JSON_MAX_RECURSION_DEPTH;
+ }
+
+ if (enc->doublePrecision < 0 ||
+ enc->doublePrecision > JSON_DOUBLE_MAX_DECIMALS) {
+ enc->doublePrecision = JSON_DOUBLE_MAX_DECIMALS;
+ }
+
+ if (_buffer == NULL) {
+ _cbBuffer = 32768;
+ enc->start = (char *)enc->malloc(_cbBuffer);
+ if (!enc->start) {
+ SetError(obj, enc, "Could not reserve memory block");
+ return NULL;
}
- Buffer_AppendCharUnchecked(enc, '\0');
+ enc->heap = 1;
+ } else {
+ enc->start = _buffer;
+ enc->heap = 0;
+ }
+
+ enc->end = enc->start + _cbBuffer;
+ enc->offset = enc->start;
+
+ locale = setlocale(LC_NUMERIC, NULL);
+ if (!locale) {
+ SetError(NULL, enc, "setlocale call failed");
+ return NULL;
+ }
+
+ if (strcmp(locale, "C")) {
+ size_t len = strlen(locale) + 1;
+ char *saved_locale = malloc(len);
+ if (saved_locale == NULL) {
+ SetError(NULL, enc, "Could not reserve memory block");
+ return NULL;
+ }
+ memcpy(saved_locale, locale, len);
+ setlocale(LC_NUMERIC, "C");
+ encode(obj, enc, NULL, 0);
+ setlocale(LC_NUMERIC, saved_locale);
+ free(saved_locale);
+ } else {
+ encode(obj, enc, NULL, 0);
+ }
+
+ Buffer_Reserve(enc, 1);
+ if (enc->errorMsg) {
+ return NULL;
+ }
+ Buffer_AppendCharUnchecked(enc, '\0');
- return enc->start;
+ return enc->start;
}
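
The locale handling kept in JSON_EncodeObject follows a save/switch/restore pattern: setlocale(LC_NUMERIC, NULL) returns a pointer that later setlocale calls may invalidate, hence the explicit copy before switching to "C". A minimal sketch of the same pattern, with a placeholder do_work() standing in for the encode call:

    #include <locale.h>
    #include <stdlib.h>
    #include <string.h>

    static int with_c_numeric_locale(void (*do_work)(void)) {
      const char *current = setlocale(LC_NUMERIC, NULL);   // query only
      if (!current) {
        return -1;
      }
      if (strcmp(current, "C") == 0) {                     // already "C": nothing to switch
        do_work();
        return 0;
      }
      size_t len = strlen(current) + 1;
      char *saved = malloc(len);                           // copy before it is invalidated
      if (!saved) {
        return -1;
      }
      memcpy(saved, current, len);
      setlocale(LC_NUMERIC, "C");                          // '.' as the decimal point
      do_work();
      setlocale(LC_NUMERIC, saved);                        // restore the caller's locale
      free(saved);
      return 0;
    }
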
diff --git a/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c b/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c
index f4055fcedcfa6..7cc20a52f1849 100644
--- a/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c
+++ b/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c
@@ -16,18 +16,19 @@ modification, are permitted provided that the following conditions are met:
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
https://github.com/client9/stringencoders
-Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved.
+Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights
+reserved.
Numeric decoder derived from TCL library
https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
@@ -35,486 +36,135 @@ Numeric decoder derived from TCL library
* Copyright (c) 1994 Sun Microsystems, Inc.
*/
-#define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY
-#define NO_IMPORT_ARRAY
+// Licence at LICENSES/ULTRAJSON_LICENSE
+
+#include "pandas/vendored/ujson/lib/ultrajson.h"
#define PY_SSIZE_T_CLEAN
#include <Python.h>
-#include <numpy/arrayobject.h>
-#include "pandas/vendored/ujson/lib/ultrajson.h"
-#define PRINTMARK()
-
-typedef struct __PyObjectDecoder {
- JSONObjectDecoder dec;
-
- void *npyarr; // Numpy context buffer
- void *npyarr_addr; // Ref to npyarr ptr to track DECREF calls
- npy_intp curdim; // Current array dimension
-
- PyArray_Descr *dtype;
-} PyObjectDecoder;
-
-typedef struct __NpyArrContext {
- PyObject *ret;
- PyObject *labels[2];
- PyArray_Dims shape;
-
- PyObjectDecoder *dec;
-
- npy_intp i;
- npy_intp elsize;
- npy_intp elcount;
-} NpyArrContext;
-
-// Numpy handling based on numpy internal code, specifically the function
-// PyArray_FromIter.
-
-// numpy related functions are inter-dependent so declare them all here,
-// to ensure the compiler catches any errors
-
-// standard numpy array handling
-JSOBJ Object_npyNewArray(void *prv, void *decoder);
-JSOBJ Object_npyEndArray(void *prv, JSOBJ obj);
-int Object_npyArrayAddItem(void *prv, JSOBJ obj, JSOBJ value);
-
-// for more complex dtypes (object and string) fill a standard Python list
-// and convert to a numpy array when done.
-JSOBJ Object_npyNewArrayList(void *prv, void *decoder);
-JSOBJ Object_npyEndArrayList(void *prv, JSOBJ obj);
-int Object_npyArrayListAddItem(void *prv, JSOBJ obj, JSOBJ value);
-
-// free the numpy context buffer
-void Npy_releaseContext(NpyArrContext *npyarr) {
- PRINTMARK();
- if (npyarr) {
- if (npyarr->shape.ptr) {
- PyObject_Free(npyarr->shape.ptr);
- }
- if (npyarr->dec) {
- npyarr->dec->npyarr = NULL;
- npyarr->dec->curdim = 0;
- }
- Py_XDECREF(npyarr->labels[0]);
- Py_XDECREF(npyarr->labels[1]);
- Py_XDECREF(npyarr->ret);
- PyObject_Free(npyarr);
- }
+static int Object_objectAddKey(void *Py_UNUSED(prv), JSOBJ obj, JSOBJ name,
+ JSOBJ value) {
+ int ret = PyDict_SetItem(obj, name, value);
+ Py_DECREF((PyObject *)name);
+ Py_DECREF((PyObject *)value);
+ return ret == 0 ? 1 : 0;
}
-JSOBJ Object_npyNewArray(void *prv, void *_decoder) {
- NpyArrContext *npyarr;
- PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder;
- PRINTMARK();
- if (decoder->curdim <= 0) {
- // start of array - initialise the context buffer
- npyarr = decoder->npyarr = PyObject_Malloc(sizeof(NpyArrContext));
- decoder->npyarr_addr = npyarr;
-
- if (!npyarr) {
- PyErr_NoMemory();
- return NULL;
- }
-
- npyarr->dec = decoder;
- npyarr->labels[0] = npyarr->labels[1] = NULL;
-
- npyarr->shape.ptr = PyObject_Malloc(sizeof(npy_intp) * NPY_MAXDIMS);
- npyarr->shape.len = 1;
- npyarr->ret = NULL;
-
- npyarr->elsize = 0;
- npyarr->elcount = 4;
- npyarr->i = 0;
- } else {
- // starting a new dimension continue the current array (and reshape
- // after)
- npyarr = (NpyArrContext *)decoder->npyarr;
- if (decoder->curdim >= npyarr->shape.len) {
- npyarr->shape.len++;
- }
- }
-
- npyarr->shape.ptr[decoder->curdim] = 0;
- decoder->curdim++;
- return npyarr;
+static int Object_arrayAddItem(void *Py_UNUSED(prv), JSOBJ obj, JSOBJ value) {
+ int ret = PyList_Append(obj, value);
+ Py_DECREF((PyObject *)value);
+ return ret == 0 ? 1 : 0;
}
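
The Py_DECREF calls in the two callbacks above are needed because PyDict_SetItem and PyList_Append do not steal references: the container takes its own reference, so the decoder must drop the one it was handed. A short sketch of that convention (ujson callbacks return 1 on success, 0 on failure):

    #include <Python.h>

    static int append_and_release(PyObject *list, PyObject *value) {
      int ret = PyList_Append(list, value);  // the list INCREFs value itself
      Py_DECREF(value);                      // drop the reference passed to us
      return ret == 0 ? 1 : 0;
    }
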
-PyObject *Npy_returnLabelled(NpyArrContext *npyarr) {
- PyObject *ret = npyarr->ret;
- npy_intp i;
-
- if (npyarr->labels[0] || npyarr->labels[1]) {
- // finished decoding, build tuple with values and labels
- ret = PyTuple_New(npyarr->shape.len + 1);
- for (i = 0; i < npyarr->shape.len; i++) {
- if (npyarr->labels[i]) {
- PyTuple_SET_ITEM(ret, i + 1, npyarr->labels[i]);
- npyarr->labels[i] = NULL;
- } else {
- Py_INCREF(Py_None);
- PyTuple_SET_ITEM(ret, i + 1, Py_None);
- }
- }
- PyTuple_SET_ITEM(ret, 0, npyarr->ret);
- }
-
- return ret;
-}
-
-JSOBJ Object_npyEndArray(void *prv, JSOBJ obj) {
- PyObject *ret;
- char *new_data;
- NpyArrContext *npyarr = (NpyArrContext *)obj;
- int emptyType = NPY_DEFAULT_TYPE;
- npy_intp i;
- PRINTMARK();
- if (!npyarr) {
- return NULL;
- }
-
- ret = npyarr->ret;
- i = npyarr->i;
-
- npyarr->dec->curdim--;
-
- if (i == 0 || !npyarr->ret) {
- // empty array would not have been initialised so do it now.
- if (npyarr->dec->dtype) {
- emptyType = npyarr->dec->dtype->type_num;
- }
- npyarr->ret = ret =
- PyArray_EMPTY(npyarr->shape.len, npyarr->shape.ptr, emptyType, 0);
- } else if (npyarr->dec->curdim <= 0) {
- // realloc to final size
- new_data = PyDataMem_RENEW(PyArray_DATA(ret), i * npyarr->elsize);
- if (new_data == NULL) {
- PyErr_NoMemory();
- Npy_releaseContext(npyarr);
- return NULL;
- }
- ((PyArrayObject *)ret)->data = (void *)new_data;
- // PyArray_BYTES(ret) = new_data;
- }
-
- if (npyarr->dec->curdim <= 0) {
- // finished decoding array, reshape if necessary
- if (npyarr->shape.len > 1) {
- npyarr->ret = PyArray_Newshape((PyArrayObject *)ret, &npyarr->shape,
- NPY_ANYORDER);
- Py_DECREF(ret);
- }
-
- ret = Npy_returnLabelled(npyarr);
-
- npyarr->ret = NULL;
- Npy_releaseContext(npyarr);
- }
-
- return ret;
+static JSOBJ Object_newString(void *Py_UNUSED(prv), wchar_t *start,
+ wchar_t *end) {
+ return PyUnicode_FromWideChar(start, (end - start));
}
-int Object_npyArrayAddItem(void *prv, JSOBJ obj, JSOBJ value) {
- PyObject *type;
- PyArray_Descr *dtype;
- npy_intp i;
- char *new_data, *item;
- NpyArrContext *npyarr = (NpyArrContext *)obj;
- PRINTMARK();
- if (!npyarr) {
- return 0;
- }
-
- i = npyarr->i;
-
- npyarr->shape.ptr[npyarr->dec->curdim - 1]++;
-
- if (PyArray_Check((PyObject *)value)) {
- // multidimensional array, keep decoding values.
- return 1;
- }
-
- if (!npyarr->ret) {
- // Array not initialised yet.
- // We do it here so we can 'sniff' the data type if none was provided
- if (!npyarr->dec->dtype) {
- type = PyObject_Type(value);
- if (!PyArray_DescrConverter(type, &dtype)) {
- Py_DECREF(type);
- goto fail;
- }
- Py_INCREF(dtype);
- Py_DECREF(type);
- } else {
- dtype = PyArray_DescrNew(npyarr->dec->dtype);
- }
-
- // If it's an object or string then fill a Python list and subsequently
- // convert. Otherwise we would need to somehow mess about with
- // reference counts when renewing memory.
- npyarr->elsize = dtype->elsize;
- if (PyDataType_REFCHK(dtype) || npyarr->elsize == 0) {
- Py_XDECREF(dtype);
-
- if (npyarr->dec->curdim > 1) {
- PyErr_SetString(PyExc_ValueError,
- "Cannot decode multidimensional arrays with "
- "variable length elements to numpy");
- goto fail;
- }
- npyarr->elcount = 0;
- npyarr->ret = PyList_New(0);
- if (!npyarr->ret) {
- goto fail;
- }
- ((JSONObjectDecoder *)npyarr->dec)->newArray =
- Object_npyNewArrayList;
- ((JSONObjectDecoder *)npyarr->dec)->arrayAddItem =
- Object_npyArrayListAddItem;
- ((JSONObjectDecoder *)npyarr->dec)->endArray =
- Object_npyEndArrayList;
- return Object_npyArrayListAddItem(prv, obj, value);
- }
-
- npyarr->ret = PyArray_NewFromDescr(
- &PyArray_Type, dtype, 1, &npyarr->elcount, NULL, NULL, 0, NULL);
-
- if (!npyarr->ret) {
- goto fail;
- }
- }
-
- if (i >= npyarr->elcount) {
- // Grow PyArray_DATA(ret):
- // this is similar for the strategy for PyListObject, but we use
- // 50% overallocation => 0, 4, 8, 14, 23, 36, 56, 86 ...
- if (npyarr->elsize == 0) {
- PyErr_SetString(PyExc_ValueError,
- "Cannot decode multidimensional arrays with "
- "variable length elements to numpy");
- goto fail;
- }
-
- npyarr->elcount = (i >> 1) + (i < 4 ? 4 : 2) + i;
- if (npyarr->elcount <= NPY_MAX_INTP / npyarr->elsize) {
- new_data = PyDataMem_RENEW(PyArray_DATA(npyarr->ret),
- npyarr->elcount * npyarr->elsize);
- } else {
- PyErr_NoMemory();
- goto fail;
- }
- ((PyArrayObject *)npyarr->ret)->data = (void *)new_data;
-
- // PyArray_BYTES(npyarr->ret) = new_data;
- }
-
- PyArray_DIMS(npyarr->ret)[0] = i + 1;
-
- if ((item = PyArray_GETPTR1(npyarr->ret, i)) == NULL ||
- PyArray_SETITEM(npyarr->ret, item, value) == -1) {
- goto fail;
- }
+static JSOBJ Object_newTrue(void *Py_UNUSED(prv)) { Py_RETURN_TRUE; }
- Py_DECREF((PyObject *)value);
- npyarr->i++;
- return 1;
+static JSOBJ Object_newFalse(void *Py_UNUSED(prv)) { Py_RETURN_FALSE; }
-fail:
+static JSOBJ Object_newNull(void *Py_UNUSED(prv)) { Py_RETURN_NONE; }
- Npy_releaseContext(npyarr);
- return 0;
+static JSOBJ Object_newPosInf(void *Py_UNUSED(prv)) {
+ return PyFloat_FromDouble(Py_HUGE_VAL);
}
-JSOBJ Object_npyNewArrayList(void *prv, void *_decoder) {
- PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder;
- PRINTMARK();
- PyErr_SetString(
- PyExc_ValueError,
- "nesting not supported for object or variable length dtypes");
- Npy_releaseContext(decoder->npyarr);
- return NULL;
+static JSOBJ Object_newNegInf(void *Py_UNUSED(prv)) {
+ return PyFloat_FromDouble(-Py_HUGE_VAL);
}
-JSOBJ Object_npyEndArrayList(void *prv, JSOBJ obj) {
- PyObject *list, *ret;
- NpyArrContext *npyarr = (NpyArrContext *)obj;
- PRINTMARK();
- if (!npyarr) {
- return NULL;
- }
-
- // convert decoded list to numpy array
- list = (PyObject *)npyarr->ret;
- npyarr->ret = PyArray_FROM_O(list);
-
- ret = Npy_returnLabelled(npyarr);
- npyarr->ret = list;
-
- ((JSONObjectDecoder *)npyarr->dec)->newArray = Object_npyNewArray;
- ((JSONObjectDecoder *)npyarr->dec)->arrayAddItem = Object_npyArrayAddItem;
- ((JSONObjectDecoder *)npyarr->dec)->endArray = Object_npyEndArray;
- Npy_releaseContext(npyarr);
- return ret;
+static JSOBJ Object_newObject(void *Py_UNUSED(prv), void *Py_UNUSED(decoder)) {
+ return PyDict_New();
}
-int Object_npyArrayListAddItem(void *prv, JSOBJ obj, JSOBJ value) {
- NpyArrContext *npyarr = (NpyArrContext *)obj;
- PRINTMARK();
- if (!npyarr) {
- return 0;
- }
- PyList_Append((PyObject *)npyarr->ret, value);
- Py_DECREF((PyObject *)value);
- npyarr->elcount++;
- return 1;
-}
+static JSOBJ Object_endObject(void *Py_UNUSED(prv), JSOBJ obj) { return obj; }
-int Object_objectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) {
- int ret = PyDict_SetItem(obj, name, value);
- Py_DECREF((PyObject *)name);
- Py_DECREF((PyObject *)value);
- return ret == 0 ? 1 : 0;
+static JSOBJ Object_newArray(void *Py_UNUSED(prv), void *Py_UNUSED(decoder)) {
+ return PyList_New(0);
}
-int Object_arrayAddItem(void *prv, JSOBJ obj, JSOBJ value) {
- int ret = PyList_Append(obj, value);
- Py_DECREF((PyObject *)value);
- return ret == 0 ? 1 : 0;
-}
+static JSOBJ Object_endArray(void *Py_UNUSED(prv), JSOBJ obj) { return obj; }
-JSOBJ Object_newString(void *prv, wchar_t *start, wchar_t *end) {
- return PyUnicode_FromWideChar(start, (end - start));
+static JSOBJ Object_newInteger(void *Py_UNUSED(prv), JSINT32 value) {
+ return PyLong_FromLong(value);
}
-JSOBJ Object_newTrue(void *prv) { Py_RETURN_TRUE; }
-
-JSOBJ Object_newFalse(void *prv) { Py_RETURN_FALSE; }
-
-JSOBJ Object_newNull(void *prv) { Py_RETURN_NONE; }
-
-JSOBJ Object_newPosInf(void *prv) { return PyFloat_FromDouble(Py_HUGE_VAL); }
-
-JSOBJ Object_newNegInf(void *prv) { return PyFloat_FromDouble(-Py_HUGE_VAL); }
-
-JSOBJ Object_newObject(void *prv, void *decoder) { return PyDict_New(); }
-
-JSOBJ Object_endObject(void *prv, JSOBJ obj) { return obj; }
-
-JSOBJ Object_newArray(void *prv, void *decoder) { return PyList_New(0); }
-
-JSOBJ Object_endArray(void *prv, JSOBJ obj) { return obj; }
-
-JSOBJ Object_newInteger(void *prv, JSINT32 value) {
- return PyLong_FromLong((long)value);
+static JSOBJ Object_newLong(void *Py_UNUSED(prv), JSINT64 value) {
+ return PyLong_FromLongLong(value);
}
-JSOBJ Object_newLong(void *prv, JSINT64 value) {
- return PyLong_FromLongLong(value);
+static JSOBJ Object_newUnsignedLong(void *Py_UNUSED(prv), JSUINT64 value) {
+ return PyLong_FromUnsignedLongLong(value);
}
-JSOBJ Object_newUnsignedLong(void *prv, JSUINT64 value) {
- return PyLong_FromUnsignedLongLong(value);
+static JSOBJ Object_newDouble(void *Py_UNUSED(prv), double value) {
+ return PyFloat_FromDouble(value);
}
-JSOBJ Object_newDouble(void *prv, double value) {
- return PyFloat_FromDouble(value);
+static void Object_releaseObject(void *Py_UNUSED(prv), JSOBJ obj,
+ void *Py_UNUSED(decoder)) {
+ Py_XDECREF(((PyObject *)obj));
}
-static void Object_releaseObject(void *prv, JSOBJ obj, void *_decoder) {
- PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder;
- if (obj != decoder->npyarr_addr) {
- Py_XDECREF(((PyObject *)obj));
- }
-}
+PyObject *JSONToObj(PyObject *Py_UNUSED(self), PyObject *args,
+ PyObject *kwargs) {
+ JSONObjectDecoder dec = {.newString = Object_newString,
+ .objectAddKey = Object_objectAddKey,
+ .arrayAddItem = Object_arrayAddItem,
+ .newTrue = Object_newTrue,
+ .newFalse = Object_newFalse,
+ .newNull = Object_newNull,
+ .newPosInf = Object_newPosInf,
+ .newNegInf = Object_newNegInf,
+ .newObject = Object_newObject,
+ .endObject = Object_endObject,
+ .newArray = Object_newArray,
+ .endArray = Object_endArray,
+ .newInt = Object_newInteger,
+ .newLong = Object_newLong,
+ .newUnsignedLong = Object_newUnsignedLong,
+ .newDouble = Object_newDouble,
+ .releaseObject = Object_releaseObject,
+ .malloc = PyObject_Malloc,
+ .free = PyObject_Free,
+ .realloc = PyObject_Realloc,
+ .errorStr = NULL,
+ .errorOffset = NULL,
+ .preciseFloat = 0,
+ .prv = NULL};
-static char *g_kwlist[] = {"obj", "precise_float",
- "labelled", "dtype", NULL};
-
-PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs) {
- PyObject *ret;
- PyObject *sarg;
- PyObject *arg;
- PyObject *opreciseFloat = NULL;
- JSONObjectDecoder *decoder;
- PyObjectDecoder pyDecoder;
- PyArray_Descr *dtype = NULL;
- int labelled = 0;
-
- JSONObjectDecoder dec = {
- Object_newString, Object_objectAddKey, Object_arrayAddItem,
- Object_newTrue, Object_newFalse, Object_newNull,
- Object_newPosInf, Object_newNegInf, Object_newObject,
- Object_endObject, Object_newArray, Object_endArray,
- Object_newInteger, Object_newLong, Object_newUnsignedLong,
- Object_newDouble,
- Object_releaseObject, PyObject_Malloc, PyObject_Free,
- PyObject_Realloc};
-
- dec.preciseFloat = 0;
- dec.prv = NULL;
-
- pyDecoder.dec = dec;
- pyDecoder.curdim = 0;
- pyDecoder.npyarr = NULL;
- pyDecoder.npyarr_addr = NULL;
-
- decoder = (JSONObjectDecoder *)&pyDecoder;
-
- if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiiO&", g_kwlist, &arg,
- &opreciseFloat, &labelled,
- PyArray_DescrConverter2, &dtype)) {
- Npy_releaseContext(pyDecoder.npyarr);
- return NULL;
- }
+ char *kwlist[] = {"obj", "precise_float", NULL};
+ char *buf;
+ Py_ssize_t len;
+ if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|b", kwlist, &buf, &len,
+ &dec.preciseFloat)) {
+ return NULL;
+ }
- if (opreciseFloat && PyObject_IsTrue(opreciseFloat)) {
- decoder->preciseFloat = 1;
- }
+ PyObject *ret = JSON_DecodeObject(&dec, buf, len);
- if (PyBytes_Check(arg)) {
- sarg = arg;
- } else if (PyUnicode_Check(arg)) {
- sarg = PyUnicode_AsUTF8String(arg);
- if (sarg == NULL) {
- // Exception raised above us by codec according to docs
- return NULL;
- }
- } else {
- PyErr_Format(PyExc_TypeError, "Expected 'str' or 'bytes'");
- return NULL;
+ if (PyErr_Occurred()) {
+ if (ret) {
+ Py_DECREF((PyObject *)ret);
}
+ return NULL;
+ }
- decoder->errorStr = NULL;
- decoder->errorOffset = NULL;
-
- ret = JSON_DecodeObject(decoder, PyBytes_AS_STRING(sarg),
- PyBytes_GET_SIZE(sarg));
+ if (dec.errorStr) {
+ /*
+ FIXME: It's possible to give a much nicer error message here with actual
+ failing element in input etc*/
- if (sarg != arg) {
- Py_DECREF(sarg);
- }
+ PyErr_Format(PyExc_ValueError, "%s", dec.errorStr);
- if (PyErr_Occurred()) {
- if (ret) {
- Py_DECREF((PyObject *)ret);
- }
- Npy_releaseContext(pyDecoder.npyarr);
- return NULL;
+ if (ret) {
+ Py_DECREF((PyObject *)ret);
}
- if (decoder->errorStr) {
- /*
- FIXME: It's possible to give a much nicer error message here with actual
- failing element in input etc*/
-
- PyErr_Format(PyExc_ValueError, "%s", decoder->errorStr);
-
- if (ret) {
- Py_DECREF((PyObject *)ret);
- }
- Npy_releaseContext(pyDecoder.npyarr);
-
- return NULL;
- }
+ return NULL;
+ }
- return ret;
+ return ret;
}
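
One detail of the rewritten JSONToObj worth noting: the JSONObjectDecoder is now filled with C99 designated initializers, so any member not named in the initializer list is zero-initialized rather than holding garbage, which is what makes the explicit .errorStr = NULL / .errorOffset = NULL style safe. A tiny illustration using a hypothetical callback struct, not the real decoder:

    #include <assert.h>

    struct callbacks {
      int (*on_item)(int);
      void *(*alloc)(unsigned long);
      int flags;
    };

    static int count_item(int x) { return x + 1; }

    int main(void) {
      struct callbacks cb = {.on_item = count_item};
      assert(cb.alloc == 0 && cb.flags == 0);  // unnamed members are zero-initialized
      return 0;
    }
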
diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c
index 1fa82215179a8..8bba95dd456de 100644
--- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c
+++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c
@@ -36,19 +36,20 @@ Numeric decoder derived from TCL library
* Copyright (c) 1994 Sun Microsystems, Inc.
*/
+// Licence at LICENSES/ULTRAJSON_LICENSE
+
#define PY_SSIZE_T_CLEAN
#include <Python.h>
-#include <math.h>
#define NO_IMPORT_ARRAY
#define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY
+#include "datetime.h"
+#include "pandas/datetime/pd_datetime.h"
+#include "pandas/vendored/ujson/lib/ultrajson.h"
#include <numpy/arrayobject.h>
#include <numpy/arrayscalars.h>
#include <numpy/ndarraytypes.h>
#include <numpy/npy_math.h>
-#include "pandas/vendored/ujson/lib/ultrajson.h"
-#include "datetime.h"
-#include "pandas/datetime/pd_datetime.h"
npy_int64 get_nat(void) { return NPY_MIN_INT64; }
@@ -63,344 +64,314 @@ int object_is_nat_type(PyObject *obj);
int object_is_na_type(PyObject *obj);
typedef struct __NpyArrContext {
- PyObject *array;
- char *dataptr;
- int curdim; // current dimension in array's order
- int stridedim; // dimension we are striding over
- int inc; // stride dimension increment (+/- 1)
- npy_intp dim;
- npy_intp stride;
- npy_intp ndim;
- npy_intp index[NPY_MAXDIMS];
- int type_num;
- PyArray_GetItemFunc *getitem;
-
- char **rowLabels;
- char **columnLabels;
+ PyObject *array;
+ char *dataptr;
+ npy_intp curdim; // current dimension in array's order
+ npy_intp stridedim; // dimension we are striding over
+ int inc; // stride dimension increment (+/- 1)
+ npy_intp dim;
+ npy_intp stride;
+ npy_intp ndim;
+ npy_intp index[NPY_MAXDIMS];
+ int type_num;
+ PyArray_GetItemFunc *getitem;
+
+ char **rowLabels;
+ char **columnLabels;
} NpyArrContext;
typedef struct __PdBlockContext {
- int colIdx;
- int ncols;
- int transpose;
+ Py_ssize_t colIdx;
+ Py_ssize_t ncols;
+ int transpose;
- NpyArrContext **npyCtxts; // NpyArrContext for each column
+ NpyArrContext **npyCtxts; // NpyArrContext for each column
} PdBlockContext;
typedef struct __TypeContext {
- JSPFN_ITERBEGIN iterBegin;
- JSPFN_ITEREND iterEnd;
- JSPFN_ITERNEXT iterNext;
- JSPFN_ITERGETNAME iterGetName;
- JSPFN_ITERGETVALUE iterGetValue;
- PFN_PyTypeToUTF8 PyTypeToUTF8;
- PyObject *newObj;
- PyObject *dictObj;
- Py_ssize_t index;
- Py_ssize_t size;
- PyObject *itemValue;
- PyObject *itemName;
- PyObject *attrList;
- PyObject *iterator;
-
- double doubleValue;
- JSINT64 longValue;
-
- char *cStr;
- NpyArrContext *npyarr;
- PdBlockContext *pdblock;
- int transpose;
- char **rowLabels;
- char **columnLabels;
- npy_intp rowLabelsLen;
- npy_intp columnLabelsLen;
+ JSPFN_ITERBEGIN iterBegin;
+ JSPFN_ITEREND iterEnd;
+ JSPFN_ITERNEXT iterNext;
+ JSPFN_ITERGETNAME iterGetName;
+ JSPFN_ITERGETVALUE iterGetValue;
+ PFN_PyTypeToUTF8 PyTypeToUTF8;
+ PyObject *newObj;
+ PyObject *dictObj;
+ Py_ssize_t index;
+ Py_ssize_t size;
+ PyObject *itemValue;
+ PyObject *itemName;
+ PyObject *attrList;
+ PyObject *iterator;
+
+ double doubleValue;
+ JSINT64 longValue;
+
+ char *cStr;
+ NpyArrContext *npyarr;
+ PdBlockContext *pdblock;
+ int transpose;
+ char **rowLabels;
+ char **columnLabels;
+ npy_intp rowLabelsLen;
+ npy_intp columnLabelsLen;
} TypeContext;
typedef struct __PyObjectEncoder {
- JSONObjectEncoder enc;
+ JSONObjectEncoder enc;
- // pass through the NpyArrContext when encoding multi-dimensional arrays
- NpyArrContext *npyCtxtPassthru;
+ // pass through the NpyArrContext when encoding multi-dimensional arrays
+ NpyArrContext *npyCtxtPassthru;
- // pass through the PdBlockContext when encoding blocks
- PdBlockContext *blkCtxtPassthru;
+ // pass through the PdBlockContext when encoding blocks
+ PdBlockContext *blkCtxtPassthru;
- // pass-through to encode numpy data directly
- int npyType;
- void *npyValue;
+ // pass-through to encode numpy data directly
+ int npyType;
+ void *npyValue;
- int datetimeIso;
- NPY_DATETIMEUNIT datetimeUnit;
- NPY_DATETIMEUNIT valueUnit;
+ int datetimeIso;
+ NPY_DATETIMEUNIT datetimeUnit;
+ NPY_DATETIMEUNIT valueUnit;
- // output format style for pandas data types
- int outputFormat;
- int originalOutputFormat;
+ // output format style for pandas data types
+ int outputFormat;
+ int originalOutputFormat;
- PyObject *defaultHandler;
+ PyObject *defaultHandler;
} PyObjectEncoder;
#define GET_TC(__ptrtc) ((TypeContext *)((__ptrtc)->prv))
enum PANDAS_FORMAT { SPLIT, RECORDS, INDEX, COLUMNS, VALUES };
-int PdBlock_iterNext(JSOBJ, JSONTypeContext *);
+static int PdBlock_iterNext(JSOBJ, JSONTypeContext *);
static TypeContext *createTypeContext(void) {
- TypeContext *pc;
-
- pc = PyObject_Malloc(sizeof(TypeContext));
- if (!pc) {
- PyErr_NoMemory();
- return NULL;
- }
- pc->newObj = NULL;
- pc->dictObj = NULL;
- pc->itemValue = NULL;
- pc->itemName = NULL;
- pc->attrList = NULL;
- pc->index = 0;
- pc->size = 0;
- pc->longValue = 0;
- pc->doubleValue = 0.0;
- pc->cStr = NULL;
- pc->npyarr = NULL;
- pc->pdblock = NULL;
- pc->rowLabels = NULL;
- pc->columnLabels = NULL;
- pc->transpose = 0;
- pc->rowLabelsLen = 0;
- pc->columnLabelsLen = 0;
-
- return pc;
+ TypeContext *pc = PyObject_Malloc(sizeof(TypeContext));
+ if (!pc) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+ pc->newObj = NULL;
+ pc->dictObj = NULL;
+ pc->itemValue = NULL;
+ pc->itemName = NULL;
+ pc->attrList = NULL;
+ pc->index = 0;
+ pc->size = 0;
+ pc->longValue = 0;
+ pc->doubleValue = 0.0;
+ pc->cStr = NULL;
+ pc->npyarr = NULL;
+ pc->pdblock = NULL;
+ pc->rowLabels = NULL;
+ pc->columnLabels = NULL;
+ pc->transpose = 0;
+ pc->rowLabelsLen = 0;
+ pc->columnLabelsLen = 0;
+
+ return pc;
}
static PyObject *get_values(PyObject *obj) {
- PyObject *values = NULL;
-
- if (object_is_index_type(obj) || object_is_series_type(obj)) {
- // The special cases to worry about are dt64tz and category[dt64tz].
- // In both cases we want the UTC-localized datetime64 ndarray,
- // without going through and object array of Timestamps.
- if (PyObject_HasAttrString(obj, "tz")) {
- PyObject *tz = PyObject_GetAttrString(obj, "tz");
- if (tz != Py_None) {
- // Go through object array if we have dt64tz, since tz info will
- // be lost if values is used directly.
- Py_DECREF(tz);
- values = PyObject_CallMethod(obj, "__array__", NULL);
- return values;
- }
- Py_DECREF(tz);
- }
- values = PyObject_GetAttrString(obj, "values");
- if (values == NULL) {
- // Clear so we can subsequently try another method
- PyErr_Clear();
- } else if (PyObject_HasAttrString(values, "__array__")) {
- // We may have gotten a Categorical or Sparse array so call np.array
- PyObject *array_values = PyObject_CallMethod(values, "__array__",
- NULL);
- Py_DECREF(values);
- values = array_values;
- } else if (!PyArray_CheckExact(values)) {
- // Didn't get a numpy array, so keep trying
- Py_DECREF(values);
- values = NULL;
- }
- }
-
+ PyObject *values = NULL;
+
+ if (object_is_index_type(obj) || object_is_series_type(obj)) {
+ // The special cases to worry about are dt64tz and category[dt64tz].
+ // In both cases we want the UTC-localized datetime64 ndarray,
+ // without going through an object array of Timestamps.
+ if (PyObject_HasAttrString(obj, "tz")) {
+ PyObject *tz = PyObject_GetAttrString(obj, "tz");
+ if (tz != Py_None) {
+ // Go through object array if we have dt64tz, since tz info will
+ // be lost if values is used directly.
+ Py_DECREF(tz);
+ values = PyObject_CallMethod(obj, "__array__", NULL);
+ return values;
+ }
+ Py_DECREF(tz);
+ }
+ values = PyObject_GetAttrString(obj, "values");
if (values == NULL) {
- PyObject *typeRepr = PyObject_Repr((PyObject *)Py_TYPE(obj));
- PyObject *repr;
- if (PyObject_HasAttrString(obj, "dtype")) {
- PyObject *dtype = PyObject_GetAttrString(obj, "dtype");
- repr = PyObject_Repr(dtype);
- Py_DECREF(dtype);
- } else {
- repr = PyUnicode_FromString("<unknown dtype>");
- }
+ // Clear so we can subsequently try another method
+ PyErr_Clear();
+ } else if (PyObject_HasAttrString(values, "__array__")) {
+ // We may have gotten a Categorical or Sparse array so call np.array
+ PyObject *array_values = PyObject_CallMethod(values, "__array__", NULL);
+ Py_DECREF(values);
+ values = array_values;
+ } else if (!PyArray_CheckExact(values)) {
+ // Didn't get a numpy array, so keep trying
+ Py_DECREF(values);
+ values = NULL;
+ }
+ }
+
+ if (values == NULL) {
+ PyObject *typeRepr = PyObject_Repr((PyObject *)Py_TYPE(obj));
+ PyObject *repr;
+ if (PyObject_HasAttrString(obj, "dtype")) {
+ PyObject *dtype = PyObject_GetAttrString(obj, "dtype");
+ repr = PyObject_Repr(dtype);
+ Py_DECREF(dtype);
+ } else {
+ repr = PyUnicode_FromString("<unknown dtype>");
+ }
- PyErr_Format(PyExc_ValueError, "%R or %R are not JSON serializable yet",
- repr, typeRepr);
- Py_DECREF(repr);
- Py_DECREF(typeRepr);
+ PyErr_Format(PyExc_ValueError, "%R or %R are not JSON serializable yet",
+ repr, typeRepr);
+ Py_DECREF(repr);
+ Py_DECREF(typeRepr);
- return NULL;
- }
+ return NULL;
+ }
- return values;
+ return values;
}
static PyObject *get_sub_attr(PyObject *obj, char *attr, char *subAttr) {
- PyObject *tmp = PyObject_GetAttrString(obj, attr);
- PyObject *ret;
-
- if (tmp == 0) {
- return 0;
- }
- ret = PyObject_GetAttrString(tmp, subAttr);
- Py_DECREF(tmp);
+ PyObject *tmp = PyObject_GetAttrString(obj, attr);
+ if (tmp == 0) {
+ return 0;
+ }
+ PyObject *ret = PyObject_GetAttrString(tmp, subAttr);
+ Py_DECREF(tmp);
- return ret;
+ return ret;
}
static Py_ssize_t get_attr_length(PyObject *obj, char *attr) {
- PyObject *tmp = PyObject_GetAttrString(obj, attr);
- Py_ssize_t ret;
-
- if (tmp == 0) {
- return 0;
- }
- ret = PyObject_Length(tmp);
- Py_DECREF(tmp);
-
- if (ret == -1) {
- return 0;
- }
-
- return ret;
-}
+ PyObject *tmp = PyObject_GetAttrString(obj, attr);
+ if (tmp == 0) {
+ return 0;
+ }
+ Py_ssize_t ret = PyObject_Length(tmp);
+ Py_DECREF(tmp);
-static int is_simple_frame(PyObject *obj) {
- PyObject *mgr = PyObject_GetAttrString(obj, "_mgr");
- if (!mgr) {
- return 0;
- }
- int ret;
- if (PyObject_HasAttrString(mgr, "blocks")) {
- ret = (get_attr_length(mgr, "blocks") <= 1);
- } else {
- ret = 0;
- }
+ if (ret == -1) {
+ return 0;
+ }
- Py_DECREF(mgr);
- return ret;
+ return ret;
}
static npy_int64 get_long_attr(PyObject *o, const char *attr) {
- // NB we are implicitly assuming that o is a Timedelta or Timestamp, or NaT
+ // NB we are implicitly assuming that o is a Timedelta or Timestamp, or NaT
- npy_int64 long_val;
- PyObject *value = PyObject_GetAttrString(o, attr);
- long_val =
- (PyLong_Check(value) ? PyLong_AsLongLong(value) : PyLong_AsLong(value));
+ PyObject *value = PyObject_GetAttrString(o, attr);
+ const npy_int64 long_val =
+ (PyLong_Check(value) ? PyLong_AsLongLong(value) : PyLong_AsLong(value));
- Py_DECREF(value);
+ Py_DECREF(value);
- if (object_is_nat_type(o)) {
- // i.e. o is NaT, long_val will be NPY_MIN_INT64
- return long_val;
- }
-
- // ensure we are in nanoseconds, similar to Timestamp._as_creso or _as_unit
- PyObject* reso = PyObject_GetAttrString(o, "_creso");
- if (!PyLong_Check(reso)) {
- // https://github.com/pandas-dev/pandas/pull/49034#discussion_r1023165139
- Py_DECREF(reso);
- return -1;
- }
+ if (object_is_nat_type(o)) {
+ // i.e. o is NaT, long_val will be NPY_MIN_INT64
+ return long_val;
+ }
- long cReso = PyLong_AsLong(reso);
+ // ensure we are in nanoseconds, similar to Timestamp._as_creso or _as_unit
+ PyObject *reso = PyObject_GetAttrString(o, "_creso");
+ if (!PyLong_Check(reso)) {
+ // https://github.com/pandas-dev/pandas/pull/49034#discussion_r1023165139
Py_DECREF(reso);
- if (cReso == -1 && PyErr_Occurred()) {
- return -1;
- }
+ return -1;
+ }
- if (cReso == NPY_FR_us) {
- long_val = long_val * 1000L;
- } else if (cReso == NPY_FR_ms) {
- long_val = long_val * 1000000L;
- } else if (cReso == NPY_FR_s) {
- long_val = long_val * 1000000000L;
- }
+ long cReso = PyLong_AsLong(reso);
+ Py_DECREF(reso);
+ if (cReso == -1 && PyErr_Occurred()) {
+ return -1;
+ }
- return long_val;
+ if (cReso == NPY_FR_us) {
+ return long_val * 1000L;
+ } else if (cReso == NPY_FR_ms) {
+ return long_val * 1000000L;
+ } else if (cReso == NPY_FR_s) {
+ return long_val * 1000000000L;
+ }
+
+ return long_val;
}
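
The return-based rewrite of get_long_attr keeps the same unit normalisation: a Timestamp/Timedelta value stored in seconds, milliseconds or microseconds is scaled up to nanoseconds. A standalone sketch of that scaling (overflow is not guarded here, as in the original; the enum stands in for the NPY_FR_* constants):

    #include <stdint.h>

    enum reso { RESO_S, RESO_MS, RESO_US, RESO_NS };

    static int64_t to_nanoseconds(int64_t value, enum reso r) {
      switch (r) {
      case RESO_S:
        return value * 1000000000LL;  // s  -> ns
      case RESO_MS:
        return value * 1000000LL;     // ms -> ns
      case RESO_US:
        return value * 1000LL;        // us -> ns
      default:
        return value;                 // already ns
      }
    }
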
static npy_float64 total_seconds(PyObject *td) {
- npy_float64 double_val;
- PyObject *value = PyObject_CallMethod(td, "total_seconds", NULL);
- double_val = PyFloat_AS_DOUBLE(value);
- Py_DECREF(value);
- return double_val;
+ PyObject *value = PyObject_CallMethod(td, "total_seconds", NULL);
+ const npy_float64 double_val = PyFloat_AS_DOUBLE(value);
+ Py_DECREF(value);
+ return double_val;
}
static char *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc),
size_t *_outLen) {
- PyObject *obj = (PyObject *)_obj;
- *_outLen = PyBytes_GET_SIZE(obj);
- return PyBytes_AS_STRING(obj);
-}
-
-static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc,
- size_t *_outLen) {
- char *encoded = (char *)PyUnicode_AsUTF8AndSize(_obj,
- (Py_ssize_t *)_outLen);
- if (encoded == NULL) {
- /* Something went wrong.
- Set errorMsg(to tell encoder to stop),
- and let Python exception propagate. */
- JSONObjectEncoder *enc = (JSONObjectEncoder *)tc->encoder;
- enc->errorMsg = "Encoding failed.";
- }
- return encoded;
+ PyObject *obj = (PyObject *)_obj;
+ *_outLen = PyBytes_GET_SIZE(obj);
+ return PyBytes_AS_STRING(obj);
+}
+
+static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, size_t *_outLen) {
+ char *encoded = (char *)PyUnicode_AsUTF8AndSize(_obj, (Py_ssize_t *)_outLen);
+ if (encoded == NULL) {
+ /* Something went wrong.
+ Set errorMsg(to tell encoder to stop),
+ and let Python exception propagate. */
+ JSONObjectEncoder *enc = (JSONObjectEncoder *)tc->encoder;
+ enc->errorMsg = "Encoding failed.";
+ }
+ return encoded;
}
/* JSON callback. returns a char* and mutates the pointer to *len */
static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused),
JSONTypeContext *tc, size_t *len) {
- NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
- NPY_DATETIMEUNIT valueUnit = ((PyObjectEncoder *)tc->encoder)->valueUnit;
- GET_TC(tc)->cStr = int64ToIso(GET_TC(tc)->longValue, valueUnit, base, len);
- return GET_TC(tc)->cStr;
+ NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
+ NPY_DATETIMEUNIT valueUnit = ((PyObjectEncoder *)tc->encoder)->valueUnit;
+ GET_TC(tc)->cStr = int64ToIso(GET_TC(tc)->longValue, valueUnit, base, len);
+ return GET_TC(tc)->cStr;
}
/* JSON callback. returns a char* and mutates the pointer to *len */
static char *NpyTimeDeltaToIsoCallback(JSOBJ Py_UNUSED(unused),
JSONTypeContext *tc, size_t *len) {
- GET_TC(tc)->cStr = int64ToIsoDuration(GET_TC(tc)->longValue, len);
- return GET_TC(tc)->cStr;
+ GET_TC(tc)->cStr = int64ToIsoDuration(GET_TC(tc)->longValue, len);
+ return GET_TC(tc)->cStr;
}
/* JSON callback */
static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc,
size_t *len) {
- if (!PyDate_Check(obj) && !PyDateTime_Check(obj)) {
- PyErr_SetString(PyExc_TypeError, "Expected date or datetime object");
- ((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
- return NULL;
- }
+ if (!PyDate_Check(obj) && !PyDateTime_Check(obj)) {
+ PyErr_SetString(PyExc_TypeError, "Expected date or datetime object");
+ ((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
+ return NULL;
+ }
- NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
- return PyDateTimeToIso(obj, base, len);
+ NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
+ return PyDateTimeToIso(obj, base, len);
}
static char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, size_t *outLen) {
- PyObject *obj = (PyObject *)_obj;
- PyObject *str;
- PyObject *tmp;
-
- str = PyObject_CallMethod(obj, "isoformat", NULL);
- if (str == NULL) {
- *outLen = 0;
- if (!PyErr_Occurred()) {
- PyErr_SetString(PyExc_ValueError, "Failed to convert time");
- }
- ((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
- return NULL;
- }
- if (PyUnicode_Check(str)) {
- tmp = str;
- str = PyUnicode_AsUTF8String(str);
- Py_DECREF(tmp);
+ PyObject *obj = (PyObject *)_obj;
+ PyObject *str = PyObject_CallMethod(obj, "isoformat", NULL);
+ if (str == NULL) {
+ *outLen = 0;
+ if (!PyErr_Occurred()) {
+ PyErr_SetString(PyExc_ValueError, "Failed to convert time");
}
+ ((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
+ return NULL;
+ }
+ if (PyUnicode_Check(str)) {
+ PyObject *tmp = str;
+ str = PyUnicode_AsUTF8String(str);
+ Py_DECREF(tmp);
+ }
- GET_TC(tc)->newObj = str;
+ GET_TC(tc)->newObj = str;
- *outLen = PyBytes_GET_SIZE(str);
- char *outValue = PyBytes_AS_STRING(str);
- return outValue;
+ *outLen = PyBytes_GET_SIZE(str);
+ char *outValue = PyBytes_AS_STRING(str);
+ return outValue;
}
//=============================================================================
@@ -408,167 +379,161 @@ static char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, size_t *outLen) {
//=============================================================================
static void NpyArr_freeItemValue(JSOBJ Py_UNUSED(_obj), JSONTypeContext *tc) {
- if (GET_TC(tc)->npyarr &&
- GET_TC(tc)->itemValue != GET_TC(tc)->npyarr->array) {
- Py_XDECREF(GET_TC(tc)->itemValue);
- GET_TC(tc)->itemValue = NULL;
- }
+ if (GET_TC(tc)->npyarr &&
+ GET_TC(tc)->itemValue != GET_TC(tc)->npyarr->array) {
+ Py_XDECREF(GET_TC(tc)->itemValue);
+ GET_TC(tc)->itemValue = NULL;
+ }
}
-int NpyArr_iterNextNone(JSOBJ Py_UNUSED(_obj), JSONTypeContext *Py_UNUSED(tc)) {
- return 0;
+static int NpyArr_iterNextNone(JSOBJ Py_UNUSED(_obj),
+ JSONTypeContext *Py_UNUSED(tc)) {
+ return 0;
}
-void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) {
- PyArrayObject *obj;
- NpyArrContext *npyarr;
-
- if (GET_TC(tc)->newObj) {
- obj = (PyArrayObject *)GET_TC(tc)->newObj;
- } else {
- obj = (PyArrayObject *)_obj;
- }
-
- npyarr = PyObject_Malloc(sizeof(NpyArrContext));
- GET_TC(tc)->npyarr = npyarr;
-
- if (!npyarr) {
- PyErr_NoMemory();
- GET_TC(tc)->iterNext = NpyArr_iterNextNone;
- return;
- }
-
- npyarr->array = (PyObject *)obj;
- npyarr->getitem = (PyArray_GetItemFunc *)PyArray_DESCR(obj)->f->getitem;
- npyarr->dataptr = PyArray_DATA(obj);
- npyarr->ndim = PyArray_NDIM(obj) - 1;
- npyarr->curdim = 0;
- npyarr->type_num = PyArray_DESCR(obj)->type_num;
-
- if (GET_TC(tc)->transpose) {
- npyarr->dim = PyArray_DIM(obj, npyarr->ndim);
- npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim);
- npyarr->stridedim = npyarr->ndim;
- npyarr->index[npyarr->ndim] = 0;
- npyarr->inc = -1;
- } else {
- npyarr->dim = PyArray_DIM(obj, 0);
- npyarr->stride = PyArray_STRIDE(obj, 0);
- npyarr->stridedim = 0;
- npyarr->index[0] = 0;
- npyarr->inc = 1;
- }
-
- npyarr->columnLabels = GET_TC(tc)->columnLabels;
- npyarr->rowLabels = GET_TC(tc)->rowLabels;
-}
+static void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) {
+ PyArrayObject *obj =
+ (PyArrayObject *)(GET_TC(tc)->newObj ? GET_TC(tc)->newObj : _obj);
-void NpyArr_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
- NpyArrContext *npyarr = GET_TC(tc)->npyarr;
+ NpyArrContext *npyarr = PyObject_Malloc(sizeof(NpyArrContext));
+ GET_TC(tc)->npyarr = npyarr;
- if (npyarr) {
- NpyArr_freeItemValue(obj, tc);
- PyObject_Free(npyarr);
- }
+ if (!npyarr) {
+ PyErr_NoMemory();
+ GET_TC(tc)->iterNext = NpyArr_iterNextNone;
+ return;
+ }
+
+ npyarr->array = (PyObject *)obj;
+ npyarr->getitem = (PyArray_GetItemFunc *)PyArray_DESCR(obj)->f->getitem;
+ npyarr->dataptr = PyArray_DATA(obj);
+ npyarr->ndim = PyArray_NDIM(obj) - 1;
+ npyarr->curdim = 0;
+ npyarr->type_num = PyArray_DESCR(obj)->type_num;
+
+ if (GET_TC(tc)->transpose) {
+ npyarr->dim = PyArray_DIM(obj, npyarr->ndim);
+ npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim);
+ npyarr->stridedim = npyarr->ndim;
+ npyarr->index[npyarr->ndim] = 0;
+ npyarr->inc = -1;
+ } else {
+ npyarr->dim = PyArray_DIM(obj, 0);
+ npyarr->stride = PyArray_STRIDE(obj, 0);
+ npyarr->stridedim = 0;
+ npyarr->index[0] = 0;
+ npyarr->inc = 1;
+ }
+
+ npyarr->columnLabels = GET_TC(tc)->columnLabels;
+ npyarr->rowLabels = GET_TC(tc)->rowLabels;
+}
+
+static void NpyArr_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
+ NpyArrContext *npyarr = GET_TC(tc)->npyarr;
+
+ if (npyarr) {
+ NpyArr_freeItemValue(obj, tc);
+ PyObject_Free(npyarr);
+ }
}
-void NpyArrPassThru_iterBegin(JSOBJ Py_UNUSED(obj),
- JSONTypeContext *Py_UNUSED(tc)) {}
+static void NpyArrPassThru_iterBegin(JSOBJ Py_UNUSED(obj),
+ JSONTypeContext *Py_UNUSED(tc)) {}
-void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
- NpyArrContext *npyarr = GET_TC(tc)->npyarr;
- // finished this dimension, reset the data pointer
- npyarr->curdim--;
- npyarr->dataptr -= npyarr->stride * npyarr->index[npyarr->stridedim];
- npyarr->stridedim -= npyarr->inc;
- npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim);
- npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim);
- npyarr->dataptr += npyarr->stride;
+static void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
+ NpyArrContext *npyarr = GET_TC(tc)->npyarr;
+ // finished this dimension, reset the data pointer
+ npyarr->curdim--;
+ npyarr->dataptr -= npyarr->stride * npyarr->index[npyarr->stridedim];
+ npyarr->stridedim -= npyarr->inc;
+ npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim);
+ npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim);
+ npyarr->dataptr += npyarr->stride;
- NpyArr_freeItemValue(obj, tc);
+ NpyArr_freeItemValue(obj, tc);
}
-int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) {
- NpyArrContext *npyarr = GET_TC(tc)->npyarr;
+static int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) {
+ NpyArrContext *npyarr = GET_TC(tc)->npyarr;
- if (PyErr_Occurred()) {
- return 0;
- }
+ if (PyErr_Occurred()) {
+ return 0;
+ }
- if (npyarr->index[npyarr->stridedim] >= npyarr->dim) {
- return 0;
- }
+ if (npyarr->index[npyarr->stridedim] >= npyarr->dim) {
+ return 0;
+ }
- NpyArr_freeItemValue(obj, tc);
+ NpyArr_freeItemValue(obj, tc);
- if (PyArray_ISDATETIME(npyarr->array)) {
- GET_TC(tc)->itemValue = obj;
- Py_INCREF(obj);
- ((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(npyarr->array);
- // Also write the resolution (unit) of the ndarray
- PyArray_Descr *dtype = PyArray_DESCR(npyarr->array);
- ((PyObjectEncoder *)tc->encoder)->valueUnit =
- get_datetime_metadata_from_dtype(dtype).base;
- ((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr;
- ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr;
- } else {
- GET_TC(tc)->itemValue = npyarr->getitem(npyarr->dataptr, npyarr->array);
- }
+ if (PyArray_ISDATETIME(npyarr->array)) {
+ GET_TC(tc)->itemValue = obj;
+ Py_INCREF(obj);
+ ((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(npyarr->array);
+ // Also write the resolution (unit) of the ndarray
+ PyArray_Descr *dtype = PyArray_DESCR(npyarr->array);
+ ((PyObjectEncoder *)tc->encoder)->valueUnit =
+ get_datetime_metadata_from_dtype(dtype).base;
+ ((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr;
+ ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr;
+ } else {
+ GET_TC(tc)->itemValue = npyarr->getitem(npyarr->dataptr, npyarr->array);
+ }
- npyarr->dataptr += npyarr->stride;
- npyarr->index[npyarr->stridedim]++;
- return 1;
+ npyarr->dataptr += npyarr->stride;
+ npyarr->index[npyarr->stridedim]++;
+ return 1;
}
-int NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) {
- NpyArrContext *npyarr = GET_TC(tc)->npyarr;
+static int NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) {
+ NpyArrContext *npyarr = GET_TC(tc)->npyarr;
- if (PyErr_Occurred()) {
- return 0;
- }
+ if (PyErr_Occurred()) {
+ return 0;
+ }
- if (npyarr->curdim >= npyarr->ndim ||
- npyarr->index[npyarr->stridedim] >= npyarr->dim) {
- // innermost dimension, start retrieving item values
- GET_TC(tc)->iterNext = NpyArr_iterNextItem;
- return NpyArr_iterNextItem(_obj, tc);
- }
+ if (npyarr->curdim >= npyarr->ndim ||
+ npyarr->index[npyarr->stridedim] >= npyarr->dim) {
+ // innermost dimension, start retrieving item values
+ GET_TC(tc)->iterNext = NpyArr_iterNextItem;
+ return NpyArr_iterNextItem(_obj, tc);
+ }
- // dig a dimension deeper
- npyarr->index[npyarr->stridedim]++;
+ // dig a dimension deeper
+ npyarr->index[npyarr->stridedim]++;
- npyarr->curdim++;
- npyarr->stridedim += npyarr->inc;
- npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim);
- npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim);
- npyarr->index[npyarr->stridedim] = 0;
+ npyarr->curdim++;
+ npyarr->stridedim += npyarr->inc;
+ npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim);
+ npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim);
+ npyarr->index[npyarr->stridedim] = 0;
- ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr;
- GET_TC(tc)->itemValue = npyarr->array;
- return 1;
+ ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr;
+ GET_TC(tc)->itemValue = npyarr->array;
+ return 1;
}
-JSOBJ NpyArr_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- return GET_TC(tc)->itemValue;
+static JSOBJ NpyArr_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
+ return GET_TC(tc)->itemValue;
}
-char *NpyArr_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
- size_t *outLen) {
- NpyArrContext *npyarr = GET_TC(tc)->npyarr;
- npy_intp idx;
- char *cStr;
+static char *NpyArr_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
+ size_t *outLen) {
+ NpyArrContext *npyarr = GET_TC(tc)->npyarr;
+ char *cStr;
- if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) {
- idx = npyarr->index[npyarr->stridedim] - 1;
- cStr = npyarr->columnLabels[idx];
- } else {
- idx = npyarr->index[npyarr->stridedim - npyarr->inc] - 1;
- cStr = npyarr->rowLabels[idx];
- }
+ if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) {
+ const npy_intp idx = npyarr->index[npyarr->stridedim] - 1;
+ cStr = npyarr->columnLabels[idx];
+ } else {
+ const npy_intp idx = npyarr->index[npyarr->stridedim - npyarr->inc] - 1;
+ cStr = npyarr->rowLabels[idx];
+ }
- *outLen = strlen(cStr);
+ *outLen = strlen(cStr);
- return cStr;
+ return cStr;
}
//=============================================================================
@@ -580,301 +545,289 @@ char *NpyArr_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
// Uses a dedicated NpyArrContext for each column.
//=============================================================================
-void PdBlockPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
- PdBlockContext *blkCtxt = GET_TC(tc)->pdblock;
+static void PdBlockPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
+ PdBlockContext *blkCtxt = GET_TC(tc)->pdblock;
- if (blkCtxt->transpose) {
- blkCtxt->colIdx++;
- } else {
- blkCtxt->colIdx = 0;
- }
+ if (blkCtxt->transpose) {
+ blkCtxt->colIdx++;
+ } else {
+ blkCtxt->colIdx = 0;
+ }
- NpyArr_freeItemValue(obj, tc);
+ NpyArr_freeItemValue(obj, tc);
}
-int PdBlock_iterNextItem(JSOBJ obj, JSONTypeContext *tc) {
- PdBlockContext *blkCtxt = GET_TC(tc)->pdblock;
+static int PdBlock_iterNextItem(JSOBJ obj, JSONTypeContext *tc) {
+ PdBlockContext *blkCtxt = GET_TC(tc)->pdblock;
- if (blkCtxt->colIdx >= blkCtxt->ncols) {
- return 0;
- }
+ if (blkCtxt->colIdx >= blkCtxt->ncols) {
+ return 0;
+ }
- GET_TC(tc)->npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx];
- blkCtxt->colIdx++;
- return NpyArr_iterNextItem(obj, tc);
+ GET_TC(tc)->npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx];
+ blkCtxt->colIdx++;
+ return NpyArr_iterNextItem(obj, tc);
}
-char *PdBlock_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
- size_t *outLen) {
- PdBlockContext *blkCtxt = GET_TC(tc)->pdblock;
- NpyArrContext *npyarr = blkCtxt->npyCtxts[0];
- npy_intp idx;
- char *cStr;
+static char *PdBlock_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
+ size_t *outLen) {
+ PdBlockContext *blkCtxt = GET_TC(tc)->pdblock;
+ NpyArrContext *npyarr = blkCtxt->npyCtxts[0];
+ char *cStr;
- if (GET_TC(tc)->iterNext == PdBlock_iterNextItem) {
- idx = blkCtxt->colIdx - 1;
- cStr = npyarr->columnLabels[idx];
- } else {
- idx = GET_TC(tc)->iterNext != PdBlock_iterNext
- ? npyarr->index[npyarr->stridedim - npyarr->inc] - 1
- : npyarr->index[npyarr->stridedim];
+ if (GET_TC(tc)->iterNext == PdBlock_iterNextItem) {
+ const npy_intp idx = blkCtxt->colIdx - 1;
+ cStr = npyarr->columnLabels[idx];
+ } else {
+ const npy_intp idx =
+ GET_TC(tc)->iterNext != PdBlock_iterNext
+ ? npyarr->index[npyarr->stridedim - npyarr->inc] - 1
+ : npyarr->index[npyarr->stridedim];
- cStr = npyarr->rowLabels[idx];
- }
+ cStr = npyarr->rowLabels[idx];
+ }
- *outLen = strlen(cStr);
- return cStr;
+ *outLen = strlen(cStr);
+ return cStr;
}
-char *PdBlock_iterGetName_Transpose(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
- size_t *outLen) {
- PdBlockContext *blkCtxt = GET_TC(tc)->pdblock;
- NpyArrContext *npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx];
- npy_intp idx;
- char *cStr;
+static char *PdBlock_iterGetName_Transpose(JSOBJ Py_UNUSED(obj),
+ JSONTypeContext *tc,
+ size_t *outLen) {
+ PdBlockContext *blkCtxt = GET_TC(tc)->pdblock;
+ NpyArrContext *npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx];
+ char *cStr;
- if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) {
- idx = npyarr->index[npyarr->stridedim] - 1;
- cStr = npyarr->columnLabels[idx];
- } else {
- idx = blkCtxt->colIdx;
- cStr = npyarr->rowLabels[idx];
- }
+ if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) {
+ const npy_intp idx = npyarr->index[npyarr->stridedim] - 1;
+ cStr = npyarr->columnLabels[idx];
+ } else {
+ const npy_intp idx = blkCtxt->colIdx;
+ cStr = npyarr->rowLabels[idx];
+ }
- *outLen = strlen(cStr);
- return cStr;
+ *outLen = strlen(cStr);
+ return cStr;
}
-int PdBlock_iterNext(JSOBJ obj, JSONTypeContext *tc) {
- PdBlockContext *blkCtxt = GET_TC(tc)->pdblock;
- NpyArrContext *npyarr;
+static int PdBlock_iterNext(JSOBJ obj, JSONTypeContext *tc) {
+ PdBlockContext *blkCtxt = GET_TC(tc)->pdblock;
- if (PyErr_Occurred() || ((JSONObjectEncoder *)tc->encoder)->errorMsg) {
- return 0;
- }
+ if (PyErr_Occurred() || ((JSONObjectEncoder *)tc->encoder)->errorMsg) {
+ return 0;
+ }
- if (blkCtxt->transpose) {
- if (blkCtxt->colIdx >= blkCtxt->ncols) {
- return 0;
- }
- } else {
- npyarr = blkCtxt->npyCtxts[0];
- if (npyarr->index[npyarr->stridedim] >= npyarr->dim) {
- return 0;
- }
+ if (blkCtxt->transpose) {
+ if (blkCtxt->colIdx >= blkCtxt->ncols) {
+ return 0;
}
+ } else {
+ const NpyArrContext *npyarr = blkCtxt->npyCtxts[0];
+ if (npyarr->index[npyarr->stridedim] >= npyarr->dim) {
+ return 0;
+ }
+ }
- ((PyObjectEncoder *)tc->encoder)->blkCtxtPassthru = blkCtxt;
- GET_TC(tc)->itemValue = obj;
+ ((PyObjectEncoder *)tc->encoder)->blkCtxtPassthru = blkCtxt;
+ GET_TC(tc)->itemValue = obj;
- return 1;
+ return 1;
}
-void PdBlockPassThru_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- PdBlockContext *blkCtxt = GET_TC(tc)->pdblock;
+static void PdBlockPassThru_iterBegin(JSOBJ Py_UNUSED(obj),
+ JSONTypeContext *tc) {
+ PdBlockContext *blkCtxt = GET_TC(tc)->pdblock;
- if (blkCtxt->transpose) {
- // if transposed we exhaust each column before moving to the next
- GET_TC(tc)->iterNext = NpyArr_iterNextItem;
- GET_TC(tc)->iterGetName = PdBlock_iterGetName_Transpose;
- GET_TC(tc)->npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx];
- }
+ if (blkCtxt->transpose) {
+ // if transposed we exhaust each column before moving to the next
+ GET_TC(tc)->iterNext = NpyArr_iterNextItem;
+ GET_TC(tc)->iterGetName = PdBlock_iterGetName_Transpose;
+ GET_TC(tc)->npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx];
+ }
}
-void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) {
- PyObject *obj, *values, *arrays, *array;
- PdBlockContext *blkCtxt;
- NpyArrContext *npyarr;
- Py_ssize_t i;
+static void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) {
+ PyObject *obj = (PyObject *)_obj;
- obj = (PyObject *)_obj;
+ GET_TC(tc)->iterGetName = GET_TC(tc)->transpose
+ ? PdBlock_iterGetName_Transpose
+ : PdBlock_iterGetName;
- GET_TC(tc)->iterGetName = GET_TC(tc)->transpose
- ? PdBlock_iterGetName_Transpose
- : PdBlock_iterGetName;
+ PdBlockContext *blkCtxt = PyObject_Malloc(sizeof(PdBlockContext));
+ if (!blkCtxt) {
+ PyErr_NoMemory();
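+    // on allocation failure, route iteration to NpyArr_iterNextNone so the
+    // encoder produces no further items for this object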
+ GET_TC(tc)->iterNext = NpyArr_iterNextNone;
+ return;
+ }
+ GET_TC(tc)->pdblock = blkCtxt;
- blkCtxt = PyObject_Malloc(sizeof(PdBlockContext));
- if (!blkCtxt) {
- PyErr_NoMemory();
- GET_TC(tc)->iterNext = NpyArr_iterNextNone;
- return;
- }
- GET_TC(tc)->pdblock = blkCtxt;
+ blkCtxt->colIdx = 0;
+ blkCtxt->transpose = GET_TC(tc)->transpose;
+ blkCtxt->ncols = get_attr_length(obj, "columns");
- blkCtxt->colIdx = 0;
- blkCtxt->transpose = GET_TC(tc)->transpose;
- blkCtxt->ncols = get_attr_length(obj, "columns");
+ if (blkCtxt->ncols == 0) {
+ blkCtxt->npyCtxts = NULL;
- if (blkCtxt->ncols == 0) {
- blkCtxt->npyCtxts = NULL;
+ GET_TC(tc)->iterNext = NpyArr_iterNextNone;
+ return;
+ }
- GET_TC(tc)->iterNext = NpyArr_iterNextNone;
- return;
- }
+ blkCtxt->npyCtxts = PyObject_Malloc(sizeof(NpyArrContext *) * blkCtxt->ncols);
+ if (!blkCtxt->npyCtxts) {
+ PyErr_NoMemory();
+ GET_TC(tc)->iterNext = NpyArr_iterNextNone;
+ return;
+ }
- blkCtxt->npyCtxts =
- PyObject_Malloc(sizeof(NpyArrContext *) * blkCtxt->ncols);
- if (!blkCtxt->npyCtxts) {
- PyErr_NoMemory();
- GET_TC(tc)->iterNext = NpyArr_iterNextNone;
- return;
- }
+ PyObject *arrays = get_sub_attr(obj, "_mgr", "column_arrays");
+ if (!arrays) {
+ GET_TC(tc)->iterNext = NpyArr_iterNextNone;
+ return;
+ }
- arrays = get_sub_attr(obj, "_mgr", "column_arrays");
- if (!arrays) {
- GET_TC(tc)->iterNext = NpyArr_iterNextNone;
- return;
+ for (Py_ssize_t i = 0; i < PyObject_Length(arrays); i++) {
+ PyObject *array = PyList_GET_ITEM(arrays, i);
+ if (!array) {
+ GET_TC(tc)->iterNext = NpyArr_iterNextNone;
+ goto ARR_RET;
}
- for (i = 0; i < PyObject_Length(arrays); i++) {
- array = PyList_GET_ITEM(arrays, i);
- if (!array) {
- GET_TC(tc)->iterNext = NpyArr_iterNextNone;
- goto ARR_RET;
- }
-
- // ensure we have a numpy array (i.e. np.asarray)
- values = PyObject_CallMethod(array, "__array__", NULL);
- if ((!values) || (!PyArray_CheckExact(values))) {
- // Didn't get a numpy array
- ((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
- GET_TC(tc)->iterNext = NpyArr_iterNextNone;
- goto ARR_RET;
- }
+ // ensure we have a numpy array (i.e. np.asarray)
+ PyObject *values = PyObject_CallMethod(array, "__array__", NULL);
+ if ((!values) || (!PyArray_CheckExact(values))) {
+ // Didn't get a numpy array
+ ((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
+ GET_TC(tc)->iterNext = NpyArr_iterNextNone;
+ goto ARR_RET;
+ }
- GET_TC(tc)->newObj = values;
+ GET_TC(tc)->newObj = values;
- // init a dedicated context for this column
- NpyArr_iterBegin(obj, tc);
- npyarr = GET_TC(tc)->npyarr;
+ // init a dedicated context for this column
+ NpyArr_iterBegin(obj, tc);
- GET_TC(tc)->itemValue = NULL;
- ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = NULL;
+ GET_TC(tc)->itemValue = NULL;
+ ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = NULL;
- blkCtxt->npyCtxts[i] = npyarr;
- GET_TC(tc)->newObj = NULL;
- }
- GET_TC(tc)->npyarr = blkCtxt->npyCtxts[0];
- goto ARR_RET;
+ blkCtxt->npyCtxts[i] = GET_TC(tc)->npyarr;
+ GET_TC(tc)->newObj = NULL;
+ }
+ GET_TC(tc)->npyarr = blkCtxt->npyCtxts[0];
+ goto ARR_RET;
ARR_RET:
- Py_DECREF(arrays);
+ Py_DECREF(arrays);
}
-void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
- PdBlockContext *blkCtxt;
- NpyArrContext *npyarr;
- int i;
-
- GET_TC(tc)->itemValue = NULL;
- npyarr = GET_TC(tc)->npyarr;
+static void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
+ GET_TC(tc)->itemValue = NULL;
+ NpyArrContext *npyarr = GET_TC(tc)->npyarr;
+ PdBlockContext *blkCtxt = GET_TC(tc)->pdblock;
- blkCtxt = GET_TC(tc)->pdblock;
-
- if (blkCtxt) {
- for (i = 0; i < blkCtxt->ncols; i++) {
- npyarr = blkCtxt->npyCtxts[i];
- if (npyarr) {
- if (npyarr->array) {
- Py_DECREF(npyarr->array);
- npyarr->array = NULL;
- }
+ if (blkCtxt) {
+ for (int i = 0; i < blkCtxt->ncols; i++) {
+ npyarr = blkCtxt->npyCtxts[i];
+ if (npyarr) {
+ if (npyarr->array) {
+ Py_DECREF(npyarr->array);
+ npyarr->array = NULL;
+ }
- GET_TC(tc)->npyarr = npyarr;
- NpyArr_iterEnd(obj, tc);
+ GET_TC(tc)->npyarr = npyarr;
+ NpyArr_iterEnd(obj, tc);
- blkCtxt->npyCtxts[i] = NULL;
- }
- }
+ blkCtxt->npyCtxts[i] = NULL;
+ }
+ }
- if (blkCtxt->npyCtxts) {
- PyObject_Free(blkCtxt->npyCtxts);
- }
- PyObject_Free(blkCtxt);
+ if (blkCtxt->npyCtxts) {
+ PyObject_Free(blkCtxt->npyCtxts);
}
+ PyObject_Free(blkCtxt);
+ }
}
//=============================================================================
// Tuple iteration functions
// itemValue is borrowed reference, no ref counting
//=============================================================================
-void Tuple_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
- GET_TC(tc)->index = 0;
- GET_TC(tc)->size = PyTuple_GET_SIZE((PyObject *)obj);
- GET_TC(tc)->itemValue = NULL;
+static void Tuple_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
+ GET_TC(tc)->index = 0;
+ GET_TC(tc)->size = PyTuple_GET_SIZE((PyObject *)obj);
+ GET_TC(tc)->itemValue = NULL;
}
-int Tuple_iterNext(JSOBJ obj, JSONTypeContext *tc) {
- PyObject *item;
+static int Tuple_iterNext(JSOBJ obj, JSONTypeContext *tc) {
- if (GET_TC(tc)->index >= GET_TC(tc)->size) {
- return 0;
- }
+ if (GET_TC(tc)->index >= GET_TC(tc)->size) {
+ return 0;
+ }
- item = PyTuple_GET_ITEM(obj, GET_TC(tc)->index);
+ PyObject *item = PyTuple_GET_ITEM(obj, GET_TC(tc)->index);
- GET_TC(tc)->itemValue = item;
- GET_TC(tc)->index++;
- return 1;
+ GET_TC(tc)->itemValue = item;
+ GET_TC(tc)->index++;
+ return 1;
}
-void Tuple_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc)) {}
+static void Tuple_iterEnd(JSOBJ Py_UNUSED(obj),
+ JSONTypeContext *Py_UNUSED(tc)) {}
-JSOBJ Tuple_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- return GET_TC(tc)->itemValue;
+static JSOBJ Tuple_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
+ return GET_TC(tc)->itemValue;
}
-char *Tuple_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc),
- size_t *Py_UNUSED(outLen)) {
- return NULL;
+static char *Tuple_iterGetName(JSOBJ Py_UNUSED(obj),
+ JSONTypeContext *Py_UNUSED(tc),
+ size_t *Py_UNUSED(outLen)) {
+ return NULL;
}
//=============================================================================
// Set iteration functions
// itemValue is borrowed reference, no ref counting
//=============================================================================
-void Set_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
- GET_TC(tc)->itemValue = NULL;
- GET_TC(tc)->iterator = PyObject_GetIter(obj);
+static void Set_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
+ GET_TC(tc)->itemValue = NULL;
+ GET_TC(tc)->iterator = PyObject_GetIter(obj);
}
-int Set_iterNext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- PyObject *item;
-
- if (GET_TC(tc)->itemValue) {
- Py_DECREF(GET_TC(tc)->itemValue);
- GET_TC(tc)->itemValue = NULL;
- }
+static int Set_iterNext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
+ if (GET_TC(tc)->itemValue) {
+ Py_DECREF(GET_TC(tc)->itemValue);
+ GET_TC(tc)->itemValue = NULL;
+ }
- item = PyIter_Next(GET_TC(tc)->iterator);
+ PyObject *item = PyIter_Next(GET_TC(tc)->iterator);
- if (item == NULL) {
- return 0;
- }
+ if (item == NULL) {
+ return 0;
+ }
- GET_TC(tc)->itemValue = item;
- return 1;
+ GET_TC(tc)->itemValue = item;
+ return 1;
}
-void Set_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- if (GET_TC(tc)->itemValue) {
- Py_DECREF(GET_TC(tc)->itemValue);
- GET_TC(tc)->itemValue = NULL;
- }
+static void Set_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
+ if (GET_TC(tc)->itemValue) {
+ Py_DECREF(GET_TC(tc)->itemValue);
+ GET_TC(tc)->itemValue = NULL;
+ }
- if (GET_TC(tc)->iterator) {
- Py_DECREF(GET_TC(tc)->iterator);
- GET_TC(tc)->iterator = NULL;
- }
+ if (GET_TC(tc)->iterator) {
+ Py_DECREF(GET_TC(tc)->iterator);
+ GET_TC(tc)->iterator = NULL;
+ }
}
-JSOBJ Set_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- return GET_TC(tc)->itemValue;
+static JSOBJ Set_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
+ return GET_TC(tc)->itemValue;
}
-char *Set_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc),
- size_t *Py_UNUSED(outLen)) {
- return NULL;
+static char *Set_iterGetName(JSOBJ Py_UNUSED(obj),
+ JSONTypeContext *Py_UNUSED(tc),
+ size_t *Py_UNUSED(outLen)) {
+ return NULL;
}
//=============================================================================
@@ -882,294 +835,285 @@ char *Set_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc),
// itemName ref is borrowed from PyObject_Dir (attrList). No refcount
// itemValue ref is from PyObject_GetAttr. Ref counted
//=============================================================================
-void Dir_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
- GET_TC(tc)->attrList = PyObject_Dir(obj);
- GET_TC(tc)->index = 0;
- GET_TC(tc)->size = PyList_GET_SIZE(GET_TC(tc)->attrList);
+static void Dir_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
+ GET_TC(tc)->attrList = PyObject_Dir(obj);
+ GET_TC(tc)->index = 0;
+ GET_TC(tc)->size = PyList_GET_SIZE(GET_TC(tc)->attrList);
}
-void Dir_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- if (GET_TC(tc)->itemValue) {
- Py_DECREF(GET_TC(tc)->itemValue);
- GET_TC(tc)->itemValue = NULL;
- }
+static void Dir_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
+ if (GET_TC(tc)->itemValue) {
+ Py_DECREF(GET_TC(tc)->itemValue);
+ GET_TC(tc)->itemValue = NULL;
+ }
- if (GET_TC(tc)->itemName) {
- Py_DECREF(GET_TC(tc)->itemName);
- GET_TC(tc)->itemName = NULL;
- }
+ if (GET_TC(tc)->itemName) {
+ Py_DECREF(GET_TC(tc)->itemName);
+ GET_TC(tc)->itemName = NULL;
+ }
- Py_DECREF((PyObject *)GET_TC(tc)->attrList);
+ Py_DECREF((PyObject *)GET_TC(tc)->attrList);
}
-int Dir_iterNext(JSOBJ _obj, JSONTypeContext *tc) {
- PyObject *obj = (PyObject *)_obj;
- PyObject *itemValue = GET_TC(tc)->itemValue;
- PyObject *itemName = GET_TC(tc)->itemName;
- PyObject *attr;
- PyObject *attrName;
- char *attrStr;
+static int Dir_iterNext(JSOBJ _obj, JSONTypeContext *tc) {
+ PyObject *obj = (PyObject *)_obj;
+ PyObject *itemValue = GET_TC(tc)->itemValue;
+ PyObject *itemName = GET_TC(tc)->itemName;
- if (PyErr_Occurred() || ((JSONObjectEncoder *)tc->encoder)->errorMsg) {
- return 0;
- }
-
- if (itemValue) {
- Py_DECREF(GET_TC(tc)->itemValue);
- GET_TC(tc)->itemValue = itemValue = NULL;
- }
+ if (PyErr_Occurred() || ((JSONObjectEncoder *)tc->encoder)->errorMsg) {
+ return 0;
+ }
- if (itemName) {
- Py_DECREF(GET_TC(tc)->itemName);
- GET_TC(tc)->itemName = itemName = NULL;
- }
+ if (itemValue) {
+ Py_DECREF(GET_TC(tc)->itemValue);
+ GET_TC(tc)->itemValue = itemValue = NULL;
+ }
- for (; GET_TC(tc)->index < GET_TC(tc)->size; GET_TC(tc)->index++) {
- attrName = PyList_GET_ITEM(GET_TC(tc)->attrList, GET_TC(tc)->index);
- attr = PyUnicode_AsUTF8String(attrName);
- attrStr = PyBytes_AS_STRING(attr);
+ if (itemName) {
+ Py_DECREF(GET_TC(tc)->itemName);
+ GET_TC(tc)->itemName = itemName = NULL;
+ }
- if (attrStr[0] == '_') {
- Py_DECREF(attr);
- continue;
- }
+ for (; GET_TC(tc)->index < GET_TC(tc)->size; GET_TC(tc)->index++) {
+ PyObject *attrName =
+ PyList_GET_ITEM(GET_TC(tc)->attrList, GET_TC(tc)->index);
+ PyObject *attr = PyUnicode_AsUTF8String(attrName);
+ const char *attrStr = PyBytes_AS_STRING(attr);
- itemValue = PyObject_GetAttr(obj, attrName);
- if (itemValue == NULL) {
- PyErr_Clear();
- Py_DECREF(attr);
- continue;
- }
-
- if (PyCallable_Check(itemValue)) {
- Py_DECREF(itemValue);
- Py_DECREF(attr);
- continue;
- }
-
- GET_TC(tc)->itemName = itemName;
- GET_TC(tc)->itemValue = itemValue;
+ if (attrStr[0] == '_') {
+ Py_DECREF(attr);
+ continue;
+ }
- itemName = attr;
- break;
+ itemValue = PyObject_GetAttr(obj, attrName);
+ if (itemValue == NULL) {
+ PyErr_Clear();
+ Py_DECREF(attr);
+ continue;
}
- if (itemName == NULL) {
- GET_TC(tc)->index = GET_TC(tc)->size;
- GET_TC(tc)->itemValue = NULL;
- return 0;
+ if (PyCallable_Check(itemValue)) {
+ Py_DECREF(itemValue);
+ Py_DECREF(attr);
+ continue;
}
GET_TC(tc)->itemName = itemName;
GET_TC(tc)->itemValue = itemValue;
- GET_TC(tc)->index++;
- return 1;
+ itemName = attr;
+ break;
+ }
+
+ if (itemName == NULL) {
+ GET_TC(tc)->index = GET_TC(tc)->size;
+ GET_TC(tc)->itemValue = NULL;
+ return 0;
+ }
+
+ GET_TC(tc)->itemName = itemName;
+ GET_TC(tc)->itemValue = itemValue;
+ GET_TC(tc)->index++;
+
+ return 1;
}
-JSOBJ Dir_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- return GET_TC(tc)->itemValue;
+static JSOBJ Dir_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
+ return GET_TC(tc)->itemValue;
}
-char *Dir_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
- size_t *outLen) {
- *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName);
- return PyBytes_AS_STRING(GET_TC(tc)->itemName);
+static char *Dir_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
+ size_t *outLen) {
+ *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName);
+ return PyBytes_AS_STRING(GET_TC(tc)->itemName);
}
//=============================================================================
// List iteration functions
// itemValue is borrowed from object (which is list). No refcounting
//=============================================================================
-void List_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
- GET_TC(tc)->index = 0;
- GET_TC(tc)->size = PyList_GET_SIZE((PyObject *)obj);
+static void List_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
+ GET_TC(tc)->index = 0;
+ GET_TC(tc)->size = PyList_GET_SIZE((PyObject *)obj);
}
-int List_iterNext(JSOBJ obj, JSONTypeContext *tc) {
- if (GET_TC(tc)->index >= GET_TC(tc)->size) {
- return 0;
- }
+static int List_iterNext(JSOBJ obj, JSONTypeContext *tc) {
+ if (GET_TC(tc)->index >= GET_TC(tc)->size) {
+ return 0;
+ }
- GET_TC(tc)->itemValue = PyList_GET_ITEM(obj, GET_TC(tc)->index);
- GET_TC(tc)->index++;
- return 1;
+ GET_TC(tc)->itemValue = PyList_GET_ITEM(obj, GET_TC(tc)->index);
+ GET_TC(tc)->index++;
+ return 1;
}
-void List_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc)) {}
+static void List_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc)) {
+}
-JSOBJ List_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- return GET_TC(tc)->itemValue;
+static JSOBJ List_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
+ return GET_TC(tc)->itemValue;
}
-char *List_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc),
- size_t *Py_UNUSED(outLen)) {
- return NULL;
+static char *List_iterGetName(JSOBJ Py_UNUSED(obj),
+ JSONTypeContext *Py_UNUSED(tc),
+ size_t *Py_UNUSED(outLen)) {
+ return NULL;
}
//=============================================================================
// pandas Index iteration functions
//=============================================================================
-void Index_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- GET_TC(tc)->index = 0;
- GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char));
- if (!GET_TC(tc)->cStr) {
- PyErr_NoMemory();
- }
+static void Index_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
+ GET_TC(tc)->index = 0;
+ GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char));
+ if (!GET_TC(tc)->cStr) {
+ PyErr_NoMemory();
+ }
}
-int Index_iterNext(JSOBJ obj, JSONTypeContext *tc) {
- Py_ssize_t index;
- if (!GET_TC(tc)->cStr) {
- return 0;
- }
-
- index = GET_TC(tc)->index;
- Py_XDECREF(GET_TC(tc)->itemValue);
- if (index == 0) {
- memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5);
- GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name");
- } else if (index == 1) {
- memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5);
- GET_TC(tc)->itemValue = get_values(obj);
- if (!GET_TC(tc)->itemValue) {
- return 0;
- }
- } else {
- return 0;
- }
+static int Index_iterNext(JSOBJ obj, JSONTypeContext *tc) {
+ if (!GET_TC(tc)->cStr) {
+ return 0;
+ }
+
+ const Py_ssize_t index = GET_TC(tc)->index;
+ Py_XDECREF(GET_TC(tc)->itemValue);
+ if (index == 0) {
+ memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5);
+ GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name");
+ } else if (index == 1) {
+ memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5);
+ GET_TC(tc)->itemValue = get_values(obj);
+ if (!GET_TC(tc)->itemValue) {
+ return 0;
+ }
+ } else {
+ return 0;
+ }
- GET_TC(tc)->index++;
- return 1;
+ GET_TC(tc)->index++;
+ return 1;
}
-void Index_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc)) {}
+static void Index_iterEnd(JSOBJ Py_UNUSED(obj),
+ JSONTypeContext *Py_UNUSED(tc)) {}
-JSOBJ Index_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- return GET_TC(tc)->itemValue;
+static JSOBJ Index_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
+ return GET_TC(tc)->itemValue;
}
-char *Index_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
- size_t *outLen) {
- *outLen = strlen(GET_TC(tc)->cStr);
- return GET_TC(tc)->cStr;
+static char *Index_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
+ size_t *outLen) {
+ *outLen = strlen(GET_TC(tc)->cStr);
+ return GET_TC(tc)->cStr;
}
//=============================================================================
// pandas Series iteration functions
//=============================================================================
-void Series_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder;
- GET_TC(tc)->index = 0;
- GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char));
- enc->outputFormat = VALUES; // for contained series
- if (!GET_TC(tc)->cStr) {
- PyErr_NoMemory();
- }
-}
-
-int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) {
- Py_ssize_t index;
- if (!GET_TC(tc)->cStr) {
- return 0;
- }
-
- index = GET_TC(tc)->index;
- Py_XDECREF(GET_TC(tc)->itemValue);
- if (index == 0) {
- memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5);
- GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name");
- } else if (index == 1) {
- memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6);
- GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index");
- } else if (index == 2) {
- memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5);
- GET_TC(tc)->itemValue = get_values(obj);
- if (!GET_TC(tc)->itemValue) {
- return 0;
- }
- } else {
- return 0;
- }
+static void Series_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
+ PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder;
+ GET_TC(tc)->index = 0;
+ GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char));
+ enc->outputFormat = VALUES; // for contained series
+ if (!GET_TC(tc)->cStr) {
+ PyErr_NoMemory();
+ }
+}
+
+static int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) {
+ if (!GET_TC(tc)->cStr) {
+ return 0;
+ }
+
+ const Py_ssize_t index = GET_TC(tc)->index;
+ Py_XDECREF(GET_TC(tc)->itemValue);
+ if (index == 0) {
+ memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5);
+ GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name");
+ } else if (index == 1) {
+ memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6);
+ GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index");
+ } else if (index == 2) {
+ memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5);
+ GET_TC(tc)->itemValue = get_values(obj);
+ if (!GET_TC(tc)->itemValue) {
+ return 0;
+ }
+ } else {
+ return 0;
+ }
- GET_TC(tc)->index++;
- return 1;
+ GET_TC(tc)->index++;
+ return 1;
}
-void Series_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder;
- enc->outputFormat = enc->originalOutputFormat;
+static void Series_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
+ PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder;
+ enc->outputFormat = enc->originalOutputFormat;
}
-JSOBJ Series_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- return GET_TC(tc)->itemValue;
+static JSOBJ Series_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
+ return GET_TC(tc)->itemValue;
}
-char *Series_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
- size_t *outLen) {
- *outLen = strlen(GET_TC(tc)->cStr);
- return GET_TC(tc)->cStr;
+static char *Series_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
+ size_t *outLen) {
+ *outLen = strlen(GET_TC(tc)->cStr);
+ return GET_TC(tc)->cStr;
}
//=============================================================================
// pandas DataFrame iteration functions
//=============================================================================
-void DataFrame_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder;
- GET_TC(tc)->index = 0;
- GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char));
- enc->outputFormat = VALUES; // for contained series & index
- if (!GET_TC(tc)->cStr) {
- PyErr_NoMemory();
- }
-}
-
-int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) {
- Py_ssize_t index;
- if (!GET_TC(tc)->cStr) {
- return 0;
- }
-
- index = GET_TC(tc)->index;
- Py_XDECREF(GET_TC(tc)->itemValue);
- if (index == 0) {
- memcpy(GET_TC(tc)->cStr, "columns", sizeof(char) * 8);
- GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "columns");
- } else if (index == 1) {
- memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6);
- GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index");
- } else if (index == 2) {
- memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5);
- if (is_simple_frame(obj)) {
- GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "values");
- if (!GET_TC(tc)->itemValue) {
- return 0;
- }
- } else {
- Py_INCREF(obj);
- GET_TC(tc)->itemValue = obj;
- }
- } else {
- return 0;
- }
+static void DataFrame_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
+ PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder;
+ GET_TC(tc)->index = 0;
+ GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char));
+ enc->outputFormat = VALUES; // for contained series & index
+ if (!GET_TC(tc)->cStr) {
+ PyErr_NoMemory();
+ }
+}
+
+static int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) {
+ if (!GET_TC(tc)->cStr) {
+ return 0;
+ }
+
+ const Py_ssize_t index = GET_TC(tc)->index;
+ Py_XDECREF(GET_TC(tc)->itemValue);
+ if (index == 0) {
+ memcpy(GET_TC(tc)->cStr, "columns", sizeof(char) * 8);
+ GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "columns");
+ } else if (index == 1) {
+ memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6);
+ GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index");
+ } else if (index == 2) {
+ memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5);
+ Py_INCREF(obj);
+ GET_TC(tc)->itemValue = obj;
+ } else {
+ return 0;
+ }
- GET_TC(tc)->index++;
- return 1;
+ GET_TC(tc)->index++;
+ return 1;
}
-void DataFrame_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder;
- enc->outputFormat = enc->originalOutputFormat;
+static void DataFrame_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
+ PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder;
+ enc->outputFormat = enc->originalOutputFormat;
}
-JSOBJ DataFrame_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- return GET_TC(tc)->itemValue;
+static JSOBJ DataFrame_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
+ return GET_TC(tc)->itemValue;
}
-char *DataFrame_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
- size_t *outLen) {
- *outLen = strlen(GET_TC(tc)->cStr);
- return GET_TC(tc)->cStr;
+static char *DataFrame_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
+ size_t *outLen) {
+ *outLen = strlen(GET_TC(tc)->cStr);
+ return GET_TC(tc)->cStr;
}
//=============================================================================
@@ -1177,63 +1121,59 @@ char *DataFrame_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
// itemName might converted to string (Python_Str). Do refCounting
// itemValue is borrowed from object (which is dict). No refCounting
//=============================================================================
-void Dict_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- GET_TC(tc)->index = 0;
+static void Dict_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
+ GET_TC(tc)->index = 0;
}
-int Dict_iterNext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- PyObject *itemNameTmp;
+static int Dict_iterNext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
+ if (GET_TC(tc)->itemName) {
+ Py_DECREF(GET_TC(tc)->itemName);
+ GET_TC(tc)->itemName = NULL;
+ }
- if (GET_TC(tc)->itemName) {
- Py_DECREF(GET_TC(tc)->itemName);
- GET_TC(tc)->itemName = NULL;
- }
-
- if (!PyDict_Next((PyObject *)GET_TC(tc)->dictObj, &GET_TC(tc)->index,
- &GET_TC(tc)->itemName, &GET_TC(tc)->itemValue)) {
- return 0;
- }
+ if (!PyDict_Next((PyObject *)GET_TC(tc)->dictObj, &GET_TC(tc)->index,
+ &GET_TC(tc)->itemName, &GET_TC(tc)->itemValue)) {
+ return 0;
+ }
- if (PyUnicode_Check(GET_TC(tc)->itemName)) {
- GET_TC(tc)->itemName = PyUnicode_AsUTF8String(GET_TC(tc)->itemName);
- } else if (!PyBytes_Check(GET_TC(tc)->itemName)) {
- GET_TC(tc)->itemName = PyObject_Str(GET_TC(tc)->itemName);
- itemNameTmp = GET_TC(tc)->itemName;
- GET_TC(tc)->itemName = PyUnicode_AsUTF8String(GET_TC(tc)->itemName);
- Py_DECREF(itemNameTmp);
- } else {
- Py_INCREF(GET_TC(tc)->itemName);
- }
- return 1;
+ if (PyUnicode_Check(GET_TC(tc)->itemName)) {
+ GET_TC(tc)->itemName = PyUnicode_AsUTF8String(GET_TC(tc)->itemName);
+ } else if (!PyBytes_Check(GET_TC(tc)->itemName)) {
+ GET_TC(tc)->itemName = PyObject_Str(GET_TC(tc)->itemName);
+ PyObject *itemNameTmp = GET_TC(tc)->itemName;
+ GET_TC(tc)->itemName = PyUnicode_AsUTF8String(GET_TC(tc)->itemName);
+ Py_DECREF(itemNameTmp);
+ } else {
+ Py_INCREF(GET_TC(tc)->itemName);
+ }
+ return 1;
}
-void Dict_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- if (GET_TC(tc)->itemName) {
- Py_DECREF(GET_TC(tc)->itemName);
- GET_TC(tc)->itemName = NULL;
- }
- Py_DECREF(GET_TC(tc)->dictObj);
+static void Dict_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
+ if (GET_TC(tc)->itemName) {
+ Py_DECREF(GET_TC(tc)->itemName);
+ GET_TC(tc)->itemName = NULL;
+ }
+ Py_DECREF(GET_TC(tc)->dictObj);
}
-JSOBJ Dict_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- return GET_TC(tc)->itemValue;
+static JSOBJ Dict_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
+ return GET_TC(tc)->itemValue;
}
-char *Dict_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
- size_t *outLen) {
- *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName);
- return PyBytes_AS_STRING(GET_TC(tc)->itemName);
+static char *Dict_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
+ size_t *outLen) {
+ *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName);
+ return PyBytes_AS_STRING(GET_TC(tc)->itemName);
}
-void NpyArr_freeLabels(char **labels, npy_intp len) {
- npy_intp i;
-
- if (labels) {
- for (i = 0; i < len; i++) {
- PyObject_Free(labels[i]);
- }
- PyObject_Free(labels);
+static void NpyArr_freeLabels(char **labels, npy_intp len) {
+ if (labels) {
+ for (npy_intp i = 0; i < len; i++) {
+ PyObject_Free(labels[i]);
}
+ PyObject_Free(labels);
+ }
}
/*
@@ -1253,907 +1193,874 @@ void NpyArr_freeLabels(char **labels, npy_intp len) {
* this has instead just stringified any input save for datetime values,
* which may need to be represented in various formats.
*/
-char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
- npy_intp num) {
- // NOTE this function steals a reference to labels.
- PyObject *item = NULL;
- size_t len;
- npy_intp i, stride;
- char **ret;
- char *dataptr, *cLabel;
- int type_num;
- PyArray_Descr *dtype;
- NPY_DATETIMEUNIT base = enc->datetimeUnit;
-
- if (!labels) {
- return 0;
- }
-
- if (PyArray_SIZE(labels) < num) {
- PyErr_SetString(
- PyExc_ValueError,
- "Label array sizes do not match corresponding data shape");
- Py_DECREF(labels);
- return 0;
- }
+static char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
+ npy_intp num) {
+ // NOTE this function steals a reference to labels.
+ PyObject *item = NULL;
+ const NPY_DATETIMEUNIT base = enc->datetimeUnit;
- ret = PyObject_Malloc(sizeof(char *) * num);
- if (!ret) {
- PyErr_NoMemory();
- Py_DECREF(labels);
- return 0;
- }
-
- for (i = 0; i < num; i++) {
- ret[i] = NULL;
- }
-
- stride = PyArray_STRIDE(labels, 0);
- dataptr = PyArray_DATA(labels);
- type_num = PyArray_TYPE(labels);
- dtype = PyArray_DESCR(labels);
+ if (!labels) {
+ return 0;
+ }
- for (i = 0; i < num; i++) {
- item = PyArray_GETITEM(labels, dataptr);
- if (!item) {
- NpyArr_freeLabels(ret, num);
- ret = 0;
- break;
- }
+ if (PyArray_SIZE(labels) < num) {
+ PyErr_SetString(PyExc_ValueError,
+ "Label array sizes do not match corresponding data shape");
+ Py_DECREF(labels);
+ return 0;
+ }
- int is_datetimelike = 0;
- npy_int64 i8date;
- NPY_DATETIMEUNIT dateUnit = NPY_FR_ns;
- if (PyTypeNum_ISDATETIME(type_num)) {
- is_datetimelike = 1;
- PyArray_VectorUnaryFunc *castfunc =
- PyArray_GetCastFunc(PyArray_DescrFromType(type_num), NPY_INT64);
- if (!castfunc) {
- PyErr_Format(PyExc_ValueError,
- "Cannot cast numpy dtype %d to long",
- enc->npyType);
- }
- castfunc(dataptr, &i8date, 1, NULL, NULL);
- dateUnit = get_datetime_metadata_from_dtype(dtype).base;
- } else if (PyDate_Check(item) || PyDelta_Check(item)) {
- is_datetimelike = 1;
- if (PyObject_HasAttrString(item, "_value")) {
- // see test_date_index_and_values for case with non-nano
- i8date = get_long_attr(item, "_value");
- } else {
- if (PyDelta_Check(item)) {
- i8date = total_seconds(item) *
- 1000000000LL; // nanoseconds per second
- } else {
- // datetime.* objects don't follow above rules
- i8date = PyDateTimeToEpoch(item, NPY_FR_ns);
- }
- }
+ char **ret = PyObject_Malloc(sizeof(char *) * num);
+ if (!ret) {
+ PyErr_NoMemory();
+ Py_DECREF(labels);
+ return 0;
+ }
+
+ for (npy_intp i = 0; i < num; i++) {
+ ret[i] = NULL;
+ }
+
+ const npy_intp stride = PyArray_STRIDE(labels, 0);
+ char *dataptr = PyArray_DATA(labels);
+ const int type_num = PyArray_TYPE(labels);
+ PyArray_Descr *dtype = PyArray_DESCR(labels);
+
+ for (npy_intp i = 0; i < num; i++) {
+ item = PyArray_GETITEM(labels, dataptr);
+ if (!item) {
+ NpyArr_freeLabels(ret, num);
+ ret = 0;
+ break;
+ }
+
+ int is_datetimelike = 0;
+ int64_t i8date;
+ NPY_DATETIMEUNIT dateUnit = NPY_FR_ns;
+ if (PyTypeNum_ISDATETIME(type_num)) {
+ is_datetimelike = 1;
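+      // datetime64/timedelta64 labels store an int64 value, so it can be
+      // read straight from the array buffer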
+ i8date = *(int64_t *)dataptr;
+ dateUnit = get_datetime_metadata_from_dtype(dtype).base;
+ } else if (PyDate_Check(item) || PyDelta_Check(item)) {
+ is_datetimelike = 1;
+ if (PyObject_HasAttrString(item, "_value")) {
+ // pd.Timestamp object or pd.NaT
+ // see test_date_index_and_values for case with non-nano
+ i8date = get_long_attr(item, "_value");
+ } else {
+ if (PyDelta_Check(item)) {
+          // TODO(anyone): the cast below loses precision if the value
+          // returned by total_seconds exceeds the number of bits the
+          // significand can hold; it is also liable to overflow
+ i8date = (int64_t)(total_seconds(item) *
+ 1000000000LL); // nanoseconds per second
+ } else {
+ // datetime.* objects don't follow above rules
+ i8date = PyDateTimeToEpoch(item, NPY_FR_ns);
}
+ }
+ }
- if (is_datetimelike) {
- if (i8date == get_nat()) {
- len = 4;
- cLabel = PyObject_Malloc(len + 1);
- strncpy(cLabel, "null", len + 1);
+ size_t len;
+ char *cLabel;
+ if (is_datetimelike) {
+ if (i8date == get_nat()) {
+ len = 4;
+ cLabel = PyObject_Malloc(len + 1);
+ strncpy(cLabel, "null", len + 1);
+ } else {
+ if (enc->datetimeIso) {
+ if ((type_num == NPY_TIMEDELTA) || (PyDelta_Check(item))) {
+ // TODO(username): non-nano timedelta support?
+ cLabel = int64ToIsoDuration(i8date, &len);
+ } else {
+ if (type_num == NPY_DATETIME) {
+ cLabel = int64ToIso(i8date, dateUnit, base, &len);
} else {
- if (enc->datetimeIso) {
- if ((type_num == NPY_TIMEDELTA) || (PyDelta_Check(item))) {
- // TODO(username): non-nano timedelta support?
- cLabel = int64ToIsoDuration(i8date, &len);
- } else {
- if (type_num == NPY_DATETIME) {
- cLabel = int64ToIso(i8date, dateUnit, base, &len);
- } else {
- cLabel = PyDateTimeToIso(item, base, &len);
- }
- }
- if (cLabel == NULL) {
- Py_DECREF(item);
- NpyArr_freeLabels(ret, num);
- ret = 0;
- break;
- }
- } else {
- int size_of_cLabel = 21; // 21 chars for int 64
- cLabel = PyObject_Malloc(size_of_cLabel);
- snprintf(cLabel, size_of_cLabel, "%" NPY_DATETIME_FMT,
- NpyDateTimeToEpoch(i8date, base));
- len = strlen(cLabel);
- }
- }
- } else { // Fallback to string representation
- // Replace item with the string to keep it alive.
- Py_SETREF(item, PyObject_Str(item));
- if (item == NULL) {
- NpyArr_freeLabels(ret, num);
- ret = 0;
- break;
+ cLabel = PyDateTimeToIso(item, base, &len);
}
-
- cLabel = (char *)PyUnicode_AsUTF8(item);
- len = strlen(cLabel);
- }
-
- // Add 1 to include NULL terminator
- ret[i] = PyObject_Malloc(len + 1);
- memcpy(ret[i], cLabel, len + 1);
- Py_DECREF(item);
-
- if (is_datetimelike) {
- PyObject_Free(cLabel);
- }
-
- if (PyErr_Occurred()) {
+ }
+ if (cLabel == NULL) {
+ Py_DECREF(item);
NpyArr_freeLabels(ret, num);
ret = 0;
break;
- }
-
- if (!ret[i]) {
- PyErr_NoMemory();
+ }
+ } else {
+ int size_of_cLabel = 21; // 21 chars for int 64
+ cLabel = PyObject_Malloc(size_of_cLabel);
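+        // scaleNanosecToUnit converts i8date in place from nanoseconds to
+        // the encoder's datetime unit before it is printed as an integer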
+ if (scaleNanosecToUnit(&i8date, base) == -1) {
+ NpyArr_freeLabels(ret, num);
ret = 0;
break;
+ }
+ snprintf(cLabel, size_of_cLabel, "%" PRId64, i8date);
+ len = strlen(cLabel);
}
+ }
+ } else { // Fallback to string representation
+ // Replace item with the string to keep it alive.
+ Py_SETREF(item, PyObject_Str(item));
+ if (item == NULL) {
+ NpyArr_freeLabels(ret, num);
+ ret = 0;
+ break;
+ }
- dataptr += stride;
+ cLabel = (char *)PyUnicode_AsUTF8(item);
+ len = strlen(cLabel);
}
- Py_DECREF(labels);
- return ret;
-}
+ // Add 1 to include NULL terminator
+ ret[i] = PyObject_Malloc(len + 1);
+ memcpy(ret[i], cLabel, len + 1);
+ Py_DECREF(item);
-void Object_invokeDefaultHandler(PyObject *obj, PyObjectEncoder *enc) {
- PyObject *tmpObj = NULL;
- tmpObj = PyObject_CallFunctionObjArgs(enc->defaultHandler, obj, NULL);
- if (!PyErr_Occurred()) {
- if (tmpObj == NULL) {
- PyErr_SetString(PyExc_TypeError,
- "Failed to execute default handler");
- } else {
- encode(tmpObj, (JSONObjectEncoder *)enc, NULL, 0);
- }
+ if (is_datetimelike) {
+ PyObject_Free(cLabel);
}
- Py_XDECREF(tmpObj);
- return;
-}
-
-void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
- PyObject *obj, *exc, *toDictFunc, *tmpObj, *values;
- TypeContext *pc;
- PyObjectEncoder *enc;
- double val;
- npy_int64 value;
- int unit;
- tc->prv = NULL;
-
- if (!_obj) {
- tc->type = JT_INVALID;
- return;
+ if (PyErr_Occurred()) {
+ NpyArr_freeLabels(ret, num);
+ ret = 0;
+ break;
}
- obj = (PyObject *)_obj;
- enc = (PyObjectEncoder *)tc->encoder;
-
- if (PyBool_Check(obj)) {
- tc->type = (obj == Py_True) ? JT_TRUE : JT_FALSE;
- return;
- } else if (obj == Py_None) {
- tc->type = JT_NULL;
- return;
+ if (!ret[i]) {
+ PyErr_NoMemory();
+ ret = 0;
+ break;
}
- pc = createTypeContext();
- if (!pc) {
- tc->type = JT_INVALID;
- return;
- }
- tc->prv = pc;
-
- if (PyTypeNum_ISDATETIME(enc->npyType)) {
- int64_t longVal;
- PyArray_VectorUnaryFunc *castfunc =
- PyArray_GetCastFunc(PyArray_DescrFromType(enc->npyType), NPY_INT64);
- if (!castfunc) {
- PyErr_Format(PyExc_ValueError, "Cannot cast numpy dtype %d to long",
- enc->npyType);
- }
- castfunc(enc->npyValue, &longVal, 1, NULL, NULL);
- if (longVal == get_nat()) {
- tc->type = JT_NULL;
- } else {
- if (enc->datetimeIso) {
- if (enc->npyType == NPY_TIMEDELTA) {
- pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback;
- } else {
- pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback;
- }
- // Currently no way to pass longVal to iso function, so use
- // state management
- GET_TC(tc)->longValue = longVal;
- tc->type = JT_UTF8;
- } else {
- NPY_DATETIMEUNIT base =
- ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
- GET_TC(tc)->longValue = NpyDateTimeToEpoch(longVal, base);
- tc->type = JT_LONG;
- }
- }
+ dataptr += stride;
+ }
- // TODO(username): this prevents infinite loop with
- // mixed-type DataFrames;
- // refactor
- enc->npyCtxtPassthru = NULL;
- enc->npyType = -1;
- return;
- }
+ Py_DECREF(labels);
+ return ret;
+}
- if (PyIter_Check(obj) ||
- (PyArray_Check(obj) && !PyArray_CheckScalar(obj))) {
- goto ISITERABLE;
+static void Object_invokeDefaultHandler(PyObject *obj, PyObjectEncoder *enc) {
+ PyObject *tmpObj = NULL;
+ tmpObj = PyObject_CallFunctionObjArgs(enc->defaultHandler, obj, NULL);
+ if (!PyErr_Occurred()) {
+ if (tmpObj == NULL) {
+ PyErr_SetString(PyExc_TypeError, "Failed to execute default handler");
+ } else {
+ encode(tmpObj, (JSONObjectEncoder *)enc, NULL, 0);
}
+ }
+ Py_XDECREF(tmpObj);
+ return;
+}
- if (PyLong_Check(obj)) {
- tc->type = JT_LONG;
- int overflow = 0;
- GET_TC(tc)->longValue = PyLong_AsLongLongAndOverflow(obj, &overflow);
- int err;
- err = (GET_TC(tc)->longValue == -1) && PyErr_Occurred();
-
- if (overflow) {
- tc->type = JT_BIGNUM;
- } else if (err) {
- goto INVALID;
- }
+static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
+ tc->prv = NULL;
- return;
- } else if (PyFloat_Check(obj)) {
- val = PyFloat_AS_DOUBLE(obj);
- if (npy_isnan(val) || npy_isinf(val)) {
- tc->type = JT_NULL;
- } else {
- GET_TC(tc)->doubleValue = val;
- tc->type = JT_DOUBLE;
- }
- return;
- } else if (PyBytes_Check(obj)) {
- pc->PyTypeToUTF8 = PyBytesToUTF8;
- tc->type = JT_UTF8;
- return;
- } else if (PyUnicode_Check(obj)) {
- pc->PyTypeToUTF8 = PyUnicodeToUTF8;
- tc->type = JT_UTF8;
- return;
- } else if (object_is_decimal_type(obj)) {
- GET_TC(tc)->doubleValue = PyFloat_AsDouble(obj);
- tc->type = JT_DOUBLE;
- return;
- } else if (PyDateTime_Check(obj) || PyDate_Check(obj)) {
- if (object_is_nat_type(obj)) {
- tc->type = JT_NULL;
- return;
- }
+ if (!_obj) {
+ tc->type = JT_INVALID;
+ return;
+ }
- if (enc->datetimeIso) {
- pc->PyTypeToUTF8 = PyDateTimeToIsoCallback;
- tc->type = JT_UTF8;
- } else {
- NPY_DATETIMEUNIT base =
- ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
- GET_TC(tc)->longValue = PyDateTimeToEpoch(obj, base);
- tc->type = JT_LONG;
- }
- return;
- } else if (PyTime_Check(obj)) {
- pc->PyTypeToUTF8 = PyTimeToJSON;
- tc->type = JT_UTF8;
- return;
- } else if (PyArray_IsScalar(obj, Datetime)) {
- npy_int64 longVal;
- if (((PyDatetimeScalarObject *)obj)->obval == get_nat()) {
- tc->type = JT_NULL;
- return;
- }
- PyArray_Descr *dtype = PyArray_DescrFromScalar(obj);
- if (!PyTypeNum_ISDATETIME(dtype->type_num)) {
- PyErr_Format(PyExc_ValueError, "Could not get resolution of datetime");
- return;
- }
+ PyObject *obj = (PyObject *)_obj;
+ PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder;
- PyArray_Descr *outcode = PyArray_DescrFromType(NPY_INT64);
- PyArray_CastScalarToCtype(obj, &longVal, outcode);
- Py_DECREF(outcode);
+ if (PyBool_Check(obj)) {
+ tc->type = (obj == Py_True) ? JT_TRUE : JT_FALSE;
+ return;
+ } else if (obj == Py_None) {
+ tc->type = JT_NULL;
+ return;
+ }
- if (enc->datetimeIso) {
- GET_TC(tc)->longValue = longVal;
- pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback;
- enc->valueUnit = get_datetime_metadata_from_dtype(dtype).base;
- tc->type = JT_UTF8;
- } else {
- NPY_DATETIMEUNIT base =
- ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
- GET_TC(tc)->longValue = PyDateTimeToEpoch(obj, base);
- tc->type = JT_LONG;
- }
- return;
- } else if (PyDelta_Check(obj)) {
- if (PyObject_HasAttrString(obj, "_value")) {
- value = get_long_attr(obj, "_value");
- } else {
- value = total_seconds(obj) * 1000000000LL; // nanoseconds per sec
- }
+ TypeContext *pc = createTypeContext();
+ if (!pc) {
+ tc->type = JT_INVALID;
+ return;
+ }
+ tc->prv = pc;
- if (value == get_nat()) {
- tc->type = JT_NULL;
- return;
- } else if (enc->datetimeIso) {
- pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback;
- tc->type = JT_UTF8;
+ if (PyTypeNum_ISDATETIME(enc->npyType)) {
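+    // enc->npyValue points at the raw array element stashed by
+    // NpyArr_iterNextItem, so the int64 value can be read from it directly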
+ int64_t longVal = *(npy_int64 *)enc->npyValue;
+ if (longVal == get_nat()) {
+ tc->type = JT_NULL;
+ } else {
+ if (enc->datetimeIso) {
+ if (enc->npyType == NPY_TIMEDELTA) {
+ pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback;
} else {
- unit = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
- if (scaleNanosecToUnit(&value, unit) != 0) {
- // TODO(username): Add some kind of error handling here
- }
-
- exc = PyErr_Occurred();
-
- if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) {
- goto INVALID;
- }
-
- tc->type = JT_LONG;
+ pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback;
}
- GET_TC(tc)->longValue = value;
- return;
- } else if (PyArray_IsScalar(obj, Integer)) {
+ // Currently no way to pass longVal to iso function, so use
+ // state management
+ pc->longValue = longVal;
+ tc->type = JT_UTF8;
+ } else {
+ NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
+ if (scaleNanosecToUnit(&longVal, base) == -1) {
+ goto INVALID;
+ }
+ pc->longValue = longVal;
tc->type = JT_LONG;
- PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->longValue),
- PyArray_DescrFromType(NPY_INT64));
-
- exc = PyErr_Occurred();
+ }
+ }
- if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) {
- goto INVALID;
- }
+  // TODO(username): this prevents an infinite loop with
+  // mixed-type DataFrames; refactor
+ enc->npyCtxtPassthru = NULL;
+ enc->npyType = -1;
+ return;
+ }
- return;
- } else if (PyArray_IsScalar(obj, Bool)) {
- PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->longValue),
- PyArray_DescrFromType(NPY_BOOL));
- tc->type = (GET_TC(tc)->longValue) ? JT_TRUE : JT_FALSE;
- return;
- } else if (PyArray_IsScalar(obj, Float) || PyArray_IsScalar(obj, Double)) {
- PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->doubleValue),
- PyArray_DescrFromType(NPY_DOUBLE));
- tc->type = JT_DOUBLE;
- return;
- } else if (PyArray_Check(obj) && PyArray_CheckScalar(obj)) {
- PyErr_Format(PyExc_TypeError,
- "%R (0d array) is not JSON serializable at the moment",
- obj);
- goto INVALID;
- } else if (object_is_na_type(obj)) {
- tc->type = JT_NULL;
- return;
- }
+ if (PyIter_Check(obj) || (PyArray_Check(obj) && !PyArray_CheckScalar(obj))) {
+ goto ISITERABLE;
+ }
-ISITERABLE:
+ if (PyLong_Check(obj)) {
+ tc->type = JT_LONG;
+ int overflow = 0;
+ pc->longValue = PyLong_AsLongLongAndOverflow(obj, &overflow);
+ int err;
+ err = (pc->longValue == -1) && PyErr_Occurred();
- if (object_is_index_type(obj)) {
- if (enc->outputFormat == SPLIT) {
- tc->type = JT_OBJECT;
- pc->iterBegin = Index_iterBegin;
- pc->iterEnd = Index_iterEnd;
- pc->iterNext = Index_iterNext;
- pc->iterGetValue = Index_iterGetValue;
- pc->iterGetName = Index_iterGetName;
- return;
- }
+ if (overflow) {
+ tc->type = JT_BIGNUM;
+ } else if (err) {
+ goto INVALID;
+ }
- pc->newObj = get_values(obj);
- if (pc->newObj) {
- tc->type = JT_ARRAY;
- pc->iterBegin = NpyArr_iterBegin;
- pc->iterEnd = NpyArr_iterEnd;
- pc->iterNext = NpyArr_iterNext;
- pc->iterGetValue = NpyArr_iterGetValue;
- pc->iterGetName = NpyArr_iterGetName;
- } else {
- goto INVALID;
- }
+ return;
+ } else if (PyFloat_Check(obj)) {
+ const double val = PyFloat_AS_DOUBLE(obj);
+ if (npy_isnan(val) || npy_isinf(val)) {
+ tc->type = JT_NULL;
+ } else {
+ pc->doubleValue = val;
+ tc->type = JT_DOUBLE;
+ }
+ return;
+ } else if (PyBytes_Check(obj)) {
+ pc->PyTypeToUTF8 = PyBytesToUTF8;
+ tc->type = JT_UTF8;
+ return;
+ } else if (PyUnicode_Check(obj)) {
+ pc->PyTypeToUTF8 = PyUnicodeToUTF8;
+ tc->type = JT_UTF8;
+ return;
+ } else if (object_is_decimal_type(obj)) {
+ pc->doubleValue = PyFloat_AsDouble(obj);
+ tc->type = JT_DOUBLE;
+ return;
+ } else if (PyDateTime_Check(obj) || PyDate_Check(obj)) {
+ if (object_is_nat_type(obj)) {
+ tc->type = JT_NULL;
+ return;
+ }
- return;
- } else if (object_is_series_type(obj)) {
- if (enc->outputFormat == SPLIT) {
- tc->type = JT_OBJECT;
- pc->iterBegin = Series_iterBegin;
- pc->iterEnd = Series_iterEnd;
- pc->iterNext = Series_iterNext;
- pc->iterGetValue = Series_iterGetValue;
- pc->iterGetName = Series_iterGetName;
- return;
- }
+ if (enc->datetimeIso) {
+ pc->PyTypeToUTF8 = PyDateTimeToIsoCallback;
+ tc->type = JT_UTF8;
+ } else {
+ NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
+ pc->longValue = PyDateTimeToEpoch(obj, base);
+ tc->type = JT_LONG;
+ }
+ return;
+ } else if (PyTime_Check(obj)) {
+ pc->PyTypeToUTF8 = PyTimeToJSON;
+ tc->type = JT_UTF8;
+ return;
+ } else if (PyArray_IsScalar(obj, Datetime)) {
+ npy_int64 longVal;
+ if (((PyDatetimeScalarObject *)obj)->obval == get_nat()) {
+ tc->type = JT_NULL;
+ return;
+ }
+ PyArray_Descr *dtype = PyArray_DescrFromScalar(obj);
+ if (!PyTypeNum_ISDATETIME(dtype->type_num)) {
+ PyErr_Format(PyExc_ValueError, "Could not get resolution of datetime");
+ return;
+ }
+
+ PyArray_Descr *outcode = PyArray_DescrFromType(NPY_INT64);
+ PyArray_CastScalarToCtype(obj, &longVal, outcode);
+ Py_DECREF(outcode);
+
+ if (enc->datetimeIso) {
+ GET_TC(tc)->longValue = longVal;
+ pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback;
+ enc->valueUnit = get_datetime_metadata_from_dtype(dtype).base;
+ tc->type = JT_UTF8;
+ } else {
+ NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
+ pc->longValue = PyDateTimeToEpoch(obj, base);
+ tc->type = JT_LONG;
+ }
+ return;
+ } else if (PyDelta_Check(obj)) {
+    // a pd.Timedelta object or pd.NaT has a "_value" attribute and takes
+    // the first branch; fall back to nanoseconds per second otherwise
+    // TODO(anyone): the cast below loses precision if the value returned
+    // by total_seconds exceeds the number of bits the significand can
+    // hold; it is also liable to overflow
+ int64_t value = PyObject_HasAttrString(obj, "_value")
+ ? get_long_attr(obj, "_value")
+ : (int64_t)(total_seconds(obj) * 1000000000LL);
+
+ if (value == get_nat()) {
+ tc->type = JT_NULL;
+ return;
+ } else if (enc->datetimeIso) {
+ pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback;
+ tc->type = JT_UTF8;
+ } else {
+ const int unit = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
+ if (scaleNanosecToUnit(&value, unit) != 0) {
+ // TODO(username): Add some kind of error handling here
+ }
- pc->newObj = get_values(obj);
- if (!pc->newObj) {
- goto INVALID;
- }
+ if (PyErr_Occurred() && PyErr_ExceptionMatches(PyExc_OverflowError)) {
+ goto INVALID;
+ }
- if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) {
- tc->type = JT_OBJECT;
- tmpObj = PyObject_GetAttrString(obj, "index");
- if (!tmpObj) {
- goto INVALID;
- }
- values = get_values(tmpObj);
- Py_DECREF(tmpObj);
- if (!values) {
- goto INVALID;
- }
- pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0);
- pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc,
- pc->columnLabelsLen);
- if (!pc->columnLabels) {
- goto INVALID;
- }
- } else {
- tc->type = JT_ARRAY;
- }
- pc->iterBegin = NpyArr_iterBegin;
- pc->iterEnd = NpyArr_iterEnd;
- pc->iterNext = NpyArr_iterNext;
- pc->iterGetValue = NpyArr_iterGetValue;
- pc->iterGetName = NpyArr_iterGetName;
- return;
- } else if (PyArray_Check(obj)) {
- if (enc->npyCtxtPassthru) {
- pc->npyarr = enc->npyCtxtPassthru;
- tc->type = (pc->npyarr->columnLabels ? JT_OBJECT : JT_ARRAY);
-
- pc->iterBegin = NpyArrPassThru_iterBegin;
- pc->iterNext = NpyArr_iterNext;
- pc->iterEnd = NpyArrPassThru_iterEnd;
- pc->iterGetValue = NpyArr_iterGetValue;
- pc->iterGetName = NpyArr_iterGetName;
-
- enc->npyCtxtPassthru = NULL;
- return;
- }
+ tc->type = JT_LONG;
+ }
+ pc->longValue = value;
+ return;
+ } else if (PyArray_IsScalar(obj, Integer)) {
+ tc->type = JT_LONG;
+ PyArray_CastScalarToCtype(obj, &(pc->longValue),
+ PyArray_DescrFromType(NPY_INT64));
- tc->type = JT_ARRAY;
- pc->iterBegin = NpyArr_iterBegin;
- pc->iterEnd = NpyArr_iterEnd;
- pc->iterNext = NpyArr_iterNext;
- pc->iterGetValue = NpyArr_iterGetValue;
- pc->iterGetName = NpyArr_iterGetName;
- return;
- } else if (object_is_dataframe_type(obj)) {
- if (enc->blkCtxtPassthru) {
- pc->pdblock = enc->blkCtxtPassthru;
- tc->type =
- (pc->pdblock->npyCtxts[0]->columnLabels ? JT_OBJECT : JT_ARRAY);
-
- pc->iterBegin = PdBlockPassThru_iterBegin;
- pc->iterEnd = PdBlockPassThru_iterEnd;
- pc->iterNext = PdBlock_iterNextItem;
- pc->iterGetName = PdBlock_iterGetName;
- pc->iterGetValue = NpyArr_iterGetValue;
-
- enc->blkCtxtPassthru = NULL;
- return;
- }
+ if (PyErr_Occurred() && PyErr_ExceptionMatches(PyExc_OverflowError)) {
+ goto INVALID;
+ }
- if (enc->outputFormat == SPLIT) {
- tc->type = JT_OBJECT;
- pc->iterBegin = DataFrame_iterBegin;
- pc->iterEnd = DataFrame_iterEnd;
- pc->iterNext = DataFrame_iterNext;
- pc->iterGetValue = DataFrame_iterGetValue;
- pc->iterGetName = DataFrame_iterGetName;
- return;
- }
+ return;
+ } else if (PyArray_IsScalar(obj, Bool)) {
+ PyArray_CastScalarToCtype(obj, &(pc->longValue),
+ PyArray_DescrFromType(NPY_BOOL));
+ tc->type = (pc->longValue) ? JT_TRUE : JT_FALSE;
+ return;
+ } else if (PyArray_IsScalar(obj, Float) || PyArray_IsScalar(obj, Double)) {
+ PyArray_CastScalarToCtype(obj, &(pc->doubleValue),
+ PyArray_DescrFromType(NPY_DOUBLE));
+ tc->type = JT_DOUBLE;
+ return;
+ } else if (PyArray_CheckScalar(obj)) {
+ PyErr_Format(PyExc_TypeError,
+ "%R (numpy-scalar) is not JSON serializable at the moment",
+ obj);
+ goto INVALID;
+ } else if (object_is_na_type(obj)) {
+ tc->type = JT_NULL;
+ return;
+ }
- if (is_simple_frame(obj)) {
- pc->iterBegin = NpyArr_iterBegin;
- pc->iterEnd = NpyArr_iterEnd;
- pc->iterNext = NpyArr_iterNext;
- pc->iterGetName = NpyArr_iterGetName;
+ISITERABLE:
- pc->newObj = PyObject_GetAttrString(obj, "values");
- if (!pc->newObj) {
- goto INVALID;
- }
- } else {
- pc->iterBegin = PdBlock_iterBegin;
- pc->iterEnd = PdBlock_iterEnd;
- pc->iterNext = PdBlock_iterNext;
- pc->iterGetName = PdBlock_iterGetName;
- }
- pc->iterGetValue = NpyArr_iterGetValue;
-
- if (enc->outputFormat == VALUES) {
- tc->type = JT_ARRAY;
- } else if (enc->outputFormat == RECORDS) {
- tc->type = JT_ARRAY;
- tmpObj = PyObject_GetAttrString(obj, "columns");
- if (!tmpObj) {
- goto INVALID;
- }
- values = get_values(tmpObj);
- if (!values) {
- Py_DECREF(tmpObj);
- goto INVALID;
- }
- pc->columnLabelsLen = PyObject_Size(tmpObj);
- pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc,
- pc->columnLabelsLen);
- Py_DECREF(tmpObj);
- if (!pc->columnLabels) {
- goto INVALID;
- }
- } else if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) {
- tc->type = JT_OBJECT;
- tmpObj = (enc->outputFormat == INDEX
- ? PyObject_GetAttrString(obj, "index")
- : PyObject_GetAttrString(obj, "columns"));
- if (!tmpObj) {
- goto INVALID;
- }
- values = get_values(tmpObj);
- if (!values) {
- Py_DECREF(tmpObj);
- goto INVALID;
- }
- pc->rowLabelsLen = PyObject_Size(tmpObj);
- pc->rowLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc,
- pc->rowLabelsLen);
- Py_DECREF(tmpObj);
- tmpObj = (enc->outputFormat == INDEX
- ? PyObject_GetAttrString(obj, "columns")
- : PyObject_GetAttrString(obj, "index"));
- if (!tmpObj) {
- NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen);
- pc->rowLabels = NULL;
- goto INVALID;
- }
- values = get_values(tmpObj);
- if (!values) {
- Py_DECREF(tmpObj);
- NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen);
- pc->rowLabels = NULL;
- goto INVALID;
- }
- pc->columnLabelsLen = PyObject_Size(tmpObj);
- pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc,
- pc->columnLabelsLen);
- Py_DECREF(tmpObj);
- if (!pc->columnLabels) {
- NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen);
- pc->rowLabels = NULL;
- goto INVALID;
- }
+ if (object_is_index_type(obj)) {
+ if (enc->outputFormat == SPLIT) {
+ tc->type = JT_OBJECT;
+ pc->iterBegin = Index_iterBegin;
+ pc->iterEnd = Index_iterEnd;
+ pc->iterNext = Index_iterNext;
+ pc->iterGetValue = Index_iterGetValue;
+ pc->iterGetName = Index_iterGetName;
+ return;
+ }
+
+ pc->newObj = get_values(obj);
+ if (pc->newObj) {
+ tc->type = JT_ARRAY;
+ pc->iterBegin = NpyArr_iterBegin;
+ pc->iterEnd = NpyArr_iterEnd;
+ pc->iterNext = NpyArr_iterNext;
+ pc->iterGetValue = NpyArr_iterGetValue;
+ pc->iterGetName = NpyArr_iterGetName;
+ } else {
+ goto INVALID;
+ }
- if (enc->outputFormat == COLUMNS) {
- pc->transpose = 1;
- }
- } else {
- goto INVALID;
- }
- return;
- } else if (PyDict_Check(obj)) {
- tc->type = JT_OBJECT;
- pc->iterBegin = Dict_iterBegin;
- pc->iterEnd = Dict_iterEnd;
- pc->iterNext = Dict_iterNext;
- pc->iterGetValue = Dict_iterGetValue;
- pc->iterGetName = Dict_iterGetName;
- pc->dictObj = obj;
- Py_INCREF(obj);
-
- return;
- } else if (PyList_Check(obj)) {
- tc->type = JT_ARRAY;
- pc->iterBegin = List_iterBegin;
- pc->iterEnd = List_iterEnd;
- pc->iterNext = List_iterNext;
- pc->iterGetValue = List_iterGetValue;
- pc->iterGetName = List_iterGetName;
- return;
- } else if (PyTuple_Check(obj)) {
- tc->type = JT_ARRAY;
- pc->iterBegin = Tuple_iterBegin;
- pc->iterEnd = Tuple_iterEnd;
- pc->iterNext = Tuple_iterNext;
- pc->iterGetValue = Tuple_iterGetValue;
- pc->iterGetName = Tuple_iterGetName;
- return;
- } else if (PyAnySet_Check(obj)) {
- tc->type = JT_ARRAY;
- pc->iterBegin = Set_iterBegin;
- pc->iterEnd = Set_iterEnd;
- pc->iterNext = Set_iterNext;
- pc->iterGetValue = Set_iterGetValue;
- pc->iterGetName = Set_iterGetName;
- return;
+ return;
+ } else if (object_is_series_type(obj)) {
+ if (enc->outputFormat == SPLIT) {
+ tc->type = JT_OBJECT;
+ pc->iterBegin = Series_iterBegin;
+ pc->iterEnd = Series_iterEnd;
+ pc->iterNext = Series_iterNext;
+ pc->iterGetValue = Series_iterGetValue;
+ pc->iterGetName = Series_iterGetName;
+ return;
+ }
+
+ pc->newObj = get_values(obj);
+ if (!pc->newObj) {
+ goto INVALID;
+ }
+
+ if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) {
+ tc->type = JT_OBJECT;
+ PyObject *tmpObj = PyObject_GetAttrString(obj, "index");
+ if (!tmpObj) {
+ goto INVALID;
+ }
+ PyObject *values = get_values(tmpObj);
+ Py_DECREF(tmpObj);
+ if (!values) {
+ goto INVALID;
+ }
+ pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0);
+ pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc,
+ pc->columnLabelsLen);
+ if (!pc->columnLabels) {
+ goto INVALID;
+ }
+ } else {
+ tc->type = JT_ARRAY;
}
+ pc->iterBegin = NpyArr_iterBegin;
+ pc->iterEnd = NpyArr_iterEnd;
+ pc->iterNext = NpyArr_iterNext;
+ pc->iterGetValue = NpyArr_iterGetValue;
+ pc->iterGetName = NpyArr_iterGetName;
+ return;
+ } else if (PyArray_Check(obj)) {
+ if (enc->npyCtxtPassthru) {
+ pc->npyarr = enc->npyCtxtPassthru;
+ tc->type = (pc->npyarr->columnLabels ? JT_OBJECT : JT_ARRAY);
+
+ pc->iterBegin = NpyArrPassThru_iterBegin;
+ pc->iterNext = NpyArr_iterNext;
+ pc->iterEnd = NpyArrPassThru_iterEnd;
+ pc->iterGetValue = NpyArr_iterGetValue;
+ pc->iterGetName = NpyArr_iterGetName;
+
+ enc->npyCtxtPassthru = NULL;
+ return;
+ }
+
+ tc->type = JT_ARRAY;
+ pc->iterBegin = NpyArr_iterBegin;
+ pc->iterEnd = NpyArr_iterEnd;
+ pc->iterNext = NpyArr_iterNext;
+ pc->iterGetValue = NpyArr_iterGetValue;
+ pc->iterGetName = NpyArr_iterGetName;
+ return;
+ } else if (object_is_dataframe_type(obj)) {
+ if (enc->blkCtxtPassthru) {
+ pc->pdblock = enc->blkCtxtPassthru;
+ tc->type =
+ (pc->pdblock->npyCtxts[0]->columnLabels ? JT_OBJECT : JT_ARRAY);
+
+ pc->iterBegin = PdBlockPassThru_iterBegin;
+ pc->iterEnd = PdBlockPassThru_iterEnd;
+ pc->iterNext = PdBlock_iterNextItem;
+ pc->iterGetName = PdBlock_iterGetName;
+ pc->iterGetValue = NpyArr_iterGetValue;
+
+ enc->blkCtxtPassthru = NULL;
+ return;
+ }
+
+ if (enc->outputFormat == SPLIT) {
+ tc->type = JT_OBJECT;
+ pc->iterBegin = DataFrame_iterBegin;
+ pc->iterEnd = DataFrame_iterEnd;
+ pc->iterNext = DataFrame_iterNext;
+ pc->iterGetValue = DataFrame_iterGetValue;
+ pc->iterGetName = DataFrame_iterGetName;
+ return;
+ }
+
+ pc->iterBegin = PdBlock_iterBegin;
+ pc->iterEnd = PdBlock_iterEnd;
+ pc->iterNext = PdBlock_iterNext;
+ pc->iterGetName = PdBlock_iterGetName;
+ pc->iterGetValue = NpyArr_iterGetValue;
+
+ if (enc->outputFormat == VALUES) {
+ tc->type = JT_ARRAY;
+ } else if (enc->outputFormat == RECORDS) {
+ tc->type = JT_ARRAY;
+ PyObject *tmpObj = PyObject_GetAttrString(obj, "columns");
+ if (!tmpObj) {
+ goto INVALID;
+ }
+ PyObject *values = get_values(tmpObj);
+ if (!values) {
+ Py_DECREF(tmpObj);
+ goto INVALID;
+ }
+ pc->columnLabelsLen = PyObject_Size(tmpObj);
+ pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc,
+ pc->columnLabelsLen);
+ Py_DECREF(tmpObj);
+ if (!pc->columnLabels) {
+ goto INVALID;
+ }
+ } else if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) {
+ tc->type = JT_OBJECT;
+ PyObject *tmpObj =
+ (enc->outputFormat == INDEX ? PyObject_GetAttrString(obj, "index")
+ : PyObject_GetAttrString(obj, "columns"));
+ if (!tmpObj) {
+ goto INVALID;
+ }
+ PyObject *values = get_values(tmpObj);
+ if (!values) {
+ Py_DECREF(tmpObj);
+ goto INVALID;
+ }
+ pc->rowLabelsLen = PyObject_Size(tmpObj);
+ pc->rowLabels =
+ NpyArr_encodeLabels((PyArrayObject *)values, enc, pc->rowLabelsLen);
+ Py_DECREF(tmpObj);
+ tmpObj =
+ (enc->outputFormat == INDEX ? PyObject_GetAttrString(obj, "columns")
+ : PyObject_GetAttrString(obj, "index"));
+ if (!tmpObj) {
+ NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen);
+ pc->rowLabels = NULL;
+ goto INVALID;
+ }
+ values = get_values(tmpObj);
+ if (!values) {
+ Py_DECREF(tmpObj);
+ NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen);
+ pc->rowLabels = NULL;
+ goto INVALID;
+ }
+ pc->columnLabelsLen = PyObject_Size(tmpObj);
+ pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc,
+ pc->columnLabelsLen);
+ Py_DECREF(tmpObj);
+ if (!pc->columnLabels) {
+ NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen);
+ pc->rowLabels = NULL;
+ goto INVALID;
+ }
- toDictFunc = PyObject_GetAttrString(obj, "toDict");
+ if (enc->outputFormat == COLUMNS) {
+ pc->transpose = 1;
+ }
+ } else {
+ goto INVALID;
+ }
+ return;
+ } else if (PyDict_Check(obj)) {
+ tc->type = JT_OBJECT;
+ pc->iterBegin = Dict_iterBegin;
+ pc->iterEnd = Dict_iterEnd;
+ pc->iterNext = Dict_iterNext;
+ pc->iterGetValue = Dict_iterGetValue;
+ pc->iterGetName = Dict_iterGetName;
+ pc->dictObj = obj;
+ Py_INCREF(obj);
- if (toDictFunc) {
- PyObject *tuple = PyTuple_New(0);
- PyObject *toDictResult = PyObject_Call(toDictFunc, tuple, NULL);
- Py_DECREF(tuple);
- Py_DECREF(toDictFunc);
+ return;
+ } else if (PyList_Check(obj)) {
+ tc->type = JT_ARRAY;
+ pc->iterBegin = List_iterBegin;
+ pc->iterEnd = List_iterEnd;
+ pc->iterNext = List_iterNext;
+ pc->iterGetValue = List_iterGetValue;
+ pc->iterGetName = List_iterGetName;
+ return;
+ } else if (PyTuple_Check(obj)) {
+ tc->type = JT_ARRAY;
+ pc->iterBegin = Tuple_iterBegin;
+ pc->iterEnd = Tuple_iterEnd;
+ pc->iterNext = Tuple_iterNext;
+ pc->iterGetValue = Tuple_iterGetValue;
+ pc->iterGetName = Tuple_iterGetName;
+ return;
+ } else if (PyAnySet_Check(obj)) {
+ tc->type = JT_ARRAY;
+ pc->iterBegin = Set_iterBegin;
+ pc->iterEnd = Set_iterEnd;
+ pc->iterNext = Set_iterNext;
+ pc->iterGetValue = Set_iterGetValue;
+ pc->iterGetName = Set_iterGetName;
+ return;
+ }
- if (toDictResult == NULL) {
- PyErr_Clear();
- tc->type = JT_NULL;
- return;
- }
+ PyObject *toDictFunc = PyObject_GetAttrString(obj, "toDict");
- if (!PyDict_Check(toDictResult)) {
- Py_DECREF(toDictResult);
- tc->type = JT_NULL;
- return;
- }
+ if (toDictFunc) {
+ PyObject *tuple = PyTuple_New(0);
+ PyObject *toDictResult = PyObject_Call(toDictFunc, tuple, NULL);
+ Py_DECREF(tuple);
+ Py_DECREF(toDictFunc);
- tc->type = JT_OBJECT;
- pc->iterBegin = Dict_iterBegin;
- pc->iterEnd = Dict_iterEnd;
- pc->iterNext = Dict_iterNext;
- pc->iterGetValue = Dict_iterGetValue;
- pc->iterGetName = Dict_iterGetName;
- pc->dictObj = toDictResult;
- return;
+ if (toDictResult == NULL) {
+ PyErr_Clear();
+ tc->type = JT_NULL;
+ return;
}
- PyErr_Clear();
-
- if (enc->defaultHandler) {
- Object_invokeDefaultHandler(obj, enc);
- goto INVALID;
+ if (!PyDict_Check(toDictResult)) {
+ Py_DECREF(toDictResult);
+ tc->type = JT_NULL;
+ return;
}
tc->type = JT_OBJECT;
- pc->iterBegin = Dir_iterBegin;
- pc->iterEnd = Dir_iterEnd;
- pc->iterNext = Dir_iterNext;
- pc->iterGetValue = Dir_iterGetValue;
- pc->iterGetName = Dir_iterGetName;
+ pc->iterBegin = Dict_iterBegin;
+ pc->iterEnd = Dict_iterEnd;
+ pc->iterNext = Dict_iterNext;
+ pc->iterGetValue = Dict_iterGetValue;
+ pc->iterGetName = Dict_iterGetName;
+ pc->dictObj = toDictResult;
return;
+ }
+
+ PyErr_Clear();
+
+ if (enc->defaultHandler) {
+ Object_invokeDefaultHandler(obj, enc);
+ goto INVALID;
+ }
+
+ tc->type = JT_OBJECT;
+ pc->iterBegin = Dir_iterBegin;
+ pc->iterEnd = Dir_iterEnd;
+ pc->iterNext = Dir_iterNext;
+ pc->iterGetValue = Dir_iterGetValue;
+ pc->iterGetName = Dir_iterGetName;
+ return;
INVALID:
- tc->type = JT_INVALID;
+ tc->type = JT_INVALID;
+ PyObject_Free(tc->prv);
+ tc->prv = NULL;
+ return;
+}
+
+static void Object_endTypeContext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
+ if (tc->prv) {
+ Py_XDECREF(GET_TC(tc)->newObj);
+ GET_TC(tc)->newObj = NULL;
+ NpyArr_freeLabels(GET_TC(tc)->rowLabels, GET_TC(tc)->rowLabelsLen);
+ GET_TC(tc)->rowLabels = NULL;
+ NpyArr_freeLabels(GET_TC(tc)->columnLabels, GET_TC(tc)->columnLabelsLen);
+ GET_TC(tc)->columnLabels = NULL;
+ PyObject_Free(GET_TC(tc)->cStr);
+ GET_TC(tc)->cStr = NULL;
PyObject_Free(tc->prv);
tc->prv = NULL;
- return;
-}
-
-void Object_endTypeContext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- if (tc->prv) {
- Py_XDECREF(GET_TC(tc)->newObj);
- GET_TC(tc)->newObj = NULL;
- NpyArr_freeLabels(GET_TC(tc)->rowLabels, GET_TC(tc)->rowLabelsLen);
- GET_TC(tc)->rowLabels = NULL;
- NpyArr_freeLabels(GET_TC(tc)->columnLabels,
- GET_TC(tc)->columnLabelsLen);
- GET_TC(tc)->columnLabels = NULL;
- PyObject_Free(GET_TC(tc)->cStr);
- GET_TC(tc)->cStr = NULL;
- PyObject_Free(tc->prv);
- tc->prv = NULL;
- }
+ }
}
-const char *Object_getStringValue(JSOBJ obj, JSONTypeContext *tc,
- size_t *_outLen) {
- return GET_TC(tc)->PyTypeToUTF8(obj, tc, _outLen);
+static const char *Object_getStringValue(JSOBJ obj, JSONTypeContext *tc,
+ size_t *_outLen) {
+ return GET_TC(tc)->PyTypeToUTF8(obj, tc, _outLen);
}
-JSINT64 Object_getLongValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- return GET_TC(tc)->longValue;
+static JSINT64 Object_getLongValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
+ return GET_TC(tc)->longValue;
}
-double Object_getDoubleValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- return GET_TC(tc)->doubleValue;
+static double Object_getDoubleValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
+ return GET_TC(tc)->doubleValue;
}
-const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc,
- size_t *_outLen) {
- PyObject *repr = PyObject_Str(obj);
- const char *str = PyUnicode_AsUTF8AndSize(repr, (Py_ssize_t *)_outLen);
- char *bytes = PyObject_Malloc(*_outLen + 1);
- memcpy(bytes, str, *_outLen + 1);
- GET_TC(tc)->cStr = bytes;
+static const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc,
+ size_t *_outLen) {
+ PyObject *repr = PyObject_Str(obj);
+ const char *str = PyUnicode_AsUTF8AndSize(repr, (Py_ssize_t *)_outLen);
+ char *bytes = PyObject_Malloc(*_outLen + 1);
+ memcpy(bytes, str, *_outLen + 1);
+ GET_TC(tc)->cStr = bytes;
- Py_DECREF(repr);
+ Py_DECREF(repr);
- return GET_TC(tc)->cStr;
+ return GET_TC(tc)->cStr;
}
static void Object_releaseObject(JSOBJ _obj) { Py_DECREF((PyObject *)_obj); }
-void Object_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
- GET_TC(tc)->iterBegin(obj, tc);
+static void Object_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
+ GET_TC(tc)->iterBegin(obj, tc);
}
-int Object_iterNext(JSOBJ obj, JSONTypeContext *tc) {
- return GET_TC(tc)->iterNext(obj, tc);
+static int Object_iterNext(JSOBJ obj, JSONTypeContext *tc) {
+ return GET_TC(tc)->iterNext(obj, tc);
}
-void Object_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
- GET_TC(tc)->iterEnd(obj, tc);
+static void Object_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
+ GET_TC(tc)->iterEnd(obj, tc);
}
-JSOBJ Object_iterGetValue(JSOBJ obj, JSONTypeContext *tc) {
- return GET_TC(tc)->iterGetValue(obj, tc);
+static JSOBJ Object_iterGetValue(JSOBJ obj, JSONTypeContext *tc) {
+ return GET_TC(tc)->iterGetValue(obj, tc);
}
-char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) {
- return GET_TC(tc)->iterGetName(obj, tc, outLen);
+static char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc,
+ size_t *outLen) {
+ return GET_TC(tc)->iterGetName(obj, tc, outLen);
}
PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args,
PyObject *kwargs) {
- PyDateTime_IMPORT;
- if (PyDateTimeAPI == NULL) {
- return NULL;
- }
+ PyDateTime_IMPORT;
+ if (PyDateTimeAPI == NULL) {
+ return NULL;
+ }
- PandasDateTime_IMPORT;
- if (PandasDateTimeAPI == NULL) {
- return NULL;
- }
+ PandasDateTime_IMPORT;
+ if (PandasDateTimeAPI == NULL) {
+ return NULL;
+ }
+
+ static char *kwlist[] = {"obj",
+ "ensure_ascii",
+ "double_precision",
+ "encode_html_chars",
+ "orient",
+ "date_unit",
+ "iso_dates",
+ "default_handler",
+ "indent",
+ NULL};
+
+ PyObject *oinput = NULL;
+ PyObject *oensureAscii = NULL;
+ int idoublePrecision = 10; // default double precision setting
+ PyObject *oencodeHTMLChars = NULL;
+ char *sOrient = NULL;
+ char *sdateFormat = NULL;
+ PyObject *oisoDates = 0;
+ PyObject *odefHandler = 0;
+ int indent = 0;
+
+ PyObjectEncoder pyEncoder = {
+ {
+ .beginTypeContext = Object_beginTypeContext,
+ .endTypeContext = Object_endTypeContext,
+ .getStringValue = Object_getStringValue,
+ .getLongValue = Object_getLongValue,
+ .getIntValue = NULL,
+ .getDoubleValue = Object_getDoubleValue,
+ .getBigNumStringValue = Object_getBigNumStringValue,
+ .iterBegin = Object_iterBegin,
+ .iterNext = Object_iterNext,
+ .iterEnd = Object_iterEnd,
+ .iterGetValue = Object_iterGetValue,
+ .iterGetName = Object_iterGetName,
+ .releaseObject = Object_releaseObject,
+ .malloc = PyObject_Malloc,
+ .realloc = PyObject_Realloc,
+ .free = PyObject_Free,
+ .recursionMax = -1,
+ .doublePrecision = idoublePrecision,
+ .forceASCII = 1,
+ .encodeHTMLChars = 0,
+ .indent = indent,
+ .errorMsg = NULL,
+ },
+ .npyCtxtPassthru = NULL,
+ .blkCtxtPassthru = NULL,
+ .npyType = -1,
+ .npyValue = NULL,
+ .datetimeIso = 0,
+ .datetimeUnit = NPY_FR_ms,
+ .outputFormat = COLUMNS,
+ .defaultHandler = NULL,
+ };
+ JSONObjectEncoder *encoder = (JSONObjectEncoder *)&pyEncoder;
+
+ if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiOssOOi", kwlist, &oinput,
+ &oensureAscii, &idoublePrecision,
+ &oencodeHTMLChars, &sOrient, &sdateFormat,
+ &oisoDates, &odefHandler, &indent)) {
+ return NULL;
+ }
- static char *kwlist[] = {"obj",
- "ensure_ascii",
- "double_precision",
- "encode_html_chars",
- "orient",
- "date_unit",
- "iso_dates",
- "default_handler",
- "indent",
- NULL};
-
- char buffer[65536];
- char *ret;
- PyObject *newobj;
- PyObject *oinput = NULL;
- PyObject *oensureAscii = NULL;
- int idoublePrecision = 10; // default double precision setting
- PyObject *oencodeHTMLChars = NULL;
- char *sOrient = NULL;
- char *sdateFormat = NULL;
- PyObject *oisoDates = 0;
- PyObject *odefHandler = 0;
- int indent = 0;
-
- PyObjectEncoder pyEncoder = {{
- Object_beginTypeContext,
- Object_endTypeContext,
- Object_getStringValue,
- Object_getLongValue,
- NULL, // getIntValue is unused
- Object_getDoubleValue,
- Object_getBigNumStringValue,
- Object_iterBegin,
- Object_iterNext,
- Object_iterEnd,
- Object_iterGetValue,
- Object_iterGetName,
- Object_releaseObject,
- PyObject_Malloc,
- PyObject_Realloc,
- PyObject_Free,
- -1, // recursionMax
- idoublePrecision,
- 1, // forceAscii
- 0, // encodeHTMLChars
- 0, // indent
- }};
- JSONObjectEncoder *encoder = (JSONObjectEncoder *)&pyEncoder;
-
- pyEncoder.npyCtxtPassthru = NULL;
- pyEncoder.blkCtxtPassthru = NULL;
- pyEncoder.npyType = -1;
- pyEncoder.npyValue = NULL;
- pyEncoder.datetimeIso = 0;
- pyEncoder.datetimeUnit = NPY_FR_ms;
- pyEncoder.outputFormat = COLUMNS;
- pyEncoder.defaultHandler = 0;
-
- if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiOssOOi", kwlist,
- &oinput, &oensureAscii, &idoublePrecision,
- &oencodeHTMLChars, &sOrient, &sdateFormat,
- &oisoDates, &odefHandler, &indent)) {
- return NULL;
- }
+ if (oensureAscii != NULL && !PyObject_IsTrue(oensureAscii)) {
+ encoder->forceASCII = 0;
+ }
- if (oensureAscii != NULL && !PyObject_IsTrue(oensureAscii)) {
- encoder->forceASCII = 0;
- }
+ if (oencodeHTMLChars != NULL && PyObject_IsTrue(oencodeHTMLChars)) {
+ encoder->encodeHTMLChars = 1;
+ }
- if (oencodeHTMLChars != NULL && PyObject_IsTrue(oencodeHTMLChars)) {
- encoder->encodeHTMLChars = 1;
+ if (idoublePrecision > JSON_DOUBLE_MAX_DECIMALS || idoublePrecision < 0) {
+ PyErr_Format(
+ PyExc_ValueError,
+ "Invalid value '%d' for option 'double_precision', max is '%u'",
+ idoublePrecision, JSON_DOUBLE_MAX_DECIMALS);
+ return NULL;
+ }
+ encoder->doublePrecision = idoublePrecision;
+
+ if (sOrient != NULL) {
+ if (strcmp(sOrient, "records") == 0) {
+ pyEncoder.outputFormat = RECORDS;
+ } else if (strcmp(sOrient, "index") == 0) {
+ pyEncoder.outputFormat = INDEX;
+ } else if (strcmp(sOrient, "split") == 0) {
+ pyEncoder.outputFormat = SPLIT;
+ } else if (strcmp(sOrient, "values") == 0) {
+ pyEncoder.outputFormat = VALUES;
+ } else if (strcmp(sOrient, "columns") != 0) {
+ PyErr_Format(PyExc_ValueError, "Invalid value '%s' for option 'orient'",
+ sOrient);
+ return NULL;
+ }
+ }
+
+ if (sdateFormat != NULL) {
+ if (strcmp(sdateFormat, "s") == 0) {
+ pyEncoder.datetimeUnit = NPY_FR_s;
+ } else if (strcmp(sdateFormat, "ms") == 0) {
+ pyEncoder.datetimeUnit = NPY_FR_ms;
+ } else if (strcmp(sdateFormat, "us") == 0) {
+ pyEncoder.datetimeUnit = NPY_FR_us;
+ } else if (strcmp(sdateFormat, "ns") == 0) {
+ pyEncoder.datetimeUnit = NPY_FR_ns;
+ } else {
+ PyErr_Format(PyExc_ValueError,
+ "Invalid value '%s' for option 'date_unit'", sdateFormat);
+ return NULL;
}
+ }
- if (idoublePrecision > JSON_DOUBLE_MAX_DECIMALS || idoublePrecision < 0) {
- PyErr_Format(
- PyExc_ValueError,
- "Invalid value '%d' for option 'double_precision', max is '%u'",
- idoublePrecision, JSON_DOUBLE_MAX_DECIMALS);
- return NULL;
- }
- encoder->doublePrecision = idoublePrecision;
-
- if (sOrient != NULL) {
- if (strcmp(sOrient, "records") == 0) {
- pyEncoder.outputFormat = RECORDS;
- } else if (strcmp(sOrient, "index") == 0) {
- pyEncoder.outputFormat = INDEX;
- } else if (strcmp(sOrient, "split") == 0) {
- pyEncoder.outputFormat = SPLIT;
- } else if (strcmp(sOrient, "values") == 0) {
- pyEncoder.outputFormat = VALUES;
- } else if (strcmp(sOrient, "columns") != 0) {
- PyErr_Format(PyExc_ValueError,
- "Invalid value '%s' for option 'orient'", sOrient);
- return NULL;
- }
- }
+ if (oisoDates != NULL && PyObject_IsTrue(oisoDates)) {
+ pyEncoder.datetimeIso = 1;
+ }
- if (sdateFormat != NULL) {
- if (strcmp(sdateFormat, "s") == 0) {
- pyEncoder.datetimeUnit = NPY_FR_s;
- } else if (strcmp(sdateFormat, "ms") == 0) {
- pyEncoder.datetimeUnit = NPY_FR_ms;
- } else if (strcmp(sdateFormat, "us") == 0) {
- pyEncoder.datetimeUnit = NPY_FR_us;
- } else if (strcmp(sdateFormat, "ns") == 0) {
- pyEncoder.datetimeUnit = NPY_FR_ns;
- } else {
- PyErr_Format(PyExc_ValueError,
- "Invalid value '%s' for option 'date_unit'",
- sdateFormat);
- return NULL;
- }
+ if (odefHandler != NULL && odefHandler != Py_None) {
+ if (!PyCallable_Check(odefHandler)) {
+ PyErr_SetString(PyExc_TypeError, "Default handler is not callable");
+ return NULL;
}
+ pyEncoder.defaultHandler = odefHandler;
+ }
- if (oisoDates != NULL && PyObject_IsTrue(oisoDates)) {
- pyEncoder.datetimeIso = 1;
- }
+ encoder->indent = indent;
- if (odefHandler != NULL && odefHandler != Py_None) {
- if (!PyCallable_Check(odefHandler)) {
- PyErr_SetString(PyExc_TypeError, "Default handler is not callable");
- return NULL;
- }
- pyEncoder.defaultHandler = odefHandler;
- }
+ pyEncoder.originalOutputFormat = pyEncoder.outputFormat;
- encoder->indent = indent;
-
- pyEncoder.originalOutputFormat = pyEncoder.outputFormat;
- ret = JSON_EncodeObject(oinput, encoder, buffer, sizeof(buffer));
- if (PyErr_Occurred()) {
- return NULL;
- }
+ char buffer[65536];
+ char *ret = JSON_EncodeObject(oinput, encoder, buffer, sizeof(buffer));
+ if (PyErr_Occurred()) {
+ return NULL;
+ }
- if (encoder->errorMsg) {
- if (ret != buffer) {
- encoder->free(ret);
- }
- PyErr_Format(PyExc_OverflowError, "%s", encoder->errorMsg);
- return NULL;
+ if (encoder->errorMsg) {
+ if (ret != buffer) {
+ encoder->free(ret);
}
+ PyErr_Format(PyExc_OverflowError, "%s", encoder->errorMsg);
+ return NULL;
+ }
- newobj = PyUnicode_FromString(ret);
+ PyObject *newobj = PyUnicode_FromString(ret);
- if (ret != buffer) {
- encoder->free(ret);
- }
+ if (ret != buffer) {
+ encoder->free(ret);
+ }
- return newobj;
+ return newobj;
}
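
For orientation, the keyword handling in objToJSON above is what the public to_json path ultimately feeds: orient maps to outputFormat, date_unit to datetimeUnit, iso_dates to datetimeIso, and double_precision/indent are copied straight onto the encoder. A minimal sketch through the public API (the option values chosen here are illustrative, not taken from this diff):

import pandas as pd

df = pd.DataFrame(
    {"a": [1.5, 2.25]},
    index=pd.to_datetime(["2023-01-01", "2023-01-02"]),
)

# orient -> outputFormat, date_format="iso" -> datetimeIso,
# date_unit -> datetimeUnit, double_precision -> doublePrecision,
# indent -> encoder->indent
out = df.to_json(
    orient="split",
    date_format="iso",
    date_unit="ms",
    double_precision=5,
    indent=2,
)
print(out)
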
diff --git a/pandas/_libs/src/vendored/ujson/python/ujson.c b/pandas/_libs/src/vendored/ujson/python/ujson.c
index 6a6a1bdcbe0e0..075411a23b075 100644
--- a/pandas/_libs/src/vendored/ujson/python/ujson.c
+++ b/pandas/_libs/src/vendored/ujson/python/ujson.c
@@ -16,18 +16,19 @@ derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
https://github.com/client9/stringencoders
-Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved.
+Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights
+reserved.
Numeric decoder derived from TCL library
https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
@@ -35,7 +36,8 @@ Numeric decoder derived from TCL library
* Copyright (c) 1994 Sun Microsystems, Inc.
*/
-#include "pandas/vendored/ujson/python/version.h"
+// Licence at LICENSES/ULTRAJSON_LICENSE
+
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY
@@ -48,27 +50,29 @@ void *initObjToJSON(void);
/* JSONToObj */
PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs);
-#define ENCODER_HELP_TEXT \
- "Use ensure_ascii=false to output UTF-8. Pass in double_precision to " \
- "alter the maximum digit precision of doubles. Set " \
- "encode_html_chars=True to encode < > & as unicode escape sequences."
+#define ENCODER_HELP_TEXT \
+ "Use ensure_ascii=false to output UTF-8. Pass in double_precision to " \
+ "alter the maximum digit precision of doubles. Set " \
+ "encode_html_chars=True to encode < > & as unicode escape sequences."
static PyMethodDef ujsonMethods[] = {
- {"ujson_dumps", (PyCFunction)objToJSON, METH_VARARGS | METH_KEYWORDS,
+ {"ujson_dumps", (PyCFunction)(void (*)(void))objToJSON,
+ METH_VARARGS | METH_KEYWORDS,
"Converts arbitrary object recursively into JSON. " ENCODER_HELP_TEXT},
- {"ujson_loads", (PyCFunction)JSONToObj, METH_VARARGS | METH_KEYWORDS,
+ {"ujson_loads", (PyCFunction)(void (*)(void))JSONToObj,
+ METH_VARARGS | METH_KEYWORDS,
"Converts JSON as string to dict object structure. Use precise_float=True "
"to use high precision float decoder."},
{NULL, NULL, 0, NULL} /* Sentinel */
};
typedef struct {
- PyObject *type_decimal;
- PyObject *type_dataframe;
- PyObject *type_series;
- PyObject *type_index;
- PyObject *type_nat;
- PyObject *type_na;
+ PyObject *type_decimal;
+ PyObject *type_dataframe;
+ PyObject *type_series;
+ PyObject *type_index;
+ PyObject *type_nat;
+ PyObject *type_na;
} modulestate;
#define modulestate(o) ((modulestate *)PyModule_GetState(o))
@@ -88,359 +92,356 @@ static struct PyModuleDef moduledef = {.m_base = PyModuleDef_HEAD_INIT,
#ifndef PYPY_VERSION
/* Used in objToJSON.c */
int object_is_decimal_type(PyObject *obj) {
- PyObject *module = PyState_FindModule(&moduledef);
- if (module == NULL)
- return 0;
- modulestate *state = modulestate(module);
- if (state == NULL)
- return 0;
- PyObject *type_decimal = state->type_decimal;
- if (type_decimal == NULL) {
- PyErr_Clear();
- return 0;
- }
- int result = PyObject_IsInstance(obj, type_decimal);
- if (result == -1) {
- PyErr_Clear();
- return 0;
- }
- return result;
+ PyObject *module = PyState_FindModule(&moduledef);
+ if (module == NULL)
+ return 0;
+ modulestate *state = modulestate(module);
+ if (state == NULL)
+ return 0;
+ PyObject *type_decimal = state->type_decimal;
+ if (type_decimal == NULL) {
+ PyErr_Clear();
+ return 0;
+ }
+ int result = PyObject_IsInstance(obj, type_decimal);
+ if (result == -1) {
+ PyErr_Clear();
+ return 0;
+ }
+ return result;
}
int object_is_dataframe_type(PyObject *obj) {
- PyObject *module = PyState_FindModule(&moduledef);
- if (module == NULL)
- return 0;
- modulestate *state = modulestate(module);
- if (state == NULL)
- return 0;
- PyObject *type_dataframe = state->type_dataframe;
- if (type_dataframe == NULL) {
- PyErr_Clear();
- return 0;
- }
- int result = PyObject_IsInstance(obj, type_dataframe);
- if (result == -1) {
- PyErr_Clear();
- return 0;
- }
- return result;
+ PyObject *module = PyState_FindModule(&moduledef);
+ if (module == NULL)
+ return 0;
+ modulestate *state = modulestate(module);
+ if (state == NULL)
+ return 0;
+ PyObject *type_dataframe = state->type_dataframe;
+ if (type_dataframe == NULL) {
+ PyErr_Clear();
+ return 0;
+ }
+ int result = PyObject_IsInstance(obj, type_dataframe);
+ if (result == -1) {
+ PyErr_Clear();
+ return 0;
+ }
+ return result;
}
int object_is_series_type(PyObject *obj) {
- PyObject *module = PyState_FindModule(&moduledef);
- if (module == NULL)
- return 0;
- modulestate *state = modulestate(module);
- if (state == NULL)
- return 0;
- PyObject *type_series = state->type_series;
- if (type_series == NULL) {
- PyErr_Clear();
- return 0;
- }
- int result = PyObject_IsInstance(obj, type_series);
- if (result == -1) {
- PyErr_Clear();
- return 0;
- }
- return result;
+ PyObject *module = PyState_FindModule(&moduledef);
+ if (module == NULL)
+ return 0;
+ modulestate *state = modulestate(module);
+ if (state == NULL)
+ return 0;
+ PyObject *type_series = state->type_series;
+ if (type_series == NULL) {
+ PyErr_Clear();
+ return 0;
+ }
+ int result = PyObject_IsInstance(obj, type_series);
+ if (result == -1) {
+ PyErr_Clear();
+ return 0;
+ }
+ return result;
}
int object_is_index_type(PyObject *obj) {
- PyObject *module = PyState_FindModule(&moduledef);
- if (module == NULL)
- return 0;
- modulestate *state = modulestate(module);
- if (state == NULL)
- return 0;
- PyObject *type_index = state->type_index;
- if (type_index == NULL) {
- PyErr_Clear();
- return 0;
- }
- int result = PyObject_IsInstance(obj, type_index);
- if (result == -1) {
- PyErr_Clear();
- return 0;
- }
- return result;
+ PyObject *module = PyState_FindModule(&moduledef);
+ if (module == NULL)
+ return 0;
+ modulestate *state = modulestate(module);
+ if (state == NULL)
+ return 0;
+ PyObject *type_index = state->type_index;
+ if (type_index == NULL) {
+ PyErr_Clear();
+ return 0;
+ }
+ int result = PyObject_IsInstance(obj, type_index);
+ if (result == -1) {
+ PyErr_Clear();
+ return 0;
+ }
+ return result;
}
int object_is_nat_type(PyObject *obj) {
- PyObject *module = PyState_FindModule(&moduledef);
- if (module == NULL)
- return 0;
- modulestate *state = modulestate(module);
- if (state == NULL)
- return 0;
- PyObject *type_nat = state->type_nat;
- if (type_nat == NULL) {
- PyErr_Clear();
- return 0;
- }
- int result = PyObject_IsInstance(obj, type_nat);
- if (result == -1) {
- PyErr_Clear();
- return 0;
- }
- return result;
+ PyObject *module = PyState_FindModule(&moduledef);
+ if (module == NULL)
+ return 0;
+ modulestate *state = modulestate(module);
+ if (state == NULL)
+ return 0;
+ PyObject *type_nat = state->type_nat;
+ if (type_nat == NULL) {
+ PyErr_Clear();
+ return 0;
+ }
+ int result = PyObject_IsInstance(obj, type_nat);
+ if (result == -1) {
+ PyErr_Clear();
+ return 0;
+ }
+ return result;
}
int object_is_na_type(PyObject *obj) {
- PyObject *module = PyState_FindModule(&moduledef);
- if (module == NULL)
- return 0;
- modulestate *state = modulestate(module);
- if (state == NULL)
- return 0;
- PyObject *type_na = state->type_na;
- if (type_na == NULL) {
- PyErr_Clear();
- return 0;
- }
- int result = PyObject_IsInstance(obj, type_na);
- if (result == -1) {
- PyErr_Clear();
- return 0;
- }
- return result;
+ PyObject *module = PyState_FindModule(&moduledef);
+ if (module == NULL)
+ return 0;
+ modulestate *state = modulestate(module);
+ if (state == NULL)
+ return 0;
+ PyObject *type_na = state->type_na;
+ if (type_na == NULL) {
+ PyErr_Clear();
+ return 0;
+ }
+ int result = PyObject_IsInstance(obj, type_na);
+ if (result == -1) {
+ PyErr_Clear();
+ return 0;
+ }
+ return result;
}
#else
- /* Used in objToJSON.c */
+/* Used in objToJSON.c */
int object_is_decimal_type(PyObject *obj) {
- PyObject *module = PyImport_ImportModule("decimal");
- if (module == NULL) {
- PyErr_Clear();
- return 0;
- }
- PyObject *type_decimal = PyObject_GetAttrString(module, "Decimal");
- if (type_decimal == NULL) {
- Py_DECREF(module);
- PyErr_Clear();
- return 0;
- }
- int result = PyObject_IsInstance(obj, type_decimal);
- if (result == -1) {
- Py_DECREF(module);
- Py_DECREF(type_decimal);
- PyErr_Clear();
- return 0;
- }
- return result;
+ PyObject *module = PyImport_ImportModule("decimal");
+ if (module == NULL) {
+ PyErr_Clear();
+ return 0;
+ }
+ PyObject *type_decimal = PyObject_GetAttrString(module, "Decimal");
+ if (type_decimal == NULL) {
+ Py_DECREF(module);
+ PyErr_Clear();
+ return 0;
+ }
+ int result = PyObject_IsInstance(obj, type_decimal);
+ if (result == -1) {
+ Py_DECREF(module);
+ Py_DECREF(type_decimal);
+ PyErr_Clear();
+ return 0;
+ }
+ return result;
}
int object_is_dataframe_type(PyObject *obj) {
- PyObject *module = PyImport_ImportModule("pandas");
- if (module == NULL) {
- PyErr_Clear();
- return 0;
- }
- PyObject *type_dataframe = PyObject_GetAttrString(module, "DataFrame");
- if (type_dataframe == NULL) {
- Py_DECREF(module);
- PyErr_Clear();
- return 0;
- }
- int result = PyObject_IsInstance(obj, type_dataframe);
- if (result == -1) {
- Py_DECREF(module);
- Py_DECREF(type_dataframe);
- PyErr_Clear();
- return 0;
- }
- return result;
+ PyObject *module = PyImport_ImportModule("pandas");
+ if (module == NULL) {
+ PyErr_Clear();
+ return 0;
+ }
+ PyObject *type_dataframe = PyObject_GetAttrString(module, "DataFrame");
+ if (type_dataframe == NULL) {
+ Py_DECREF(module);
+ PyErr_Clear();
+ return 0;
+ }
+ int result = PyObject_IsInstance(obj, type_dataframe);
+ if (result == -1) {
+ Py_DECREF(module);
+ Py_DECREF(type_dataframe);
+ PyErr_Clear();
+ return 0;
+ }
+ return result;
}
int object_is_series_type(PyObject *obj) {
- PyObject *module = PyImport_ImportModule("pandas");
- if (module == NULL) {
- PyErr_Clear();
- return 0;
- }
- PyObject *type_series = PyObject_GetAttrString(module, "Series");
- if (type_series == NULL) {
- Py_DECREF(module);
- PyErr_Clear();
- return 0;
- }
- int result = PyObject_IsInstance(obj, type_series);
- if (result == -1) {
- Py_DECREF(module);
- Py_DECREF(type_series);
- PyErr_Clear();
- return 0;
- }
- return result;
+ PyObject *module = PyImport_ImportModule("pandas");
+ if (module == NULL) {
+ PyErr_Clear();
+ return 0;
+ }
+ PyObject *type_series = PyObject_GetAttrString(module, "Series");
+ if (type_series == NULL) {
+ Py_DECREF(module);
+ PyErr_Clear();
+ return 0;
+ }
+ int result = PyObject_IsInstance(obj, type_series);
+ if (result == -1) {
+ Py_DECREF(module);
+ Py_DECREF(type_series);
+ PyErr_Clear();
+ return 0;
+ }
+ return result;
}
int object_is_index_type(PyObject *obj) {
- PyObject *module = PyImport_ImportModule("pandas");
- if (module == NULL) {
- PyErr_Clear();
- return 0;
- }
- PyObject *type_index = PyObject_GetAttrString(module, "Index");
- if (type_index == NULL) {
- Py_DECREF(module);
- PyErr_Clear();
- return 0;
- }
- int result = PyObject_IsInstance(obj, type_index);
- if (result == -1) {
- Py_DECREF(module);
- Py_DECREF(type_index);
- PyErr_Clear();
- return 0;
- }
- return result;
+ PyObject *module = PyImport_ImportModule("pandas");
+ if (module == NULL) {
+ PyErr_Clear();
+ return 0;
+ }
+ PyObject *type_index = PyObject_GetAttrString(module, "Index");
+ if (type_index == NULL) {
+ Py_DECREF(module);
+ PyErr_Clear();
+ return 0;
+ }
+ int result = PyObject_IsInstance(obj, type_index);
+ if (result == -1) {
+ Py_DECREF(module);
+ Py_DECREF(type_index);
+ PyErr_Clear();
+ return 0;
+ }
+ return result;
}
int object_is_nat_type(PyObject *obj) {
- PyObject *module = PyImport_ImportModule("pandas._libs.tslibs.nattype");
- if (module == NULL) {
- PyErr_Clear();
- return 0;
- }
- PyObject *type_nat = PyObject_GetAttrString(module, "NaTType");
- if (type_nat == NULL) {
- Py_DECREF(module);
- PyErr_Clear();
- return 0;
- }
- int result = PyObject_IsInstance(obj, type_nat);
- if (result == -1) {
- Py_DECREF(module);
- Py_DECREF(type_nat);
- PyErr_Clear();
- return 0;
- }
- return result;
+ PyObject *module = PyImport_ImportModule("pandas._libs.tslibs.nattype");
+ if (module == NULL) {
+ PyErr_Clear();
+ return 0;
+ }
+ PyObject *type_nat = PyObject_GetAttrString(module, "NaTType");
+ if (type_nat == NULL) {
+ Py_DECREF(module);
+ PyErr_Clear();
+ return 0;
+ }
+ int result = PyObject_IsInstance(obj, type_nat);
+ if (result == -1) {
+ Py_DECREF(module);
+ Py_DECREF(type_nat);
+ PyErr_Clear();
+ return 0;
+ }
+ return result;
}
int object_is_na_type(PyObject *obj) {
- PyObject *module = PyImport_ImportModule("pandas._libs.missing");
- if (module == NULL) {
- PyErr_Clear();
- return 0;
- }
- PyObject *type_na = PyObject_GetAttrString(module, "NAType");
- if (type_na == NULL) {
- Py_DECREF(module);
- PyErr_Clear();
- return 0;
- }
- int result = PyObject_IsInstance(obj, type_na);
- if (result == -1) {
- Py_DECREF(module);
- Py_DECREF(type_na);
- PyErr_Clear();
- return 0;
- }
- return result;
+ PyObject *module = PyImport_ImportModule("pandas._libs.missing");
+ if (module == NULL) {
+ PyErr_Clear();
+ return 0;
+ }
+ PyObject *type_na = PyObject_GetAttrString(module, "NAType");
+ if (type_na == NULL) {
+ Py_DECREF(module);
+ PyErr_Clear();
+ return 0;
+ }
+ int result = PyObject_IsInstance(obj, type_na);
+ if (result == -1) {
+ Py_DECREF(module);
+ Py_DECREF(type_na);
+ PyErr_Clear();
+ return 0;
+ }
+ return result;
}
#endif
static int module_traverse(PyObject *m, visitproc visit, void *arg) {
- Py_VISIT(modulestate(m)->type_decimal);
- Py_VISIT(modulestate(m)->type_dataframe);
- Py_VISIT(modulestate(m)->type_series);
- Py_VISIT(modulestate(m)->type_index);
- Py_VISIT(modulestate(m)->type_nat);
- Py_VISIT(modulestate(m)->type_na);
- return 0;
+ Py_VISIT(modulestate(m)->type_decimal);
+ Py_VISIT(modulestate(m)->type_dataframe);
+ Py_VISIT(modulestate(m)->type_series);
+ Py_VISIT(modulestate(m)->type_index);
+ Py_VISIT(modulestate(m)->type_nat);
+ Py_VISIT(modulestate(m)->type_na);
+ return 0;
}
static int module_clear(PyObject *m) {
- Py_CLEAR(modulestate(m)->type_decimal);
- Py_CLEAR(modulestate(m)->type_dataframe);
- Py_CLEAR(modulestate(m)->type_series);
- Py_CLEAR(modulestate(m)->type_index);
- Py_CLEAR(modulestate(m)->type_nat);
- Py_CLEAR(modulestate(m)->type_na);
- return 0;
+ Py_CLEAR(modulestate(m)->type_decimal);
+ Py_CLEAR(modulestate(m)->type_dataframe);
+ Py_CLEAR(modulestate(m)->type_series);
+ Py_CLEAR(modulestate(m)->type_index);
+ Py_CLEAR(modulestate(m)->type_nat);
+ Py_CLEAR(modulestate(m)->type_na);
+ return 0;
}
static void module_free(void *module) { module_clear((PyObject *)module); }
PyMODINIT_FUNC PyInit_json(void) {
- import_array()
- PyObject *module;
+ import_array() PyObject *module;
#ifndef PYPY_VERSION
- // This function is not supported in PyPy.
- if ((module = PyState_FindModule(&moduledef)) != NULL) {
- Py_INCREF(module);
- return module;
- }
+ // This function is not supported in PyPy.
+ if ((module = PyState_FindModule(&moduledef)) != NULL) {
+ Py_INCREF(module);
+ return module;
+ }
#endif
- module = PyModule_Create(&moduledef);
- if (module == NULL) {
- return NULL;
- }
+ module = PyModule_Create(&moduledef);
+ if (module == NULL) {
+ return NULL;
+ }
#ifndef PYPY_VERSION
- PyObject *mod_decimal = PyImport_ImportModule("decimal");
- if (mod_decimal) {
- PyObject *type_decimal = PyObject_GetAttrString(mod_decimal, "Decimal");
- assert(type_decimal != NULL);
- modulestate(module)->type_decimal = type_decimal;
- Py_DECREF(mod_decimal);
- }
-
- PyObject *mod_pandas = PyImport_ImportModule("pandas");
- if (mod_pandas) {
- PyObject *type_dataframe =
- PyObject_GetAttrString(mod_pandas, "DataFrame");
- assert(type_dataframe != NULL);
- modulestate(module)->type_dataframe = type_dataframe;
-
- PyObject *type_series = PyObject_GetAttrString(mod_pandas, "Series");
- assert(type_series != NULL);
- modulestate(module)->type_series = type_series;
-
- PyObject *type_index = PyObject_GetAttrString(mod_pandas, "Index");
- assert(type_index != NULL);
- modulestate(module)->type_index = type_index;
-
- Py_DECREF(mod_pandas);
- }
-
- PyObject *mod_nattype =
- PyImport_ImportModule("pandas._libs.tslibs.nattype");
- if (mod_nattype) {
- PyObject *type_nat = PyObject_GetAttrString(mod_nattype, "NaTType");
- assert(type_nat != NULL);
- modulestate(module)->type_nat = type_nat;
-
- Py_DECREF(mod_nattype);
- }
-
- PyObject *mod_natype = PyImport_ImportModule("pandas._libs.missing");
- if (mod_natype) {
- PyObject *type_na = PyObject_GetAttrString(mod_natype, "NAType");
- assert(type_na != NULL);
- modulestate(module)->type_na = type_na;
-
- Py_DECREF(mod_natype);
- } else {
- PyErr_Clear();
- }
+ PyObject *mod_decimal = PyImport_ImportModule("decimal");
+ if (mod_decimal) {
+ PyObject *type_decimal = PyObject_GetAttrString(mod_decimal, "Decimal");
+ assert(type_decimal != NULL);
+ modulestate(module)->type_decimal = type_decimal;
+ Py_DECREF(mod_decimal);
+ }
+
+ PyObject *mod_pandas = PyImport_ImportModule("pandas");
+ if (mod_pandas) {
+ PyObject *type_dataframe = PyObject_GetAttrString(mod_pandas, "DataFrame");
+ assert(type_dataframe != NULL);
+ modulestate(module)->type_dataframe = type_dataframe;
+
+ PyObject *type_series = PyObject_GetAttrString(mod_pandas, "Series");
+ assert(type_series != NULL);
+ modulestate(module)->type_series = type_series;
+
+ PyObject *type_index = PyObject_GetAttrString(mod_pandas, "Index");
+ assert(type_index != NULL);
+ modulestate(module)->type_index = type_index;
+
+ Py_DECREF(mod_pandas);
+ }
+
+ PyObject *mod_nattype = PyImport_ImportModule("pandas._libs.tslibs.nattype");
+ if (mod_nattype) {
+ PyObject *type_nat = PyObject_GetAttrString(mod_nattype, "NaTType");
+ assert(type_nat != NULL);
+ modulestate(module)->type_nat = type_nat;
+
+ Py_DECREF(mod_nattype);
+ }
+
+ PyObject *mod_natype = PyImport_ImportModule("pandas._libs.missing");
+ if (mod_natype) {
+ PyObject *type_na = PyObject_GetAttrString(mod_natype, "NAType");
+ assert(type_na != NULL);
+ modulestate(module)->type_na = type_na;
+
+ Py_DECREF(mod_natype);
+ } else {
+ PyErr_Clear();
+ }
#endif
- /* Not vendored for now
- JSONDecodeError = PyErr_NewException("ujson.JSONDecodeError",
- PyExc_ValueError, NULL); Py_XINCREF(JSONDecodeError); if
- (PyModule_AddObject(module, "JSONDecodeError", JSONDecodeError) < 0)
- {
- Py_XDECREF(JSONDecodeError);
- Py_CLEAR(JSONDecodeError);
- Py_DECREF(module);
- return NULL;
- }
- */
-
- return module;
+ /* Not vendored for now
+ JSONDecodeError = PyErr_NewException("ujson.JSONDecodeError",
+ PyExc_ValueError, NULL); Py_XINCREF(JSONDecodeError); if
+ (PyModule_AddObject(module, "JSONDecodeError", JSONDecodeError) < 0)
+ {
+ Py_XDECREF(JSONDecodeError);
+ Py_CLEAR(JSONDecodeError);
+ Py_DECREF(module);
+ return NULL;
+ }
+ */
+
+ return module;
}
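
The method table above exposes the encoder and decoder as ujson_dumps and ujson_loads on the built extension module. A hedged round-trip sketch, assuming the extension is importable as pandas._libs.json (the path used elsewhere in pandas' IO code):

from pandas._libs.json import ujson_dumps, ujson_loads

payload = ujson_dumps({"a": [1, 2, 3]}, ensure_ascii=False, double_precision=4)
assert ujson_loads(payload) == {"a": [1, 2, 3]}
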
diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx
index 4ba7bce51ed64..aed0f4b082d4e 100644
--- a/pandas/_libs/testing.pyx
+++ b/pandas/_libs/testing.pyx
@@ -78,7 +78,7 @@ cpdef assert_almost_equal(a, b,
robj : str, default None
Specify right object name being compared, internally used to show
appropriate assertion message.
- index_values : ndarray, default None
+ index_values : Index | ndarray, default None
Specify shared index values of objects being compared, internally used
to show appropriate assertion message.
diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi
index 9819b5173db56..5a340c1d88bc4 100644
--- a/pandas/_libs/tslib.pyi
+++ b/pandas/_libs/tslib.pyi
@@ -23,10 +23,15 @@ def array_to_datetime(
dayfirst: bool = ...,
yearfirst: bool = ...,
utc: bool = ...,
+ creso: int = ...,
) -> tuple[np.ndarray, tzinfo | None]: ...
# returned ndarray may be object dtype or datetime64[ns]
def array_to_datetime_with_tz(
- values: npt.NDArray[np.object_], tz: tzinfo
+ values: npt.NDArray[np.object_],
+ tz: tzinfo,
+ dayfirst: bool,
+ yearfirst: bool,
+ creso: int,
) -> npt.NDArray[np.int64]: ...
diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx
index 20a18cf56779f..017fdc4bc834f 100644
--- a/pandas/_libs/tslib.pyx
+++ b/pandas/_libs/tslib.pyx
@@ -30,10 +30,14 @@ import numpy as np
cnp.import_array()
+from pandas._libs.tslibs.dtypes cimport (
+ get_supported_reso,
+ npy_unit_to_abbrev,
+)
from pandas._libs.tslibs.np_datetime cimport (
NPY_DATETIMEUNIT,
NPY_FR_ns,
- check_dts_bounds,
+ get_datetime64_unit,
import_pandas_datetime,
npy_datetimestruct,
npy_datetimestruct_to_datetime,
@@ -45,9 +49,11 @@ from pandas._libs.tslibs.np_datetime cimport (
import_pandas_datetime()
-from pandas._libs.tslibs.strptime cimport parse_today_now
+from pandas._libs.tslibs.strptime cimport (
+ DatetimeParseState,
+ parse_today_now,
+)
from pandas._libs.util cimport (
- is_datetime64_object,
is_float_object,
is_integer_object,
)
@@ -58,16 +64,18 @@ from pandas._libs.tslibs.conversion cimport (
_TSObject,
cast_from_unit,
convert_str_to_tsobject,
- convert_timezone,
+ convert_to_tsobject,
get_datetime64_nanos,
parse_pydatetime,
)
+from pandas._libs.tslibs.dtypes cimport npy_unit_to_abbrev
from pandas._libs.tslibs.nattype cimport (
NPY_NAT,
c_NaT as NaT,
c_nat_strings as nat_strings,
)
from pandas._libs.tslibs.timestamps cimport _Timestamp
+from pandas._libs.tslibs.timezones cimport tz_compare
from pandas._libs.tslibs import (
Resolution,
@@ -94,8 +102,10 @@ def _test_parse_iso8601(ts: str):
obj = _TSObject()
string_to_dts(ts, &obj.dts, &out_bestunit, &out_local, &out_tzoffset, True)
- obj.value = npy_datetimestruct_to_datetime(NPY_FR_ns, &obj.dts)
- check_dts_bounds(&obj.dts)
+ try:
+ obj.value = npy_datetimestruct_to_datetime(NPY_FR_ns, &obj.dts)
+ except OverflowError as err:
+ raise OutOfBoundsDatetime(f"Out of bounds nanosecond timestamp: {ts}") from err
if out_local == 1:
obj.tzinfo = timezone(timedelta(minutes=out_tzoffset))
obj.value = tz_localize_to_utc_single(obj.value, obj.tzinfo)
@@ -275,6 +285,7 @@ def array_with_unit_to_datetime(
result, tz = array_to_datetime(
values.astype(object, copy=False),
errors=errors,
+ creso=NPY_FR_ns,
)
return result, tz
@@ -327,7 +338,7 @@ def array_with_unit_to_datetime(
f"unit='{unit}' not valid with non-numerical val='{val}'"
)
- except (ValueError, OutOfBoundsDatetime, TypeError) as err:
+ except (ValueError, TypeError) as err:
if is_raise:
err.args = (f"{err}, at position {i}",)
raise
@@ -406,6 +417,7 @@ cpdef array_to_datetime(
bint dayfirst=False,
bint yearfirst=False,
bint utc=False,
+ NPY_DATETIMEUNIT creso=NPY_FR_ns,
):
"""
Converts a 1D array of date-like values to a numpy array of either:
@@ -423,25 +435,27 @@ cpdef array_to_datetime(
Parameters
----------
values : ndarray of object
- date-like objects to convert
+ date-like objects to convert
errors : str, default 'raise'
- error behavior when parsing
+ error behavior when parsing
dayfirst : bool, default False
- dayfirst parsing behavior when encountering datetime strings
+ dayfirst parsing behavior when encountering datetime strings
yearfirst : bool, default False
- yearfirst parsing behavior when encountering datetime strings
+ yearfirst parsing behavior when encountering datetime strings
utc : bool, default False
- indicator whether the dates should be UTC
+ indicator whether the dates should be UTC
+ creso : NPY_DATETIMEUNIT, default NPY_FR_ns
+ Set to NPY_FR_GENERIC to infer a resolution.
Returns
-------
np.ndarray
- May be datetime64[ns] or object dtype
+ May be datetime64[creso_unit] or object dtype
tzinfo or None
"""
cdef:
Py_ssize_t i, n = values.size
- object val, tz
+ object val
ndarray[int64_t] iresult
npy_datetimestruct dts
bint utc_convert = bool(utc)
@@ -450,48 +464,61 @@ cpdef array_to_datetime(
bint is_ignore = errors == "ignore"
bint is_coerce = errors == "coerce"
bint is_same_offsets
- _TSObject _ts
+ _TSObject tsobj
float tz_offset
set out_tzoffset_vals = set()
- tzinfo tz_out = None
- bint found_tz = False, found_naive = False
- cnp.broadcast mi
+ tzinfo tz, tz_out = None
+ cnp.flatiter it = cnp.PyArray_IterNew(values)
+ NPY_DATETIMEUNIT item_reso
+ bint infer_reso = creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC
+ DatetimeParseState state = DatetimeParseState(creso)
+ str abbrev
# specify error conditions
assert is_raise or is_ignore or is_coerce
- result = np.empty((<object>values).shape, dtype="M8[ns]")
- mi = cnp.PyArray_MultiIterNew2(result, values)
+ if infer_reso:
+ abbrev = "ns"
+ else:
+ abbrev = npy_unit_to_abbrev(creso)
+ result = np.empty((<object>values).shape, dtype=f"M8[{abbrev}]")
iresult = result.view("i8").ravel()
for i in range(n):
# Analogous to `val = values[i]`
- val = <object>(<PyObject**>cnp.PyArray_MultiIter_DATA(mi, 1))[0]
+ val = cnp.PyArray_GETITEM(values, cnp.PyArray_ITER_DATA(it))
+ cnp.PyArray_ITER_NEXT(it)
try:
if checknull_with_nat_and_na(val):
iresult[i] = NPY_NAT
elif PyDateTime_Check(val):
- if val.tzinfo is not None:
- found_tz = True
+ if isinstance(val, _Timestamp):
+ item_reso = val._creso
else:
- found_naive = True
- tz_out = convert_timezone(
- val.tzinfo,
- tz_out,
- found_naive,
- found_tz,
- utc_convert,
- )
- iresult[i] = parse_pydatetime(val, &dts, utc_convert)
+ item_reso = NPY_DATETIMEUNIT.NPY_FR_us
+ state.update_creso(item_reso)
+ if infer_reso:
+ creso = state.creso
+ tz_out = state.process_datetime(val, tz_out, utc_convert)
+ iresult[i] = parse_pydatetime(val, &dts, creso=creso)
elif PyDate_Check(val):
- iresult[i] = pydate_to_dt64(val, &dts)
- check_dts_bounds(&dts)
-
- elif is_datetime64_object(val):
- iresult[i] = get_datetime64_nanos(val, NPY_FR_ns)
+ item_reso = NPY_DATETIMEUNIT.NPY_FR_s
+ state.update_creso(item_reso)
+ if infer_reso:
+ creso = state.creso
+ iresult[i] = pydate_to_dt64(val, &dts, reso=creso)
+ state.found_other = True
+
+ elif cnp.is_datetime64_object(val):
+ item_reso = get_supported_reso(get_datetime64_unit(val))
+ state.update_creso(item_reso)
+ if infer_reso:
+ creso = state.creso
+ iresult[i] = get_datetime64_nanos(val, creso)
+ state.found_other = True
elif is_integer_object(val) or is_float_object(val):
# these must be ns unit by-definition
@@ -499,8 +526,14 @@ cpdef array_to_datetime(
if val != val or val == NPY_NAT:
iresult[i] = NPY_NAT
else:
- # we now need to parse this as if unit='ns'
- iresult[i] = cast_from_unit(val, "ns")
+ item_reso = NPY_FR_ns
+ state.update_creso(item_reso)
+ if infer_reso:
+ creso = state.creso
+
+ # we now need to parse this as if unit=abbrev
+ iresult[i] = cast_from_unit(val, abbrev, out_reso=creso)
+ state.found_other = True
elif isinstance(val, str):
# string
@@ -508,45 +541,56 @@ cpdef array_to_datetime(
# GH#32264 np.str_ object
val = str(val)
- if parse_today_now(val, &iresult[i], utc):
+ if parse_today_now(val, &iresult[i], utc, creso):
# We can't _quite_ dispatch this to convert_str_to_tsobject
# bc there isn't a nice way to pass "utc"
- cnp.PyArray_MultiIter_NEXT(mi)
+ item_reso = NPY_DATETIMEUNIT.NPY_FR_us
+ state.update_creso(item_reso)
+ if infer_reso:
+ creso = state.creso
continue
- _ts = convert_str_to_tsobject(
- val, None, unit="ns", dayfirst=dayfirst, yearfirst=yearfirst
+ tsobj = convert_str_to_tsobject(
+ val, None, dayfirst=dayfirst, yearfirst=yearfirst
)
- _ts.ensure_reso(NPY_FR_ns, val)
- iresult[i] = _ts.value
+ if tsobj.value == NPY_NAT:
+ # e.g. "NaT" string or empty string, we do not consider
+ # this as either tzaware or tznaive. See
+ # test_to_datetime_with_empty_str_utc_false_format_mixed
+ # We also do not update resolution inference based on this,
+ # see test_infer_with_nat_int_float_str
+ iresult[i] = tsobj.value
+ continue
+
+ item_reso = tsobj.creso
+ state.update_creso(item_reso)
+ if infer_reso:
+ creso = state.creso
+
+ tsobj.ensure_reso(creso, val)
+ iresult[i] = tsobj.value
- tz = _ts.tzinfo
+ tz = tsobj.tzinfo
if tz is not None:
# dateutil timezone objects cannot be hashed, so
# store the UTC offsets in seconds instead
nsecs = tz.utcoffset(None).total_seconds()
out_tzoffset_vals.add(nsecs)
- # need to set seen_datetime_offset *after* the
- # potentially-raising timezone(timedelta(...)) call,
- # otherwise we can go down the is_same_offsets path
- # bc len(out_tzoffset_vals) == 0
seen_datetime_offset = True
else:
# Add a marker for naive string, to track if we are
# parsing mixed naive and aware strings
out_tzoffset_vals.add("naive")
+ state.found_naive_str = True
else:
raise TypeError(f"{type(val)} is not convertible to datetime")
- cnp.PyArray_MultiIter_NEXT(mi)
-
except (TypeError, OverflowError, ValueError) as ex:
ex.args = (f"{ex}, at position {i}",)
if is_coerce:
iresult[i] = NPY_NAT
- cnp.PyArray_MultiIter_NEXT(mi)
continue
elif is_raise:
raise
@@ -562,9 +606,51 @@ cpdef array_to_datetime(
is_same_offsets = len(out_tzoffset_vals) == 1
if not is_same_offsets:
return _array_to_datetime_object(values, errors, dayfirst, yearfirst)
+ elif state.found_naive or state.found_other:
+ # e.g. test_to_datetime_mixed_awareness_mixed_types
+ raise ValueError("Cannot mix tz-aware with tz-naive values")
+ elif tz_out is not None:
+ # GH#55693
+ tz_offset = out_tzoffset_vals.pop()
+ tz_out2 = timezone(timedelta(seconds=tz_offset))
+ if not tz_compare(tz_out, tz_out2):
+ # e.g. test_to_datetime_mixed_tzs_mixed_types
+ raise ValueError(
+ "Mixed timezones detected. pass utc=True in to_datetime "
+ "or tz='UTC' in DatetimeIndex to convert to a common timezone."
+ )
+ # e.g. test_to_datetime_mixed_types_matching_tzs
else:
tz_offset = out_tzoffset_vals.pop()
tz_out = timezone(timedelta(seconds=tz_offset))
+ elif not utc_convert:
+ if tz_out and (state.found_other or state.found_naive_str):
+ # found_other indicates a tz-naive int, float, dt64, or date
+ # e.g. test_to_datetime_mixed_awareness_mixed_types
+ raise ValueError("Cannot mix tz-aware with tz-naive values")
+
+ if infer_reso:
+ if state.creso_ever_changed:
+ # We encountered mismatched resolutions, need to re-parse with
+ # the correct one.
+ return array_to_datetime(
+ values,
+ errors=errors,
+ yearfirst=yearfirst,
+ dayfirst=dayfirst,
+ utc=utc,
+ creso=state.creso,
+ )
+ elif state.creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
+ # i.e. we never encountered anything non-NaT, default to "s". This
+ # ensures that insert and concat-like operations with NaT
+ # do not upcast units
+ result = iresult.view("M8[s]").reshape(result.shape)
+ else:
+ # Otherwise we can use the single reso that we encountered and avoid
+ # a second pass.
+ abbrev = npy_unit_to_abbrev(state.creso)
+ result = iresult.view(f"M8[{abbrev}]").reshape(result.shape)
return result, tz_out
@@ -620,7 +706,6 @@ cdef _array_to_datetime_object(
# 1) NaT or NaT-like values
# 2) datetime strings, which we return as datetime.datetime
# 3) special strings - "now" & "today"
- unique_timezones = set()
for i in range(n):
# Analogous to: val = values[i]
val = <object>(<PyObject**>cnp.PyArray_MultiIter_DATA(mi, 1))[0]
@@ -640,7 +725,7 @@ cdef _array_to_datetime_object(
try:
tsobj = convert_str_to_tsobject(
- val, None, unit="ns", dayfirst=dayfirst, yearfirst=yearfirst
+ val, None, dayfirst=dayfirst, yearfirst=yearfirst
)
tsobj.ensure_reso(NPY_FR_ns, val)
@@ -650,7 +735,6 @@ cdef _array_to_datetime_object(
tzinfo=tsobj.tzinfo,
fold=tsobj.fold,
)
- unique_timezones.add(tsobj.tzinfo)
except (ValueError, OverflowError) as ex:
ex.args = (f"{ex}, at position {i}", )
@@ -668,20 +752,21 @@ cdef _array_to_datetime_object(
cnp.PyArray_MultiIter_NEXT(mi)
- if len(unique_timezones) > 1:
- warnings.warn(
- "In a future version of pandas, parsing datetimes with mixed time "
- "zones will raise a warning unless `utc=True`. "
- "Please specify `utc=True` to opt in to the new behaviour "
- "and silence this warning. To create a `Series` with mixed offsets and "
- "`object` dtype, please use `apply` and `datetime.datetime.strptime`",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
+ warnings.warn(
+ "In a future version of pandas, parsing datetimes with mixed time "
+ "zones will raise an error unless `utc=True`. "
+ "Please specify `utc=True` to opt in to the new behaviour "
+ "and silence this warning. To create a `Series` with mixed offsets and "
+ "`object` dtype, please use `apply` and `datetime.datetime.strptime`",
+ FutureWarning,
+ stacklevel=find_stack_level(),
+ )
return oresult_nd, None
-def array_to_datetime_with_tz(ndarray values, tzinfo tz):
+def array_to_datetime_with_tz(
+ ndarray values, tzinfo tz, bint dayfirst, bint yearfirst, NPY_DATETIMEUNIT creso
+):
"""
Vectorized analogue to pd.Timestamp(value, tz=tz)
@@ -697,7 +782,16 @@ def array_to_datetime_with_tz(ndarray values, tzinfo tz):
Py_ssize_t i, n = values.size
object item
int64_t ival
- datetime ts
+ _TSObject tsobj
+ bint infer_reso = creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC
+ DatetimeParseState state = DatetimeParseState(creso)
+ str abbrev
+
+ if infer_reso:
+ # We treat ints/floats as nanoseconds
+ abbrev = "ns"
+ else:
+ abbrev = npy_unit_to_abbrev(creso)
for i in range(n):
# Analogous to `item = values[i]`
@@ -708,21 +802,42 @@ def array_to_datetime_with_tz(ndarray values, tzinfo tz):
ival = NPY_NAT
else:
- ts = Timestamp(item)
- if ts is NaT:
- ival = NPY_NAT
- else:
- if ts.tzinfo is not None:
- ts = ts.tz_convert(tz)
- else:
- # datetime64, tznaive pydatetime, int, float
- ts = ts.tz_localize(tz)
- ts = ts.as_unit("ns")
- ival = ts._value
+ tsobj = convert_to_tsobject(
+ item,
+ tz=tz,
+ unit=abbrev,
+ dayfirst=dayfirst,
+ yearfirst=yearfirst,
+ nanos=0,
+ )
+ if tsobj.value != NPY_NAT:
+ state.update_creso(tsobj.creso)
+ if infer_reso:
+ creso = state.creso
+ tsobj.ensure_reso(creso, item, round_ok=True)
+ ival = tsobj.value
# Analogous to: result[i] = ival
(<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = ival
cnp.PyArray_MultiIter_NEXT(mi)
+ if infer_reso:
+ if state.creso_ever_changed:
+ # We encountered mismatched resolutions, need to re-parse with
+ # the correct one.
+ return array_to_datetime_with_tz(
+ values, tz=tz, dayfirst=dayfirst, yearfirst=yearfirst, creso=creso
+ )
+ elif creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
+ # i.e. we never encountered anything non-NaT, default to "s". This
+ # ensures that insert and concat-like operations with NaT
+ # do not upcast units
+ result = result.view("M8[s]")
+ else:
+ # Otherwise we can use the single reso that we encountered and avoid
+ # a second pass.
+ abbrev = npy_unit_to_abbrev(creso)
+ result = result.view(f"M8[{abbrev}]")
+ else:
+ abbrev = npy_unit_to_abbrev(creso)
+ result = result.view(f"M8[{abbrev}]")
return result
diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py
index 2cabbe3ff07da..88a9a259ac8ec 100644
--- a/pandas/_libs/tslibs/__init__.py
+++ b/pandas/_libs/tslibs/__init__.py
@@ -30,18 +30,16 @@
"get_unit_from_dtype",
"periods_per_day",
"periods_per_second",
- "is_supported_unit",
- "npy_unit_to_abbrev",
- "get_supported_reso",
+ "guess_datetime_format",
+ "add_overflowsafe",
+ "get_supported_dtype",
+ "is_supported_dtype",
]
from pandas._libs.tslibs import dtypes # pylint: disable=import-self
from pandas._libs.tslibs.conversion import localize_pydatetime
from pandas._libs.tslibs.dtypes import (
Resolution,
- get_supported_reso,
- is_supported_unit,
- npy_unit_to_abbrev,
periods_per_day,
periods_per_second,
)
@@ -54,7 +52,10 @@
from pandas._libs.tslibs.np_datetime import (
OutOfBoundsDatetime,
OutOfBoundsTimedelta,
+ add_overflowsafe,
astype_overflowsafe,
+ get_supported_dtype,
+ is_supported_dtype,
is_unitless,
py_get_unit_from_dtype as get_unit_from_dtype,
)
@@ -63,6 +64,7 @@
Tick,
to_offset,
)
+from pandas._libs.tslibs.parsing import guess_datetime_format
from pandas._libs.tslibs.period import (
IncompatibleFrequency,
Period,
diff --git a/pandas/_libs/tslibs/ccalendar.pxd b/pandas/_libs/tslibs/ccalendar.pxd
index ca9e01ffa3dbe..3c0f7ea824396 100644
--- a/pandas/_libs/tslibs/ccalendar.pxd
+++ b/pandas/_libs/tslibs/ccalendar.pxd
@@ -15,6 +15,6 @@ cpdef int32_t get_day_of_year(int year, int month, int day) noexcept nogil
cpdef int get_lastbday(int year, int month) noexcept nogil
cpdef int get_firstbday(int year, int month) noexcept nogil
-cdef dict c_MONTH_NUMBERS
+cdef dict c_MONTH_NUMBERS, MONTH_TO_CAL_NUM
cdef int32_t* month_offset
diff --git a/pandas/_libs/tslibs/ccalendar.pyx b/pandas/_libs/tslibs/ccalendar.pyx
index 1db355fa91a20..536fa19f4c5d7 100644
--- a/pandas/_libs/tslibs/ccalendar.pyx
+++ b/pandas/_libs/tslibs/ccalendar.pyx
@@ -38,7 +38,7 @@ MONTHS_FULL = ["", "January", "February", "March", "April", "May", "June",
MONTH_NUMBERS = {name: num for num, name in enumerate(MONTHS)}
cdef dict c_MONTH_NUMBERS = MONTH_NUMBERS
MONTH_ALIASES = {(num + 1): name for num, name in enumerate(MONTHS)}
-MONTH_TO_CAL_NUM = {name: num + 1 for num, name in enumerate(MONTHS)}
+cdef dict MONTH_TO_CAL_NUM = {name: num + 1 for num, name in enumerate(MONTHS)}
DAYS = ["MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN"]
DAYS_FULL = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd
index 1b3214605582a..6b9f41b1bb06f 100644
--- a/pandas/_libs/tslibs/conversion.pxd
+++ b/pandas/_libs/tslibs/conversion.pxd
@@ -24,7 +24,9 @@ cdef class _TSObject:
bint fold
NPY_DATETIMEUNIT creso
- cdef int64_t ensure_reso(self, NPY_DATETIMEUNIT creso, str val=*) except? -1
+ cdef int64_t ensure_reso(
+ self, NPY_DATETIMEUNIT creso, val=*, bint round_ok=*
+ ) except? -1
cdef _TSObject convert_to_tsobject(object ts, tzinfo tz, str unit,
@@ -35,7 +37,7 @@ cdef _TSObject convert_datetime_to_tsobject(datetime ts, tzinfo tz,
int32_t nanos=*,
NPY_DATETIMEUNIT reso=*)
-cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, str unit,
+cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz,
bint dayfirst=*,
bint yearfirst=*)
@@ -43,20 +45,15 @@ cdef int64_t get_datetime64_nanos(object val, NPY_DATETIMEUNIT reso) except? -1
cpdef datetime localize_pydatetime(datetime dt, tzinfo tz)
cdef int64_t cast_from_unit(object ts, str unit, NPY_DATETIMEUNIT out_reso=*) except? -1
-cpdef (int64_t, int) precision_from_unit(str unit, NPY_DATETIMEUNIT out_reso=*)
+cdef (int64_t, int) precision_from_unit(
+ NPY_DATETIMEUNIT in_reso, NPY_DATETIMEUNIT out_reso=*
+)
cdef maybe_localize_tso(_TSObject obj, tzinfo tz, NPY_DATETIMEUNIT reso)
-cdef tzinfo convert_timezone(
- tzinfo tz_in,
- tzinfo tz_out,
- bint found_naive,
- bint found_tz,
- bint utc_convert,
-)
cdef int64_t parse_pydatetime(
datetime val,
npy_datetimestruct *dts,
- bint utc_convert,
+ NPY_DATETIMEUNIT creso,
) except? -1
diff --git a/pandas/_libs/tslibs/conversion.pyi b/pandas/_libs/tslibs/conversion.pyi
index d564d767f7f05..26affae577f4d 100644
--- a/pandas/_libs/tslibs/conversion.pyi
+++ b/pandas/_libs/tslibs/conversion.pyi
@@ -8,7 +8,7 @@ import numpy as np
DT64NS_DTYPE: np.dtype
TD64NS_DTYPE: np.dtype
-def precision_from_unit(
- unit: str,
-) -> tuple[int, int]: ... # (int64_t, _)
def localize_pydatetime(dt: datetime, tz: tzinfo | None) -> datetime: ...
+def cast_from_unit_vectorized(
+ values: np.ndarray, unit: str, out_unit: str = ...
+) -> np.ndarray: ...
diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx
index 45c4d7809fe7a..3a55f5fa0c003 100644
--- a/pandas/_libs/tslibs/conversion.pyx
+++ b/pandas/_libs/tslibs/conversion.pyx
@@ -1,8 +1,11 @@
+cimport cython
+
import numpy as np
cimport numpy as cnp
from libc.math cimport log10
from numpy cimport (
+ float64_t,
int32_t,
int64_t,
)
@@ -26,21 +29,22 @@ from cpython.datetime cimport (
import_datetime()
from pandas._libs.missing cimport checknull_with_nat_and_na
-from pandas._libs.tslibs.base cimport ABCTimestamp
from pandas._libs.tslibs.dtypes cimport (
abbrev_to_npy_unit,
get_supported_reso,
+ npy_unit_to_attrname,
periods_per_second,
)
from pandas._libs.tslibs.np_datetime cimport (
NPY_DATETIMEUNIT,
NPY_FR_ns,
NPY_FR_us,
+ astype_overflowsafe,
check_dts_bounds,
convert_reso,
+ dts_to_iso_string,
get_conversion_factor,
get_datetime64_unit,
- get_datetime64_value,
get_implementation_bounds,
import_pandas_datetime,
npy_datetime,
@@ -61,6 +65,7 @@ from pandas._libs.tslibs.nattype cimport (
c_nat_strings as nat_strings,
)
from pandas._libs.tslibs.parsing cimport parse_datetime_string
+from pandas._libs.tslibs.timestamps cimport _Timestamp
from pandas._libs.tslibs.timezones cimport (
get_utcoffset,
is_utc,
@@ -70,9 +75,9 @@ from pandas._libs.tslibs.tzconversion cimport (
tz_localize_to_utc_single,
)
from pandas._libs.tslibs.util cimport (
- is_datetime64_object,
is_float_object,
is_integer_object,
+ is_nan,
)
# ----------------------------------------------------------------------
@@ -85,6 +90,79 @@ TD64NS_DTYPE = np.dtype("m8[ns]")
# ----------------------------------------------------------------------
# Unit Conversion Helpers
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.overflowcheck(True)
+def cast_from_unit_vectorized(
+ ndarray values,
+ str unit,
+ str out_unit="ns",
+):
+ """
+ Vectorized analogue to cast_from_unit.
+ """
+ cdef:
+ int64_t m
+ int p
+ NPY_DATETIMEUNIT in_reso, out_reso
+ Py_ssize_t i
+
+ assert values.dtype.kind == "f"
+
+ if unit in "YM":
+ if not (((values % 1) == 0) | np.isnan(values)).all():
+ # GH#47267 it is clear that 2 "M" corresponds to 1970-02-01,
+ # but not clear what 2.5 "M" corresponds to, so we will
+ # disallow that case.
+ raise ValueError(
+ f"Conversion of non-round float with unit={unit} "
+ "is ambiguous"
+ )
+
+ # GH#47266 go through np.datetime64 to avoid weird results e.g. with "Y"
+ # and 150 we'd get 2120-01-01 09:00:00
+ values = values.astype(f"M8[{unit}]")
+ dtype = np.dtype(f"M8[{out_unit}]")
+ return astype_overflowsafe(values, dtype=dtype, copy=False).view("i8")
+
+ in_reso = abbrev_to_npy_unit(unit)
+ out_reso = abbrev_to_npy_unit(out_unit)
+ m, p = precision_from_unit(in_reso, out_reso)
+
+ cdef:
+ ndarray[int64_t] base, out
+ ndarray[float64_t] frac
+ tuple shape = (<object>values).shape
+
+ out = np.empty(shape, dtype="i8")
+ base = np.empty(shape, dtype="i8")
+ frac = np.empty(shape, dtype="f8")
+
+ for i in range(len(values)):
+ if is_nan(values[i]):
+ base[i] = NPY_NAT
+ else:
+ base[i] = <int64_t>values[i]
+ frac[i] = values[i] - base[i]
+
+ if p:
+ frac = np.round(frac, p)
+
+ try:
+ for i in range(len(values)):
+ if base[i] == NPY_NAT:
+ out[i] = NPY_NAT
+ else:
+ out[i] = <int64_t>(base[i] * m) + <int64_t>(frac[i] * m)
+ except (OverflowError, FloatingPointError) as err:
+ # FloatingPointError can be issued if we have float dtype and have
+ # set np.errstate(over="raise")
+ raise OutOfBoundsDatetime(
+ f"cannot convert input {values[i]} with the unit '{unit}'"
+ ) from err
+ return out
+
+
cdef int64_t cast_from_unit(
object ts,
str unit,
@@ -106,6 +184,7 @@ cdef int64_t cast_from_unit(
cdef:
int64_t m
int p
+ NPY_DATETIMEUNIT in_reso
if unit in ["Y", "M"]:
if is_float_object(ts) and not ts.is_integer():
@@ -123,7 +202,14 @@ cdef int64_t cast_from_unit(
dt64obj = np.datetime64(ts, unit)
return get_datetime64_nanos(dt64obj, out_reso)
- m, p = precision_from_unit(unit, out_reso)
+ in_reso = abbrev_to_npy_unit(unit)
+ if out_reso < in_reso and in_reso != NPY_DATETIMEUNIT.NPY_FR_GENERIC:
+ # We will end up rounding (always *down*), so don't need the fractional
+ # part of `ts`.
+ m, _ = precision_from_unit(out_reso, in_reso)
+ return (<int64_t>ts) // m
+
+ m, p = precision_from_unit(in_reso, out_reso)
# cast the unit, multiply base/frac separately
# to avoid precision issues from float -> int
@@ -146,8 +232,8 @@ cdef int64_t cast_from_unit(
) from err
-cpdef inline (int64_t, int) precision_from_unit(
- str unit,
+cdef (int64_t, int) precision_from_unit(
+ NPY_DATETIMEUNIT in_reso,
NPY_DATETIMEUNIT out_reso=NPY_DATETIMEUNIT.NPY_FR_ns,
):
"""
@@ -163,17 +249,16 @@ cpdef inline (int64_t, int) precision_from_unit(
int64_t m
int64_t multiplier
int p
- NPY_DATETIMEUNIT reso = abbrev_to_npy_unit(unit)
- if reso == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
- reso = NPY_DATETIMEUNIT.NPY_FR_ns
- if reso == NPY_DATETIMEUNIT.NPY_FR_Y:
+ if in_reso == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
+ in_reso = NPY_DATETIMEUNIT.NPY_FR_ns
+ if in_reso == NPY_DATETIMEUNIT.NPY_FR_Y:
# each 400 years we have 97 leap years, for an average of 97/400=.2425
# extra days each year. We get 31556952 by writing
# 3600*24*365.2425=31556952
multiplier = periods_per_second(out_reso)
m = multiplier * 31556952
- elif reso == NPY_DATETIMEUNIT.NPY_FR_M:
+ elif in_reso == NPY_DATETIMEUNIT.NPY_FR_M:
# 2629746 comes from dividing the "Y" case by 12.
multiplier = periods_per_second(out_reso)
m = multiplier * 2629746
@@ -181,7 +266,7 @@ cpdef inline (int64_t, int) precision_from_unit(
# Careful: if get_conversion_factor raises, the exception does
# not propagate, instead we get a warning about an ignored exception.
# https://github.com/pandas-dev/pandas/pull/51483#discussion_r1115198951
- m = get_conversion_factor(reso, out_reso)
+ m = get_conversion_factor(in_reso, out_reso)
p = log10(m) # number of digits in 'm' minus 1
return m, p
@@ -197,7 +282,7 @@ cdef int64_t get_datetime64_nanos(object val, NPY_DATETIMEUNIT reso) except? -1:
NPY_DATETIMEUNIT unit
npy_datetime ival
- ival = get_datetime64_value(val)
+ ival = cnp.get_datetime64_value(val)
if ival == NPY_NAT:
return NPY_NAT
@@ -205,8 +290,13 @@ cdef int64_t get_datetime64_nanos(object val, NPY_DATETIMEUNIT reso) except? -1:
if unit != reso:
pandas_datetime_to_datetimestruct(ival, unit, &dts)
- check_dts_bounds(&dts, reso)
- ival = npy_datetimestruct_to_datetime(reso, &dts)
+ try:
+ ival = npy_datetimestruct_to_datetime(reso, &dts)
+ except OverflowError as err:
+ attrname = npy_unit_to_attrname[reso]
+ raise OutOfBoundsDatetime(
+ f"Out of bounds {attrname} timestamp: {val}"
+ ) from err
return ival
@@ -228,14 +318,19 @@ cdef class _TSObject:
self.fold = 0
self.creso = NPY_FR_ns # default value
- cdef int64_t ensure_reso(self, NPY_DATETIMEUNIT creso, str val=None) except? -1:
+ cdef int64_t ensure_reso(
+ self, NPY_DATETIMEUNIT creso, val=None, bint round_ok=False
+ ) except? -1:
if self.creso != creso:
try:
- self.value = convert_reso(self.value, self.creso, creso, False)
+ self.value = convert_reso(
+ self.value, self.creso, creso, round_ok=round_ok
+ )
except OverflowError as err:
if val is not None:
+ attrname = npy_unit_to_attrname[creso]
raise OutOfBoundsDatetime(
- f"Out of bounds nanosecond timestamp: {val}"
+ f"Out of bounds {attrname} timestamp: {val}"
) from err
raise OutOfBoundsDatetime from err
@@ -266,16 +361,21 @@ cdef _TSObject convert_to_tsobject(object ts, tzinfo tz, str unit,
obj = _TSObject()
if isinstance(ts, str):
- return convert_str_to_tsobject(ts, tz, unit, dayfirst, yearfirst)
+ return convert_str_to_tsobject(ts, tz, dayfirst, yearfirst)
if checknull_with_nat_and_na(ts):
obj.value = NPY_NAT
- elif is_datetime64_object(ts):
+ elif cnp.is_datetime64_object(ts):
reso = get_supported_reso(get_datetime64_unit(ts))
obj.creso = reso
obj.value = get_datetime64_nanos(ts, reso)
if obj.value != NPY_NAT:
pandas_datetime_to_datetimestruct(obj.value, reso, &obj.dts)
+ if tz is not None:
+ # GH#24559, GH#42288 We treat np.datetime64 objects as *wall* times
+ obj.value = tz_localize_to_utc_single(
+ obj.value, tz, ambiguous="raise", nonexistent=None, creso=reso
+ )
elif is_integer_object(ts):
try:
ts = <int64_t>ts
@@ -302,8 +402,8 @@ cdef _TSObject convert_to_tsobject(object ts, tzinfo tz, str unit,
pandas_datetime_to_datetimestruct(ts, NPY_FR_ns, &obj.dts)
elif PyDateTime_Check(ts):
if nanos == 0:
- if isinstance(ts, ABCTimestamp):
- reso = abbrev_to_npy_unit(ts.unit) # TODO: faster way to do this?
+ if isinstance(ts, _Timestamp):
+ reso = (<_Timestamp>ts)._creso
else:
# TODO: what if user explicitly passes nanos=0?
reso = NPY_FR_us
@@ -391,62 +491,43 @@ cdef _TSObject convert_datetime_to_tsobject(
pydatetime_to_dtstruct(ts, &obj.dts)
obj.tzinfo = ts.tzinfo
- if isinstance(ts, ABCTimestamp):
+ if isinstance(ts, _Timestamp):
obj.dts.ps = ts.nanosecond * 1000
if nanos:
obj.dts.ps = nanos * 1000
- obj.value = npy_datetimestruct_to_datetime(reso, &obj.dts)
+ try:
+ obj.value = npy_datetimestruct_to_datetime(reso, &obj.dts)
+ except OverflowError as err:
+ attrname = npy_unit_to_attrname[reso]
+ raise OutOfBoundsDatetime(f"Out of bounds {attrname} timestamp") from err
if obj.tzinfo is not None and not is_utc(obj.tzinfo):
offset = get_utcoffset(obj.tzinfo, ts)
pps = periods_per_second(reso)
obj.value -= int(offset.total_seconds() * pps)
- check_dts_bounds(&obj.dts, reso)
check_overflows(obj, reso)
return obj
-cdef _TSObject _create_tsobject_tz_using_offset(npy_datetimestruct dts,
- int tzoffset, tzinfo tz=None,
- NPY_DATETIMEUNIT reso=NPY_FR_ns):
+cdef _adjust_tsobject_tz_using_offset(_TSObject obj, tzinfo tz):
"""
- Convert a datetimestruct `dts`, along with initial timezone offset
- `tzoffset` to a _TSObject (with timezone object `tz` - optional).
+ Convert a datetimestruct `obj.dts`, with an attached tzinfo to a new
+ user-provided tz.
Parameters
----------
- dts : npy_datetimestruct
- tzoffset : int
- tz : tzinfo or None
- timezone for the timezone-aware output.
- reso : NPY_DATETIMEUNIT, default NPY_FR_ns
-
- Returns
- -------
obj : _TSObject
+ tz : tzinfo
+ timezone for the timezone-aware output.
"""
cdef:
- _TSObject obj = _TSObject()
- int64_t value # numpy dt64
datetime dt
Py_ssize_t pos
-
- value = npy_datetimestruct_to_datetime(reso, &dts)
- obj.dts = dts
- obj.tzinfo = timezone(timedelta(minutes=tzoffset))
- obj.value = tz_localize_to_utc_single(
- value, obj.tzinfo, ambiguous=None, nonexistent=None, creso=reso
- )
- obj.creso = reso
- if tz is None:
- check_overflows(obj, reso)
- return obj
-
- cdef:
- Localizer info = Localizer(tz, reso)
+ int64_t ps = obj.dts.ps
+ Localizer info = Localizer(tz, obj.creso)
# Infer fold from offset-adjusted obj.value
# see PEP 495 https://www.python.org/dev/peps/pep-0495/#the-fold-attribute
@@ -462,13 +543,18 @@ cdef _TSObject _create_tsobject_tz_using_offset(npy_datetimestruct dts,
dt = datetime(obj.dts.year, obj.dts.month, obj.dts.day,
obj.dts.hour, obj.dts.min, obj.dts.sec,
obj.dts.us, obj.tzinfo, fold=obj.fold)
- obj = convert_datetime_to_tsobject(
- dt, tz, nanos=obj.dts.ps // 1000)
- obj.ensure_reso(reso) # TODO: more performant to get reso right up front?
- return obj
+ # The rest here is similar to the 2-tz path in convert_datetime_to_tsobject
+ # but avoids re-calculating obj.value
+ dt = dt.astimezone(tz)
+ pydatetime_to_dtstruct(dt, &obj.dts)
+ obj.tzinfo = dt.tzinfo
+ obj.dts.ps = ps
+ check_dts_bounds(&obj.dts, obj.creso)
+ check_overflows(obj, obj.creso)
-cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, str unit,
+
+cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz,
bint dayfirst=False,
bint yearfirst=False):
"""
@@ -484,7 +570,6 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, str unit,
Value to be converted to _TSObject
tz : tzinfo or None
timezone for the timezone-aware output
- unit : str or None
dayfirst : bool, default False
When parsing an ambiguous date string, interpret e.g. "3/4/1975" as
April 3, as opposed to the standard US interpretation March 4.
@@ -500,8 +585,9 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, str unit,
npy_datetimestruct dts
int out_local = 0, out_tzoffset = 0, string_to_dts_failed
datetime dt
- int64_t ival
+ int64_t ival, nanos = 0
NPY_DATETIMEUNIT out_bestunit, reso
+ _TSObject obj
if len(ts) == 0 or ts in nat_strings:
obj = _TSObject()
@@ -512,11 +598,13 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, str unit,
# Issue 9000, we short-circuit rather than going
# into np_datetime_strings which returns utc
dt = datetime.now(tz)
+ return convert_datetime_to_tsobject(dt, tz, nanos=0, reso=NPY_FR_us)
elif ts == "today":
# Issue 9000, we short-circuit rather than going
# into np_datetime_strings which returns a normalized datetime
dt = datetime.now(tz)
# equiv: datetime.today().replace(tzinfo=tz)
+ return convert_datetime_to_tsobject(dt, tz, nanos=0, reso=NPY_FR_us)
else:
string_to_dts_failed = string_to_dts(
ts, &dts, &out_bestunit, &out_local,
@@ -525,31 +613,40 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, str unit,
if not string_to_dts_failed:
reso = get_supported_reso(out_bestunit)
check_dts_bounds(&dts, reso)
+ obj = _TSObject()
+ obj.dts = dts
+ obj.creso = reso
+ ival = npy_datetimestruct_to_datetime(reso, &dts)
+
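+ # out_local == 1 means the string carried its own UTC offset, which
+ # string_to_dts reports (in minutes) via out_tzoffset.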
if out_local == 1:
- return _create_tsobject_tz_using_offset(
- dts, out_tzoffset, tz, reso
+ obj.tzinfo = timezone(timedelta(minutes=out_tzoffset))
+ obj.value = tz_localize_to_utc_single(
+ ival, obj.tzinfo, ambiguous="raise", nonexistent=None, creso=reso
)
+ if tz is None:
+ check_overflows(obj, reso)
+ return obj
+ _adjust_tsobject_tz_using_offset(obj, tz)
+ return obj
else:
- ival = npy_datetimestruct_to_datetime(reso, &dts)
if tz is not None:
# shift for _localize_tso
ival = tz_localize_to_utc_single(
ival, tz, ambiguous="raise", nonexistent=None, creso=reso
)
- obj = _TSObject()
- obj.dts = dts
obj.value = ival
- obj.creso = reso
maybe_localize_tso(obj, tz, obj.creso)
return obj
dt = parse_datetime_string(
- ts, dayfirst=dayfirst, yearfirst=yearfirst, out_bestunit=&out_bestunit
+ ts,
+ dayfirst=dayfirst,
+ yearfirst=yearfirst,
+ out_bestunit=&out_bestunit,
+ nanos=&nanos,
)
reso = get_supported_reso(out_bestunit)
- return convert_datetime_to_tsobject(dt, tz, nanos=0, reso=reso)
-
- return convert_datetime_to_tsobject(dt, tz)
+ return convert_datetime_to_tsobject(dt, tz, nanos=nanos, reso=reso)
cdef check_overflows(_TSObject obj, NPY_DATETIMEUNIT reso=NPY_FR_ns):
@@ -578,18 +675,18 @@ cdef check_overflows(_TSObject obj, NPY_DATETIMEUNIT reso=NPY_FR_ns):
if obj.dts.year == lb.year:
if not (obj.value < 0):
from pandas._libs.tslibs.timestamps import Timestamp
- fmt = (f"{obj.dts.year}-{obj.dts.month:02d}-{obj.dts.day:02d} "
- f"{obj.dts.hour:02d}:{obj.dts.min:02d}:{obj.dts.sec:02d}")
+ fmt = dts_to_iso_string(&obj.dts)
+ min_ts = (<_Timestamp>Timestamp(0))._as_creso(reso).min
raise OutOfBoundsDatetime(
- f"Converting {fmt} underflows past {Timestamp.min}"
+ f"Converting {fmt} underflows past {min_ts}"
)
elif obj.dts.year == ub.year:
if not (obj.value > 0):
from pandas._libs.tslibs.timestamps import Timestamp
- fmt = (f"{obj.dts.year}-{obj.dts.month:02d}-{obj.dts.day:02d} "
- f"{obj.dts.hour:02d}:{obj.dts.min:02d}:{obj.dts.sec:02d}")
+ fmt = dts_to_iso_string(&obj.dts)
+ max_ts = (<_Timestamp>Timestamp(0))._as_creso(reso).max
raise OutOfBoundsDatetime(
- f"Converting {fmt} overflows past {Timestamp.max}"
+ f"Converting {fmt} overflows past {max_ts}"
)
# ----------------------------------------------------------------------
@@ -647,7 +744,8 @@ cdef datetime _localize_pydatetime(datetime dt, tzinfo tz):
"""
try:
# datetime.replace with pytz may be incorrect result
- return tz.localize(dt)
+ # TODO: try to respect `fold` attribute
+ return tz.localize(dt, is_dst=None)
except AttributeError:
return dt.replace(tzinfo=tz)
@@ -667,68 +765,15 @@ cpdef inline datetime localize_pydatetime(datetime dt, tzinfo tz):
"""
if tz is None:
return dt
- elif isinstance(dt, ABCTimestamp):
+ elif isinstance(dt, _Timestamp):
return dt.tz_localize(tz)
return _localize_pydatetime(dt, tz)
-cdef tzinfo convert_timezone(
- tzinfo tz_in,
- tzinfo tz_out,
- bint found_naive,
- bint found_tz,
- bint utc_convert,
-):
- """
- Validate that ``tz_in`` can be converted/localized to ``tz_out``.
-
- Parameters
- ----------
- tz_in : tzinfo or None
- Timezone info of element being processed.
- tz_out : tzinfo or None
- Timezone info of output.
- found_naive : bool
- Whether a timezone-naive element has been found so far.
- found_tz : bool
- Whether a timezone-aware element has been found so far.
- utc_convert : bool
- Whether to convert/localize to UTC.
-
- Returns
- -------
- tz_info
- Timezone info of output.
-
- Raises
- ------
- ValueError
- If ``tz_in`` can't be converted/localized to ``tz_out``.
- """
- if tz_in is not None:
- if utc_convert:
- pass
- elif found_naive:
- raise ValueError("Tz-aware datetime.datetime "
- "cannot be converted to "
- "datetime64 unless utc=True")
- elif tz_out is not None and not tz_compare(tz_out, tz_in):
- raise ValueError("Tz-aware datetime.datetime "
- "cannot be converted to "
- "datetime64 unless utc=True")
- else:
- tz_out = tz_in
- else:
- if found_tz and not utc_convert:
- raise ValueError("Cannot mix tz-aware with "
- "tz-naive values")
- return tz_out
-
-
cdef int64_t parse_pydatetime(
datetime val,
npy_datetimestruct *dts,
- bint utc_convert,
+ NPY_DATETIMEUNIT creso,
) except? -1:
"""
Convert pydatetime to datetime64.
@@ -739,8 +784,8 @@ cdef int64_t parse_pydatetime(
Element being processed.
dts : *npy_datetimestruct
Needed to use in pydatetime_to_dt64, which writes to it.
- utc_convert : bool
- Whether to convert/localize to UTC.
+ creso : NPY_DATETIMEUNIT
+ Resolution in which to store the result.
Raises
------
@@ -751,18 +796,11 @@ cdef int64_t parse_pydatetime(
int64_t result
if val.tzinfo is not None:
- if utc_convert:
- _ts = convert_datetime_to_tsobject(val, None)
- _ts.ensure_reso(NPY_FR_ns)
- result = _ts.value
- else:
- _ts = convert_datetime_to_tsobject(val, None)
- _ts.ensure_reso(NPY_FR_ns)
- result = _ts.value
+ _ts = convert_datetime_to_tsobject(val, None, nanos=0, reso=creso)
+ result = _ts.value
else:
- if isinstance(val, ABCTimestamp):
- result = val.as_unit("ns")._value
+ if isinstance(val, _Timestamp):
+ result = (<_Timestamp>val)._as_creso(creso, round_ok=True)._value
else:
- result = pydatetime_to_dt64(val, dts)
- check_dts_bounds(dts)
+ result = pydatetime_to_dt64(val, dts, reso=creso)
return result
diff --git a/pandas/_libs/tslibs/dtypes.pxd b/pandas/_libs/tslibs/dtypes.pxd
index a8dd88c763c14..88cfa6ca60d93 100644
--- a/pandas/_libs/tslibs/dtypes.pxd
+++ b/pandas/_libs/tslibs/dtypes.pxd
@@ -3,14 +3,19 @@ from numpy cimport int64_t
from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT
-cpdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit)
+cdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit)
cpdef NPY_DATETIMEUNIT abbrev_to_npy_unit(str abbrev)
cdef NPY_DATETIMEUNIT freq_group_code_to_npy_unit(int freq) noexcept nogil
cpdef int64_t periods_per_day(NPY_DATETIMEUNIT reso=*) except? -1
cpdef int64_t periods_per_second(NPY_DATETIMEUNIT reso) except? -1
-cpdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso)
-cpdef bint is_supported_unit(NPY_DATETIMEUNIT reso)
-
+cdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso)
+cdef bint is_supported_unit(NPY_DATETIMEUNIT reso)
+
+cpdef freq_to_period_freqstr(freq_n, freq_name)
+cdef dict c_OFFSET_TO_PERIOD_FREQSTR
+cdef dict c_OFFSET_DEPR_FREQSTR
+cdef dict c_REVERSE_OFFSET_DEPR_FREQSTR
+cdef dict c_DEPR_ABBREVS
cdef dict attrname_to_abbrevs
cdef dict npy_unit_to_attrname
cdef dict attrname_to_npy_unit
diff --git a/pandas/_libs/tslibs/dtypes.pyi b/pandas/_libs/tslibs/dtypes.pyi
index bea3e18273318..7fdeb88d498ac 100644
--- a/pandas/_libs/tslibs/dtypes.pyi
+++ b/pandas/_libs/tslibs/dtypes.pyi
@@ -1,16 +1,11 @@
from enum import Enum
-# These are not public API, but are exposed in the .pyi file because they
-# are imported in tests.
-_attrname_to_abbrevs: dict[str, str]
-_period_code_map: dict[str, int]
+OFFSET_TO_PERIOD_FREQSTR: dict[str, str]
-def periods_per_day(reso: int) -> int: ...
+def periods_per_day(reso: int = ...) -> int: ...
def periods_per_second(reso: int) -> int: ...
-def is_supported_unit(reso: int) -> bool: ...
-def npy_unit_to_abbrev(reso: int) -> str: ...
-def get_supported_reso(reso: int) -> int: ...
-def abbrev_to_npy_unit(abbrev: str) -> int: ...
+def abbrev_to_npy_unit(abbrev: str | None) -> int: ...
+def freq_to_period_freqstr(freq_n: int, freq_name: str) -> str: ...
class PeriodDtypeBase:
_dtype_code: int # PeriodDtypeCode
diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx
index 19f4c83e6cecf..52e1133e596c5 100644
--- a/pandas/_libs/tslibs/dtypes.pyx
+++ b/pandas/_libs/tslibs/dtypes.pyx
@@ -1,7 +1,11 @@
# period frequency constants corresponding to scikits timeseries
# originals
from enum import Enum
+import warnings
+from pandas.util._exceptions import find_stack_level
+
+from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS
from pandas._libs.tslibs.np_datetime cimport (
NPY_DATETIMEUNIT,
get_conversion_factor,
@@ -10,7 +14,6 @@ from pandas._libs.tslibs.np_datetime cimport (
import_pandas_datetime()
-
cdef class PeriodDtypeBase:
"""
Similar to an actual dtype, this contains all of the information
@@ -44,7 +47,7 @@ cdef class PeriodDtypeBase:
def _resolution_obj(self) -> "Resolution":
fgc = self._freq_group_code
freq_group = FreqGroup(fgc)
- abbrev = _reverse_period_code_map[freq_group.value].split("-")[0]
+ abbrev = _period_code_to_abbrev[freq_group.value].split("-")[0]
if abbrev == "B":
return Resolution.RESO_DAY
attrname = _abbrev_to_attrnames[abbrev]
@@ -53,7 +56,7 @@ cdef class PeriodDtypeBase:
@property
def _freqstr(self) -> str:
# Will be passed to to_offset in Period._maybe_convert_freq
- out = _reverse_period_code_map.get(self._dtype_code)
+ out = _period_code_to_abbrev.get(self._dtype_code)
if self._n == 1:
return out
return str(self._n) + out
@@ -99,19 +102,19 @@ cdef class PeriodDtypeBase:
_period_code_map = {
# Annual freqs with various fiscal year ends.
- # eg, 2005 for A-FEB runs Mar 1, 2004 to Feb 28, 2005
- "A-DEC": PeriodDtypeCode.A_DEC, # Annual - December year end
- "A-JAN": PeriodDtypeCode.A_JAN, # Annual - January year end
- "A-FEB": PeriodDtypeCode.A_FEB, # Annual - February year end
- "A-MAR": PeriodDtypeCode.A_MAR, # Annual - March year end
- "A-APR": PeriodDtypeCode.A_APR, # Annual - April year end
- "A-MAY": PeriodDtypeCode.A_MAY, # Annual - May year end
- "A-JUN": PeriodDtypeCode.A_JUN, # Annual - June year end
- "A-JUL": PeriodDtypeCode.A_JUL, # Annual - July year end
- "A-AUG": PeriodDtypeCode.A_AUG, # Annual - August year end
- "A-SEP": PeriodDtypeCode.A_SEP, # Annual - September year end
- "A-OCT": PeriodDtypeCode.A_OCT, # Annual - October year end
- "A-NOV": PeriodDtypeCode.A_NOV, # Annual - November year end
+ # eg, 2005 for Y-FEB runs Mar 1, 2004 to Feb 28, 2005
+ "Y-DEC": PeriodDtypeCode.A_DEC, # Annual - December year end
+ "Y-JAN": PeriodDtypeCode.A_JAN, # Annual - January year end
+ "Y-FEB": PeriodDtypeCode.A_FEB, # Annual - February year end
+ "Y-MAR": PeriodDtypeCode.A_MAR, # Annual - March year end
+ "Y-APR": PeriodDtypeCode.A_APR, # Annual - April year end
+ "Y-MAY": PeriodDtypeCode.A_MAY, # Annual - May year end
+ "Y-JUN": PeriodDtypeCode.A_JUN, # Annual - June year end
+ "Y-JUL": PeriodDtypeCode.A_JUL, # Annual - July year end
+ "Y-AUG": PeriodDtypeCode.A_AUG, # Annual - August year end
+ "Y-SEP": PeriodDtypeCode.A_SEP, # Annual - September year end
+ "Y-OCT": PeriodDtypeCode.A_OCT, # Annual - October year end
+ "Y-NOV": PeriodDtypeCode.A_NOV, # Annual - November year end
# Quarterly frequencies with various fiscal year ends.
# eg, Q42005 for Q-OCT runs Aug 1, 2005 to Oct 31, 2005
@@ -140,49 +143,275 @@ _period_code_map = {
"B": PeriodDtypeCode.B, # Business days
"D": PeriodDtypeCode.D, # Daily
- "H": PeriodDtypeCode.H, # Hourly
- "T": PeriodDtypeCode.T, # Minutely
- "S": PeriodDtypeCode.S, # Secondly
- "L": PeriodDtypeCode.L, # Millisecondly
- "U": PeriodDtypeCode.U, # Microsecondly
- "N": PeriodDtypeCode.N, # Nanosecondly
+ "h": PeriodDtypeCode.H, # Hourly
+ "min": PeriodDtypeCode.T, # Minutely
+ "s": PeriodDtypeCode.S, # Secondly
+ "ms": PeriodDtypeCode.L, # Millisecondly
+ "us": PeriodDtypeCode.U, # Microsecondly
+ "ns": PeriodDtypeCode.N, # Nanosecondly
}
-_reverse_period_code_map = {
+cdef dict _period_code_to_abbrev = {
_period_code_map[key]: key for key in _period_code_map}
-# Yearly aliases; careful not to put these in _reverse_period_code_map
-_period_code_map.update({"Y" + key[1:]: _period_code_map[key]
- for key in _period_code_map
- if key.startswith("A-")})
-
-_period_code_map.update({
- "Q": 2000, # Quarterly - December year end (default quarterly)
- "A": PeriodDtypeCode.A, # Annual
- "W": 4000, # Weekly
- "C": 5000, # Custom Business Day
-})
-
-cdef set _month_names = {
- x.split("-")[-1] for x in _period_code_map.keys() if x.startswith("A-")
-}
+cdef set _month_names = set(c_MONTH_NUMBERS.keys())
# Map attribute-name resolutions to resolution abbreviations
-_attrname_to_abbrevs = {
- "year": "A",
+cdef dict attrname_to_abbrevs = {
+ "year": "Y",
"quarter": "Q",
"month": "M",
"day": "D",
- "hour": "H",
- "minute": "T",
- "second": "S",
- "millisecond": "L",
- "microsecond": "U",
- "nanosecond": "N",
+ "hour": "h",
+ "minute": "min",
+ "second": "s",
+ "millisecond": "ms",
+ "microsecond": "us",
+ "nanosecond": "ns",
}
-cdef dict attrname_to_abbrevs = _attrname_to_abbrevs
cdef dict _abbrev_to_attrnames = {v: k for k, v in attrname_to_abbrevs.items()}
+OFFSET_TO_PERIOD_FREQSTR: dict = {
+ "WEEKDAY": "D",
+ "EOM": "M",
+ "BME": "M",
+ "SME": "M",
+ "BQS": "Q",
+ "QS": "Q",
+ "BQE": "Q",
+ "BQE-DEC": "Q-DEC",
+ "BQE-JAN": "Q-JAN",
+ "BQE-FEB": "Q-FEB",
+ "BQE-MAR": "Q-MAR",
+ "BQE-APR": "Q-APR",
+ "BQE-MAY": "Q-MAY",
+ "BQE-JUN": "Q-JUN",
+ "BQE-JUL": "Q-JUL",
+ "BQE-AUG": "Q-AUG",
+ "BQE-SEP": "Q-SEP",
+ "BQE-OCT": "Q-OCT",
+ "BQE-NOV": "Q-NOV",
+ "MS": "M",
+ "D": "D",
+ "B": "B",
+ "min": "min",
+ "s": "s",
+ "ms": "ms",
+ "us": "us",
+ "ns": "ns",
+ "h": "h",
+ "QE": "Q",
+ "QE-DEC": "Q-DEC",
+ "QE-JAN": "Q-JAN",
+ "QE-FEB": "Q-FEB",
+ "QE-MAR": "Q-MAR",
+ "QE-APR": "Q-APR",
+ "QE-MAY": "Q-MAY",
+ "QE-JUN": "Q-JUN",
+ "QE-JUL": "Q-JUL",
+ "QE-AUG": "Q-AUG",
+ "QE-SEP": "Q-SEP",
+ "QE-OCT": "Q-OCT",
+ "QE-NOV": "Q-NOV",
+ "YE": "Y",
+ "YE-DEC": "Y-DEC",
+ "YE-JAN": "Y-JAN",
+ "YE-FEB": "Y-FEB",
+ "YE-MAR": "Y-MAR",
+ "YE-APR": "Y-APR",
+ "YE-MAY": "Y-MAY",
+ "YE-JUN": "Y-JUN",
+ "YE-JUL": "Y-JUL",
+ "YE-AUG": "Y-AUG",
+ "YE-SEP": "Y-SEP",
+ "YE-OCT": "Y-OCT",
+ "YE-NOV": "Y-NOV",
+ "W": "W",
+ "ME": "M",
+ "Y": "Y",
+ "BYE": "Y",
+ "BYE-DEC": "Y-DEC",
+ "BYE-JAN": "Y-JAN",
+ "BYE-FEB": "Y-FEB",
+ "BYE-MAR": "Y-MAR",
+ "BYE-APR": "Y-APR",
+ "BYE-MAY": "Y-MAY",
+ "BYE-JUN": "Y-JUN",
+ "BYE-JUL": "Y-JUL",
+ "BYE-AUG": "Y-AUG",
+ "BYE-SEP": "Y-SEP",
+ "BYE-OCT": "Y-OCT",
+ "BYE-NOV": "Y-NOV",
+ "YS": "Y",
+ "BYS": "Y",
+}
+cdef dict c_OFFSET_DEPR_FREQSTR = {
+ "M": "ME",
+ "Q": "QE",
+ "Q-DEC": "QE-DEC",
+ "Q-JAN": "QE-JAN",
+ "Q-FEB": "QE-FEB",
+ "Q-MAR": "QE-MAR",
+ "Q-APR": "QE-APR",
+ "Q-MAY": "QE-MAY",
+ "Q-JUN": "QE-JUN",
+ "Q-JUL": "QE-JUL",
+ "Q-AUG": "QE-AUG",
+ "Q-SEP": "QE-SEP",
+ "Q-OCT": "QE-OCT",
+ "Q-NOV": "QE-NOV",
+ "Y": "YE",
+ "Y-DEC": "YE-DEC",
+ "Y-JAN": "YE-JAN",
+ "Y-FEB": "YE-FEB",
+ "Y-MAR": "YE-MAR",
+ "Y-APR": "YE-APR",
+ "Y-MAY": "YE-MAY",
+ "Y-JUN": "YE-JUN",
+ "Y-JUL": "YE-JUL",
+ "Y-AUG": "YE-AUG",
+ "Y-SEP": "YE-SEP",
+ "Y-OCT": "YE-OCT",
+ "Y-NOV": "YE-NOV",
+ "A": "YE",
+ "A-DEC": "YE-DEC",
+ "A-JAN": "YE-JAN",
+ "A-FEB": "YE-FEB",
+ "A-MAR": "YE-MAR",
+ "A-APR": "YE-APR",
+ "A-MAY": "YE-MAY",
+ "A-JUN": "YE-JUN",
+ "A-JUL": "YE-JUL",
+ "A-AUG": "YE-AUG",
+ "A-SEP": "YE-SEP",
+ "A-OCT": "YE-OCT",
+ "A-NOV": "YE-NOV",
+ "BY": "BYE",
+ "BY-DEC": "BYE-DEC",
+ "BY-JAN": "BYE-JAN",
+ "BY-FEB": "BYE-FEB",
+ "BY-MAR": "BYE-MAR",
+ "BY-APR": "BYE-APR",
+ "BY-MAY": "BYE-MAY",
+ "BY-JUN": "BYE-JUN",
+ "BY-JUL": "BYE-JUL",
+ "BY-AUG": "BYE-AUG",
+ "BY-SEP": "BYE-SEP",
+ "BY-OCT": "BYE-OCT",
+ "BY-NOV": "BYE-NOV",
+ "BA": "BYE",
+ "BA-DEC": "BYE-DEC",
+ "BA-JAN": "BYE-JAN",
+ "BA-FEB": "BYE-FEB",
+ "BA-MAR": "BYE-MAR",
+ "BA-APR": "BYE-APR",
+ "BA-MAY": "BYE-MAY",
+ "BA-JUN": "BYE-JUN",
+ "BA-JUL": "BYE-JUL",
+ "BA-AUG": "BYE-AUG",
+ "BA-SEP": "BYE-SEP",
+ "BA-OCT": "BYE-OCT",
+ "BA-NOV": "BYE-NOV",
+ "BM": "BME",
+ "CBM": "CBME",
+ "SM": "SME",
+ "BQ": "BQE",
+ "BQ-DEC": "BQE-DEC",
+ "BQ-JAN": "BQE-JAN",
+ "BQ-FEB": "BQE-FEB",
+ "BQ-MAR": "BQE-MAR",
+ "BQ-APR": "BQE-APR",
+ "BQ-MAY": "BQE-MAY",
+ "BQ-JUN": "BQE-JUN",
+ "BQ-JUL": "BQE-JUL",
+ "BQ-AUG": "BQE-AUG",
+ "BQ-SEP": "BQE-SEP",
+ "BQ-OCT": "BQE-OCT",
+ "BQ-NOV": "BQE-NOV",
+}
+cdef dict c_OFFSET_TO_PERIOD_FREQSTR = OFFSET_TO_PERIOD_FREQSTR
+cdef dict c_REVERSE_OFFSET_DEPR_FREQSTR = {
+ v: k for k, v in c_OFFSET_DEPR_FREQSTR.items()
+}
+
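+# Translate an offset frequency name (e.g. "ME", "QE-DEC") to the
+# corresponding period frequency string, prefixing the multiplier when
+# freq_n is not 1.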
+cpdef freq_to_period_freqstr(freq_n, freq_name):
+ if freq_n == 1:
+ freqstr = f"""{c_OFFSET_TO_PERIOD_FREQSTR.get(
+ freq_name, freq_name)}"""
+ else:
+ freqstr = f"""{freq_n}{c_OFFSET_TO_PERIOD_FREQSTR.get(
+ freq_name, freq_name)}"""
+ return freqstr
+
+# Map deprecated resolution abbreviations to correct resolution abbreviations
+cdef dict c_DEPR_ABBREVS = {
+ "A": "Y",
+ "a": "Y",
+ "A-DEC": "Y-DEC",
+ "A-JAN": "Y-JAN",
+ "A-FEB": "Y-FEB",
+ "A-MAR": "Y-MAR",
+ "A-APR": "Y-APR",
+ "A-MAY": "Y-MAY",
+ "A-JUN": "Y-JUN",
+ "A-JUL": "Y-JUL",
+ "A-AUG": "Y-AUG",
+ "A-SEP": "Y-SEP",
+ "A-OCT": "Y-OCT",
+ "A-NOV": "Y-NOV",
+ "BA": "BY",
+ "BA-DEC": "BY-DEC",
+ "BA-JAN": "BY-JAN",
+ "BA-FEB": "BY-FEB",
+ "BA-MAR": "BY-MAR",
+ "BA-APR": "BY-APR",
+ "BA-MAY": "BY-MAY",
+ "BA-JUN": "BY-JUN",
+ "BA-JUL": "BY-JUL",
+ "BA-AUG": "BY-AUG",
+ "BA-SEP": "BY-SEP",
+ "BA-OCT": "BY-OCT",
+ "BA-NOV": "BY-NOV",
+ "AS": "YS",
+ "AS-DEC": "YS-DEC",
+ "AS-JAN": "YS-JAN",
+ "AS-FEB": "YS-FEB",
+ "AS-MAR": "YS-MAR",
+ "AS-APR": "YS-APR",
+ "AS-MAY": "YS-MAY",
+ "AS-JUN": "YS-JUN",
+ "AS-JUL": "YS-JUL",
+ "AS-AUG": "YS-AUG",
+ "AS-SEP": "YS-SEP",
+ "AS-OCT": "YS-OCT",
+ "AS-NOV": "YS-NOV",
+ "BAS": "BYS",
+ "BAS-DEC": "BYS-DEC",
+ "BAS-JAN": "BYS-JAN",
+ "BAS-FEB": "BYS-FEB",
+ "BAS-MAR": "BYS-MAR",
+ "BAS-APR": "BYS-APR",
+ "BAS-MAY": "BYS-MAY",
+ "BAS-JUN": "BYS-JUN",
+ "BAS-JUL": "BYS-JUL",
+ "BAS-AUG": "BYS-AUG",
+ "BAS-SEP": "BYS-SEP",
+ "BAS-OCT": "BYS-OCT",
+ "BAS-NOV": "BYS-NOV",
+ "H": "h",
+ "BH": "bh",
+ "CBH": "cbh",
+ "T": "min",
+ "t": "min",
+ "S": "s",
+ "L": "ms",
+ "l": "ms",
+ "U": "us",
+ "u": "us",
+ "N": "ns",
+ "n": "ns",
+}
+
class FreqGroup(Enum):
# Mirrors c_FreqGroup in the .pxd file
@@ -228,7 +457,7 @@ class Resolution(Enum):
@property
def attr_abbrev(self) -> str:
# string that we can pass to to_offset
- return _attrname_to_abbrevs[self.attrname]
+ return attrname_to_abbrevs[self.attrname]
@property
def attrname(self) -> str:
@@ -266,13 +495,25 @@ class Resolution(Enum):
Examples
--------
- >>> Resolution.get_reso_from_freqstr('H')
+ >>> Resolution.get_reso_from_freqstr('h')
- >>> Resolution.get_reso_from_freqstr('H') == Resolution.RESO_HR
+ >>> Resolution.get_reso_from_freqstr('h') == Resolution.RESO_HR
True
"""
+ cdef:
+ str abbrev
try:
+ if freq in c_DEPR_ABBREVS:
+ abbrev = c_DEPR_ABBREVS[freq]
+ warnings.warn(
+ f"\'{freq}\' is deprecated and will be removed in a future "
+ f"version. Please use \'{abbrev}\' "
+ "instead of \'{freq}\'.",
+ FutureWarning,
+ stacklevel=find_stack_level(),
+ )
+ freq = abbrev
attr_name = _abbrev_to_attrnames[freq]
except KeyError:
# For quarterly and yearly resolutions, we need to chop off
@@ -283,6 +524,16 @@ class Resolution(Enum):
if split_freq[1] not in _month_names:
# i.e. we want e.g. "Q-DEC", not "Q-INVALID"
raise
+ if split_freq[0] in c_DEPR_ABBREVS:
+ abbrev = c_DEPR_ABBREVS[split_freq[0]]
+ warnings.warn(
+ f"\'{split_freq[0]}\' is deprecated and will be removed in a "
+ f"future version. Please use \'{abbrev}\' "
+ f"instead of \'{split_freq[0]}\'.",
+ FutureWarning,
+ stacklevel=find_stack_level(),
+ )
+ split_freq[0] = abbrev
attr_name = _abbrev_to_attrnames[split_freq[0]]
return cls.from_attrname(attr_name)
@@ -308,7 +559,7 @@ class NpyDatetimeUnit(Enum):
NPY_FR_GENERIC = NPY_DATETIMEUNIT.NPY_FR_GENERIC
-cpdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso):
+cdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso):
# If we have an unsupported reso, return the nearest supported reso.
if reso == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
# TODO: or raise ValueError? trying this gives unraisable errors, but
@@ -321,7 +572,7 @@ cpdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso):
return reso
-cpdef bint is_supported_unit(NPY_DATETIMEUNIT reso):
+cdef bint is_supported_unit(NPY_DATETIMEUNIT reso):
return (
reso == NPY_DATETIMEUNIT.NPY_FR_ns
or reso == NPY_DATETIMEUNIT.NPY_FR_us
@@ -330,7 +581,7 @@ cpdef bint is_supported_unit(NPY_DATETIMEUNIT reso):
)
-cpdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit):
+cdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit):
if unit == NPY_DATETIMEUNIT.NPY_FR_ns or unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
# generic -> default to nanoseconds
return "ns"
@@ -425,7 +676,6 @@ cdef NPY_DATETIMEUNIT freq_group_code_to_npy_unit(int freq) noexcept nogil:
return NPY_DATETIMEUNIT.NPY_FR_D
-# TODO: use in _matplotlib.converter?
cpdef int64_t periods_per_day(
NPY_DATETIMEUNIT reso=NPY_DATETIMEUNIT.NPY_FR_ns
) except? -1:
diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx
index ad37add17967d..ff4fb4d635d17 100644
--- a/pandas/_libs/tslibs/fields.pyx
+++ b/pandas/_libs/tslibs/fields.pyx
@@ -253,8 +253,8 @@ def get_start_end_field(
# month of year. Other offsets use month, startingMonth as ending
# month of year.
- if (freqstr[0:2] in ["MS", "QS", "AS"]) or (
- freqstr[1:3] in ["MS", "QS", "AS"]):
+ if (freqstr[0:2] in ["MS", "QS", "YS"]) or (
+ freqstr[1:3] in ["MS", "QS", "YS"]):
end_month = 12 if month_kw == 1 else month_kw - 1
start_month = month_kw
else:
@@ -746,7 +746,31 @@ cdef ndarray[int64_t] _ceil_int64(const int64_t[:] values, int64_t unit):
cdef ndarray[int64_t] _rounddown_int64(values, int64_t unit):
- return _ceil_int64(values - unit // 2, unit)
+ cdef:
+ Py_ssize_t i, n = len(values)
+ ndarray[int64_t] result = np.empty(n, dtype="i8")
+ int64_t res, value, remainder, half
+
+ half = unit // 2
+
+ with cython.overflowcheck(True):
+ for i in range(n):
+ value = values[i]
+
+ if value == NPY_NAT:
+ res = NPY_NAT
+ else:
+ # This adjustment is the only difference between rounddown_int64
+ # and _ceil_int64
+ value = value - half
+ remainder = value % unit
+ if remainder == 0:
+ res = value
+ else:
+ res = value + (unit - remainder)
+
+ result[i] = res
+ return result
cdef ndarray[int64_t] _roundup_int64(values, int64_t unit):
diff --git a/pandas/_libs/tslibs/meson.build b/pandas/_libs/tslibs/meson.build
index 14d2eef46da20..85410f771233f 100644
--- a/pandas/_libs/tslibs/meson.build
+++ b/pandas/_libs/tslibs/meson.build
@@ -19,11 +19,20 @@ tslibs_sources = {
'vectorized': {'sources': ['vectorized.pyx']},
}
+cython_args = [
+ '--include-dir',
+ meson.current_build_dir(),
+ '-X always_allow_keywords=true'
+]
+if get_option('buildtype') == 'debug'
+ cython_args += ['--gdb']
+endif
+
foreach ext_name, ext_dict : tslibs_sources
py.extension_module(
ext_name,
ext_dict.get('sources'),
- cython_args: ['--include-dir', meson.current_build_dir(), '-X always_allow_keywords=true'],
+ cython_args: cython_args,
include_directories: [inc_np, inc_pd],
dependencies: ext_dict.get('deps', ''),
subdir: 'pandas/_libs/tslibs',
@@ -31,6 +40,28 @@ foreach ext_name, ext_dict : tslibs_sources
)
endforeach
-py.install_sources('__init__.py',
- pure: false,
- subdir: 'pandas/_libs/tslibs')
+sources_to_install = [
+ '__init__.py',
+ 'ccalendar.pyi',
+ 'conversion.pyi',
+ 'dtypes.pyi',
+ 'fields.pyi',
+ 'nattype.pyi',
+ 'np_datetime.pyi',
+ 'offsets.pyi',
+ 'parsing.pyi',
+ 'period.pyi',
+ 'strptime.pyi',
+ 'timedeltas.pyi',
+ 'timestamps.pyi',
+ 'timezones.pyi',
+ 'tzconversion.pyi',
+ 'vectorized.pyi'
+]
+
+foreach source: sources_to_install
+ py.install_sources(
+ source,
+ subdir: 'pandas/_libs/tslibs'
+ )
+endforeach
diff --git a/pandas/_libs/tslibs/nattype.pyi b/pandas/_libs/tslibs/nattype.pyi
index 437b5ab6676c8..bd86a6fdc2174 100644
--- a/pandas/_libs/tslibs/nattype.pyi
+++ b/pandas/_libs/tslibs/nattype.pyi
@@ -8,6 +8,7 @@ import typing
import numpy as np
from pandas._libs.tslibs.period import Period
+from pandas._typing import Self
NaT: NaTType
iNaT: int
@@ -132,4 +133,9 @@ class NaTType:
__le__: _NatComparison
__gt__: _NatComparison
__ge__: _NatComparison
+ def __sub__(self, other: Self | timedelta | datetime) -> Self: ...
+ def __rsub__(self, other: Self | timedelta | datetime) -> Self: ...
+ def __add__(self, other: Self | timedelta | datetime) -> Self: ...
+ def __radd__(self, other: Self | timedelta | datetime) -> Self: ...
+ def __hash__(self) -> int: ...
def as_unit(self, unit: str, round_ok: bool = ...) -> NaTType: ...
diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx
index 7d75fa3114d2b..65db0e05f859c 100644
--- a/pandas/_libs/tslibs/nattype.pyx
+++ b/pandas/_libs/tslibs/nattype.pyx
@@ -21,10 +21,6 @@ from numpy cimport int64_t
cnp.import_array()
cimport pandas._libs.tslibs.util as util
-from pandas._libs.tslibs.np_datetime cimport (
- get_datetime64_value,
- get_timedelta64_value,
-)
# ----------------------------------------------------------------------
# Constants
@@ -67,7 +63,7 @@ def _make_error_func(func_name: str, cls):
cdef _nat_divide_op(self, other):
- if PyDelta_Check(other) or util.is_timedelta64_object(other) or other is c_NaT:
+ if PyDelta_Check(other) or cnp.is_timedelta64_object(other) or other is c_NaT:
return np.nan
if util.is_integer_object(other) or util.is_float_object(other):
return c_NaT
@@ -95,11 +91,11 @@ cdef class _NaT(datetime):
__array_priority__ = 100
def __richcmp__(_NaT self, object other, int op):
- if util.is_datetime64_object(other) or PyDateTime_Check(other):
+ if cnp.is_datetime64_object(other) or PyDateTime_Check(other):
# We treat NaT as datetime-like for this comparison
return op == Py_NE
- elif util.is_timedelta64_object(other) or PyDelta_Check(other):
+ elif cnp.is_timedelta64_object(other) or PyDelta_Check(other):
# We treat NaT as timedelta-like for this comparison
return op == Py_NE
@@ -128,16 +124,11 @@ cdef class _NaT(datetime):
return NotImplemented
def __add__(self, other):
- if self is not c_NaT:
- # TODO(cython3): remove this it moved to __radd__
- # cython __radd__ semantics
- self, other = other, self
-
if PyDateTime_Check(other):
return c_NaT
elif PyDelta_Check(other):
return c_NaT
- elif util.is_datetime64_object(other) or util.is_timedelta64_object(other):
+ elif cnp.is_datetime64_object(other) or cnp.is_timedelta64_object(other):
return c_NaT
elif util.is_integer_object(other):
@@ -162,20 +153,12 @@ cdef class _NaT(datetime):
def __sub__(self, other):
# Duplicate some logic from _Timestamp.__sub__ to avoid needing
# to subclass; allows us to @final(_Timestamp.__sub__)
- cdef:
- bint is_rsub = False
-
- if self is not c_NaT:
- # cython __rsub__ semantics
- # TODO(cython3): remove __rsub__ logic from here
- self, other = other, self
- is_rsub = True
if PyDateTime_Check(other):
return c_NaT
elif PyDelta_Check(other):
return c_NaT
- elif util.is_datetime64_object(other) or util.is_timedelta64_object(other):
+ elif cnp.is_datetime64_object(other) or cnp.is_timedelta64_object(other):
return c_NaT
elif util.is_integer_object(other):
@@ -184,19 +167,9 @@ cdef class _NaT(datetime):
elif util.is_array(other):
if other.dtype.kind == "m":
- if not is_rsub:
- # NaT - timedelta64 we treat NaT as datetime64, so result
- # is datetime64
- result = np.empty(other.shape, dtype="datetime64[ns]")
- result.fill("NaT")
- return result
-
- # __rsub__ logic here
- # TODO(cython3): remove this, move above code out of
- # ``if not is_rsub`` block
- # timedelta64 - NaT we have to treat NaT as timedelta64
- # for this to be meaningful, and the result is timedelta64
- result = np.empty(other.shape, dtype="timedelta64[ns]")
+ # NaT - timedelta64 we treat NaT as datetime64, so result
+ # is datetime64
+ result = np.empty(other.shape, dtype="datetime64[ns]")
result.fill("NaT")
return result
@@ -1000,26 +973,26 @@ timedelta}, default 'raise'
A timestamp can be rounded using multiple frequency units:
- >>> ts.round(freq='H') # hour
+ >>> ts.round(freq='h') # hour
Timestamp('2020-03-14 16:00:00')
- >>> ts.round(freq='T') # minute
+ >>> ts.round(freq='min') # minute
Timestamp('2020-03-14 15:33:00')
- >>> ts.round(freq='S') # seconds
+ >>> ts.round(freq='s') # seconds
Timestamp('2020-03-14 15:32:52')
- >>> ts.round(freq='L') # milliseconds
+ >>> ts.round(freq='ms') # milliseconds
Timestamp('2020-03-14 15:32:52.193000')
- ``freq`` can also be a multiple of a single unit, like '5T' (i.e. 5 minutes):
+ ``freq`` can also be a multiple of a single unit, like '5min' (i.e. 5 minutes):
- >>> ts.round(freq='5T')
+ >>> ts.round(freq='5min')
Timestamp('2020-03-14 15:35:00')
- or a combination of multiple units, like '1H30T' (i.e. 1 hour and 30 minutes):
+ or a combination of multiple units, like '1h30min' (i.e. 1 hour and 30 minutes):
- >>> ts.round(freq='1H30T')
+ >>> ts.round(freq='1h30min')
Timestamp('2020-03-14 15:00:00')
Analogous for ``pd.NaT``:
@@ -1032,10 +1005,10 @@ timedelta}, default 'raise'
>>> ts_tz = pd.Timestamp("2021-10-31 01:30:00").tz_localize("Europe/Amsterdam")
- >>> ts_tz.round("H", ambiguous=False)
+ >>> ts_tz.round("h", ambiguous=False)
Timestamp('2021-10-31 02:00:00+0100', tz='Europe/Amsterdam')
- >>> ts_tz.round("H", ambiguous=True)
+ >>> ts_tz.round("h", ambiguous=True)
Timestamp('2021-10-31 02:00:00+0200', tz='Europe/Amsterdam')
""",
)
@@ -1089,26 +1062,26 @@ timedelta}, default 'raise'
A timestamp can be floored using multiple frequency units:
- >>> ts.floor(freq='H') # hour
+ >>> ts.floor(freq='h') # hour
Timestamp('2020-03-14 15:00:00')
- >>> ts.floor(freq='T') # minute
+ >>> ts.floor(freq='min') # minute
Timestamp('2020-03-14 15:32:00')
- >>> ts.floor(freq='S') # seconds
+ >>> ts.floor(freq='s') # seconds
Timestamp('2020-03-14 15:32:52')
- >>> ts.floor(freq='N') # nanoseconds
+ >>> ts.floor(freq='ns') # nanoseconds
Timestamp('2020-03-14 15:32:52.192548651')
- ``freq`` can also be a multiple of a single unit, like '5T' (i.e. 5 minutes):
+ ``freq`` can also be a multiple of a single unit, like '5min' (i.e. 5 minutes):
- >>> ts.floor(freq='5T')
+ >>> ts.floor(freq='5min')
Timestamp('2020-03-14 15:30:00')
- or a combination of multiple units, like '1H30T' (i.e. 1 hour and 30 minutes):
+ or a combination of multiple units, like '1h30min' (i.e. 1 hour and 30 minutes):
- >>> ts.floor(freq='1H30T')
+ >>> ts.floor(freq='1h30min')
Timestamp('2020-03-14 15:00:00')
Analogous for ``pd.NaT``:
@@ -1121,10 +1094,10 @@ timedelta}, default 'raise'
>>> ts_tz = pd.Timestamp("2021-10-31 03:30:00").tz_localize("Europe/Amsterdam")
- >>> ts_tz.floor("2H", ambiguous=False)
+ >>> ts_tz.floor("2h", ambiguous=False)
Timestamp('2021-10-31 02:00:00+0100', tz='Europe/Amsterdam')
- >>> ts_tz.floor("2H", ambiguous=True)
+ >>> ts_tz.floor("2h", ambiguous=True)
Timestamp('2021-10-31 02:00:00+0200', tz='Europe/Amsterdam')
""",
)
@@ -1178,26 +1151,26 @@ timedelta}, default 'raise'
A timestamp can be ceiled using multiple frequency units:
- >>> ts.ceil(freq='H') # hour
+ >>> ts.ceil(freq='h') # hour
Timestamp('2020-03-14 16:00:00')
- >>> ts.ceil(freq='T') # minute
+ >>> ts.ceil(freq='min') # minute
Timestamp('2020-03-14 15:33:00')
- >>> ts.ceil(freq='S') # seconds
+ >>> ts.ceil(freq='s') # seconds
Timestamp('2020-03-14 15:32:53')
- >>> ts.ceil(freq='U') # microseconds
+ >>> ts.ceil(freq='us') # microseconds
Timestamp('2020-03-14 15:32:52.192549')
- ``freq`` can also be a multiple of a single unit, like '5T' (i.e. 5 minutes):
+ ``freq`` can also be a multiple of a single unit, like '5min' (i.e. 5 minutes):
- >>> ts.ceil(freq='5T')
+ >>> ts.ceil(freq='5min')
Timestamp('2020-03-14 15:35:00')
- or a combination of multiple units, like '1H30T' (i.e. 1 hour and 30 minutes):
+ or a combination of multiple units, like '1h30min' (i.e. 1 hour and 30 minutes):
- >>> ts.ceil(freq='1H30T')
+ >>> ts.ceil(freq='1h30min')
Timestamp('2020-03-14 16:30:00')
Analogous for ``pd.NaT``:
@@ -1210,10 +1183,10 @@ timedelta}, default 'raise'
>>> ts_tz = pd.Timestamp("2021-10-31 01:30:00").tz_localize("Europe/Amsterdam")
- >>> ts_tz.ceil("H", ambiguous=False)
+ >>> ts_tz.ceil("h", ambiguous=False)
Timestamp('2021-10-31 02:00:00+0100', tz='Europe/Amsterdam')
- >>> ts_tz.ceil("H", ambiguous=True)
+ >>> ts_tz.ceil("h", ambiguous=True)
Timestamp('2021-10-31 02:00:00+0200', tz='Europe/Amsterdam')
""",
)
@@ -1438,8 +1411,8 @@ cdef bint is_dt64nat(object val):
"""
Is this a np.datetime64 object np.datetime64("NaT").
"""
- if util.is_datetime64_object(val):
- return get_datetime64_value(val) == NPY_NAT
+ if cnp.is_datetime64_object(val):
+ return cnp.get_datetime64_value(val) == NPY_NAT
return False
@@ -1447,6 +1420,6 @@ cdef bint is_td64nat(object val):
"""
Is this a np.timedelta64 object np.timedelta64("NaT").
"""
- if util.is_timedelta64_object(val):
- return get_timedelta64_value(val) == NPY_NAT
+ if cnp.is_timedelta64_object(val):
+ return cnp.get_timedelta64_value(val) == NPY_NAT
return False
diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd
index 60532174e8bdc..cb2658d343772 100644
--- a/pandas/_libs/tslibs/np_datetime.pxd
+++ b/pandas/_libs/tslibs/np_datetime.pxd
@@ -6,30 +6,12 @@ from cpython.datetime cimport (
from numpy cimport (
int32_t,
int64_t,
+ npy_datetime,
+ npy_timedelta,
)
# TODO(cython3): most of these can be cimported directly from numpy
-cdef extern from "numpy/ndarrayobject.h":
- ctypedef int64_t npy_timedelta
- ctypedef int64_t npy_datetime
-
-cdef extern from "numpy/ndarraytypes.h":
- ctypedef struct PyArray_DatetimeMetaData:
- NPY_DATETIMEUNIT base
- int64_t num
-
-cdef extern from "numpy/arrayscalars.h":
- ctypedef struct PyDatetimeScalarObject:
- # PyObject_HEAD
- npy_datetime obval
- PyArray_DatetimeMetaData obmeta
-
- ctypedef struct PyTimedeltaScalarObject:
- # PyObject_HEAD
- npy_timedelta obval
- PyArray_DatetimeMetaData obmeta
-
cdef extern from "numpy/ndarraytypes.h":
ctypedef struct npy_datetimestruct:
int64_t year
@@ -65,7 +47,7 @@ cdef extern from "pandas/datetime/pd_datetime.h":
npy_datetimestruct *result) nogil
npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT fr,
- npy_datetimestruct *d) nogil
+ npy_datetimestruct *d) except? -1 nogil
void pandas_timedelta_to_timedeltastruct(npy_timedelta val,
NPY_DATETIMEUNIT fr,
@@ -85,19 +67,19 @@ cdef inline void import_pandas_datetime() noexcept:
cdef bint cmp_scalar(int64_t lhs, int64_t rhs, int op) except -1
+cdef str dts_to_iso_string(npy_datetimestruct *dts)
+
cdef check_dts_bounds(npy_datetimestruct *dts, NPY_DATETIMEUNIT unit=?)
cdef int64_t pydatetime_to_dt64(
datetime val, npy_datetimestruct *dts, NPY_DATETIMEUNIT reso=?
-)
+) except? -1
cdef void pydatetime_to_dtstruct(datetime dt, npy_datetimestruct *dts) noexcept
cdef int64_t pydate_to_dt64(
date val, npy_datetimestruct *dts, NPY_DATETIMEUNIT reso=?
-)
+) except? -1
cdef void pydate_to_dtstruct(date val, npy_datetimestruct *dts) noexcept
-cdef npy_datetime get_datetime64_value(object obj) noexcept nogil
-cdef npy_timedelta get_timedelta64_value(object obj) noexcept nogil
cdef NPY_DATETIMEUNIT get_datetime64_unit(object obj) noexcept nogil
cdef int string_to_dts(
@@ -136,3 +118,5 @@ cdef int64_t convert_reso(
NPY_DATETIMEUNIT to_reso,
bint round_ok,
) except? -1
+
+cpdef cnp.ndarray add_overflowsafe(cnp.ndarray left, cnp.ndarray right)
diff --git a/pandas/_libs/tslibs/np_datetime.pyi b/pandas/_libs/tslibs/np_datetime.pyi
index 0cb0e3b0237d7..00ef35c50e532 100644
--- a/pandas/_libs/tslibs/np_datetime.pyi
+++ b/pandas/_libs/tslibs/np_datetime.pyi
@@ -9,7 +9,7 @@ class OutOfBoundsTimedelta(ValueError): ...
def py_get_unit_from_dtype(dtype: np.dtype): ...
def py_td64_to_tdstruct(td64: int, unit: int) -> dict: ...
def astype_overflowsafe(
- arr: np.ndarray,
+ values: np.ndarray,
dtype: np.dtype,
copy: bool = ...,
round_ok: bool = ...,
@@ -19,3 +19,9 @@ def is_unitless(dtype: np.dtype) -> bool: ...
def compare_mismatched_resolutions(
left: np.ndarray, right: np.ndarray, op
) -> npt.NDArray[np.bool_]: ...
+def add_overflowsafe(
+ left: npt.NDArray[np.int64],
+ right: npt.NDArray[np.int64],
+) -> npt.NDArray[np.int64]: ...
+def get_supported_dtype(dtype: np.dtype) -> np.dtype: ...
+def is_supported_dtype(dtype: np.dtype) -> bool: ...
diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx
index 7b2ee68c73ad2..aa01a05d0d932 100644
--- a/pandas/_libs/tslibs/np_datetime.pyx
+++ b/pandas/_libs/tslibs/np_datetime.pyx
@@ -18,21 +18,32 @@ from cpython.object cimport (
Py_LT,
Py_NE,
)
+from libc.stdint cimport INT64_MAX
import_datetime()
PandasDateTime_IMPORT
+import operator
+
import numpy as np
cimport numpy as cnp
cnp.import_array()
from numpy cimport (
+ PyArray_DatetimeMetaData,
+ PyDatetimeScalarObject,
int64_t,
ndarray,
uint8_t,
)
+from pandas._libs.tslibs.dtypes cimport (
+ get_supported_reso,
+ is_supported_unit,
+ npy_unit_to_abbrev,
+ npy_unit_to_attrname,
+)
from pandas._libs.tslibs.util cimport get_c_string_buf_and_size
@@ -59,22 +70,6 @@ cdef extern from "pandas/datetime/pd_datetime.h":
# ----------------------------------------------------------------------
# numpy object inspection
-cdef npy_datetime get_datetime64_value(object obj) noexcept nogil:
- """
- returns the int64 value underlying scalar numpy datetime64 object
-
- Note that to interpret this as a datetime, the corresponding unit is
- also needed. That can be found using `get_datetime64_unit`.
- """
- return (<PyDatetimeScalarObject*>obj).obval
-
-
-cdef npy_timedelta get_timedelta64_value(object obj) noexcept nogil:
- """
- returns the int64 value underlying scalar numpy timedelta64 object
- """
- return (