diff --git a/.circleci/config.yml b/.circleci/config.yml
index 6f134c9a7a7bd..463667446ed42 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -1,46 +1,64 @@
 version: 2.1
 jobs:
-  test-arm:
+  test-linux-arm:
     machine:
       image: default
     resource_class: arm.large
     environment:
-      ENV_FILE: ci/deps/circle-310-arm64.yaml
+      ENV_FILE: ci/deps/circle-311-arm64.yaml
       PYTEST_WORKERS: auto
       PATTERN: "not single_cpu and not slow and not network and not clipboard and not arm_slow and not db"
       PYTEST_TARGET: "pandas"
       PANDAS_CI: "1"
     steps:
       - checkout
-      - run: .circleci/setup_env.sh
-      - run: >
-          PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH
-          LD_PRELOAD=$HOME/miniconda3/envs/pandas-dev/lib/libgomp.so.1:$LD_PRELOAD
-          ci/run_tests.sh
-  linux-musl:
+      - run:
+          name: Install Environment and Run Tests
+          shell: /bin/bash -exuo pipefail
+          command: |
+            MAMBA_URL="https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Mambaforge-24.3.0-0-Linux-aarch64.sh"
+            wget -q $MAMBA_URL -O minimamba.sh
+            chmod +x minimamba.sh
+            MAMBA_DIR="$HOME/miniconda3"
+            rm -rf $MAMBA_DIR
+            ./minimamba.sh -b -p $MAMBA_DIR
+            export PATH=$MAMBA_DIR/bin:$PATH
+            conda info -a
+            conda env create -q -n pandas-dev -f $ENV_FILE
+            conda list -n pandas-dev
+            source activate pandas-dev
+            if pip show pandas 1>/dev/null; then
+                pip uninstall -y pandas
+            fi
+            python -m pip install --no-build-isolation -ve . --config-settings=setup-args="--werror"
+            PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH
+            ci/run_tests.sh
+  test-linux-musl:
     docker:
       - image: quay.io/pypa/musllinux_1_1_aarch64
     resource_class: arm.large
     steps:
       # Install pkgs first to have git in the image
       # (needed for checkout)
-      - run: |
-          apk update
-          apk add git
-          apk add musl-locales
+      - run:
+          name: Install System Packages
+          command: |
+            apk update
+            apk add git
+            apk add musl-locales
       - checkout
-      - run: |
-          /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev
-          . ~/virtualenvs/pandas-dev/bin/activate
-          python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1
-          python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1
-          python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror"
-          python -m pip list --no-cache-dir
-      - run: |
-          . ~/virtualenvs/pandas-dev/bin/activate
-          export PANDAS_CI=1
-          python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml
+      - run:
+          name: Install Environment and Run Tests
+          command: |
+            /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev
+            . ~/virtualenvs/pandas-dev/bin/activate
+            python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1
+            python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1
+            python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror"
+            python -m pip list --no-cache-dir
+            export PANDAS_CI=1
+            python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml
   build-aarch64:
     parameters:
       cibw-build:
@@ -71,7 +89,7 @@
           name: Build aarch64 wheels
           no_output_timeout: 30m  # Sometimes the tests won't generate any output, make sure the job doesn't get killed by that
          command: |
-            pip3 install cibuildwheel==2.15.0
+            pip3 install cibuildwheel==2.18.1
             cibuildwheel --prerelease-pythons --output-dir wheelhouse
 
     environment:
@@ -81,7 +99,7 @@
           name: Install Anaconda Client & Upload Wheels
           command: |
             echo "Install Mambaforge"
-            MAMBA_URL="https://github.com/conda-forge/miniforge/releases/download/23.1.0-0/Mambaforge-23.1.0-0-Linux-aarch64.sh"
+            MAMBA_URL="https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Mambaforge-24.3.0-0-Linux-aarch64.sh"
             echo "Downloading $MAMBA_URL"
             wget -q $MAMBA_URL -O minimamba.sh
             chmod +x minimamba.sh
@@ -107,14 +125,14 @@ workflows:
       not:
         equal: [ scheduled_pipeline, << pipeline.trigger_source >> ]
     jobs:
-      - test-arm
+      - test-linux-arm
   test-musl:
     # Don't run trigger this one when scheduled pipeline runs
     when:
       not:
         equal: [ scheduled_pipeline, << pipeline.trigger_source >> ]
     jobs:
-      - linux-musl
+      - test-linux-musl
   build-wheels:
     jobs:
       - build-aarch64:
diff --git a/.circleci/setup_env.sh b/.circleci/setup_env.sh
deleted file mode 100755
index eef4db1191a9a..0000000000000
--- a/.circleci/setup_env.sh
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/bin/bash -e
-
-echo "Install Mambaforge"
-MAMBA_URL="https://github.com/conda-forge/miniforge/releases/download/23.1.0-0/Mambaforge-23.1.0-0-Linux-aarch64.sh"
-echo "Downloading $MAMBA_URL"
-wget -q $MAMBA_URL -O minimamba.sh
-chmod +x minimamba.sh
-
-MAMBA_DIR="$HOME/miniconda3"
-rm -rf $MAMBA_DIR
-./minimamba.sh -b -p $MAMBA_DIR
-
-export PATH=$MAMBA_DIR/bin:$PATH
-
-echo
-echo "which conda"
-which conda
-
-echo
-echo "update conda"
-conda config --set ssl_verify false
-conda config --set quiet true --set always_yes true --set changeps1 false
-mamba install -y -c conda-forge -n base pip setuptools
-
-echo "conda info -a"
-conda info -a
-
-echo "conda list (root environment)"
-conda list
-
-echo
-# Clean up any left-over from a previous build
-mamba env remove -n pandas-dev
-echo "mamba env update --file=${ENV_FILE}"
-# See https://github.com/mamba-org/mamba/issues/633
-mamba create -q -n pandas-dev
-time mamba env update -n pandas-dev --file="${ENV_FILE}"
-
-echo "conda list -n pandas-dev"
-conda list -n pandas-dev
-
-echo "activate pandas-dev"
-source activate pandas-dev
-
-# Explicitly set an environment variable indicating that this is pandas' CI environment.
-#
-# This allows us to enable things like -Werror that shouldn't be activated in
-# downstream CI jobs that may also build pandas from source.
-export PANDAS_CI=1
-
-if pip show pandas 1>/dev/null; then
-  echo
-  echo "remove any installed pandas package w/o removing anything else"
-  pip uninstall -y pandas
-fi
-
-echo "Install pandas"
-python -m pip install --no-build-isolation -ve . --config-settings=setup-args="--werror"
-
-echo "done"
diff --git a/.github/workflows/cache-cleanup-weekly.yml b/.github/workflows/cache-cleanup-daily.yml
similarity index 90%
rename from .github/workflows/cache-cleanup-weekly.yml
rename to .github/workflows/cache-cleanup-daily.yml
index 6da31f7354457..8eadfb2ccd2a9 100644
--- a/.github/workflows/cache-cleanup-weekly.yml
+++ b/.github/workflows/cache-cleanup-daily.yml
@@ -1,8 +1,8 @@
-name: Purge caches once a week
+name: Purge caches daily
 on:
   schedule:
-    # 4:10 UTC on Sunday
-    - cron: "10 4 * * 0"
+    # 4:10 UTC daily
+    - cron: "10 4 * * *"
 
 jobs:
   cleanup:
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index f93950224eaae..1b88d4d90d3e1 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -314,7 +314,7 @@
     timeout-minutes: 90
 
     concurrency:
-      #https://github.community/t/concurrecy-not-work-for-push/183068/7
+      # https://github.community/t/concurrecy-not-work-for-push/183068/7
       group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.os }}-${{ matrix.pytest_target }}-dev
       cancel-in-progress: true
@@ -346,3 +346,62 @@
     - name: Run Tests
       uses: ./.github/actions/run-tests
+
+  emscripten:
+    # Note: the Python version, Emscripten toolchain version are determined
+    # by the Pyodide version. The appropriate versions can be found in the
+    # Pyodide repodata.json "info" field, or in the Makefile.envs file:
+    # https://github.com/pyodide/pyodide/blob/stable/Makefile.envs#L2
+    # The Node.js version can be determined via Pyodide:
+    # https://pyodide.org/en/stable/usage/index.html#node-js
+    name: Pyodide build
+    runs-on: ubuntu-22.04
+    concurrency:
+      # https://github.community/t/concurrecy-not-work-for-push/183068/7
+      group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-wasm
+      cancel-in-progress: true
+    steps:
+      - name: Checkout pandas Repo
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Set up Python for Pyodide
+        id: setup-python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11.3'
+
+      - name: Set up Emscripten toolchain
+        uses: mymindstorm/setup-emsdk@v14
+        with:
+          version: '3.1.46'
+          actions-cache-folder: emsdk-cache
+
+      - name: Install pyodide-build
+        run: pip install "pyodide-build==0.25.1"
+
+      - name: Build pandas for Pyodide
+        run: |
+          pyodide build
+
+      - name: Set up Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: '18'
+
+      - name: Set up Pyodide virtual environment
+        run: |
+          pyodide venv .venv-pyodide
+          source .venv-pyodide/bin/activate
+          pip install dist/*.whl
+
+      - name: Test pandas for Pyodide
+        env:
+          PANDAS_CI: 1
+        run: |
+          source .venv-pyodide/bin/activate
+          pip install pytest hypothesis
+          # do not import pandas from the checked out repo
+          cd ..
+          python -c 'import pandas as pd; pd.test(extra_args=["-m not clipboard and not single_cpu and not slow and not network and not db"])'
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index 4bd9068e91b67..4b34d2b21495b 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -140,7 +140,7 @@ jobs:
         run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV"
 
       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.17.0
+        uses: pypa/cibuildwheel@v2.19.1
         with:
           package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }}
         env:
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 43fa49ed2b6bc..bf88500b10524 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -19,7 +19,7 @@ ci:
     skip: [pyright, mypy]
 repos:
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.3.4
+    rev: v0.4.7
     hooks:
     -   id: ruff
         args: [--exit-non-zero-on-fix]
@@ -40,18 +40,18 @@ repos:
         pass_filenames: true
         require_serial: false
 -   repo: https://github.com/codespell-project/codespell
-    rev: v2.2.6
+    rev: v2.3.0
     hooks:
     -   id: codespell
         types_or: [python, rst, markdown, cython, c]
         additional_dependencies: [tomli]
 -   repo: https://github.com/MarcoGorelli/cython-lint
-    rev: v0.16.0
+    rev: v0.16.2
     hooks:
     -   id: cython-lint
     -   id: double-quote-cython-strings
 -   repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+    rev: v4.6.0
     hooks:
     -   id: check-case-conflict
     -   id: check-toml
@@ -90,8 +90,9 @@ repos:
     rev: v0.9.1
     hooks:
     -   id: sphinx-lint
+        args: ["--enable", "all", "--disable", "line-too-long"]
 -   repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: v18.1.2
+    rev: v18.1.5
     hooks:
     -   id: clang-format
         files: ^pandas/_libs/src|^pandas/_libs/include
diff --git a/LICENSES/XARRAY_LICENSE b/LICENSES/XARRAY_LICENSE
deleted file mode 100644
index 8405e89a0b120..0000000000000
--- a/LICENSES/XARRAY_LICENSE
+++ /dev/null
@@ -1,191 +0,0 @@
-Apache License
-Version 2.0, January 2004
-http://www.apache.org/licenses/
-
-TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-1. Definitions.
-
-"License" shall mean the terms and conditions for use, reproduction, and
-distribution as defined by Sections 1 through 9 of this document.
-
-"Licensor" shall mean the copyright owner or entity authorized by the copyright
-owner that is granting the License.
-
-"Legal Entity" shall mean the union of the acting entity and all other entities
-that control, are controlled by, or are under common control with that entity.
-For the purposes of this definition, "control" means (i) the power, direct or
-indirect, to cause the direction or management of such entity, whether by
-contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the
-outstanding shares, or (iii) beneficial ownership of such entity.
-
-"You" (or "Your") shall mean an individual or Legal Entity exercising
-permissions granted by this License.
-
-"Source" form shall mean the preferred form for making modifications, including
-but not limited to software source code, documentation source, and configuration
-files.
-
-"Object" form shall mean any form resulting from mechanical transformation or
-translation of a Source form, including but not limited to compiled object code,
-generated documentation, and conversions to other media types.
-
-"Work" shall mean the work of authorship, whether in Source or Object form, made
-available under the License, as indicated by a copyright notice that is included
-in or attached to the work (an example is provided in the Appendix below).
-
-"Derivative Works" shall mean any work, whether in Source or Object form, that
-is based on (or derived from) the Work and for which the editorial revisions,
-annotations, elaborations, or other modifications represent, as a whole, an
-original work of authorship. For the purposes of this License, Derivative Works
-shall not include works that remain separable from, or merely link (or bind by
-name) to the interfaces of, the Work and Derivative Works thereof.
-
-"Contribution" shall mean any work of authorship, including the original version
-of the Work and any modifications or additions to that Work or Derivative Works
-thereof, that is intentionally submitted to Licensor for inclusion in the Work
-by the copyright owner or by an individual or Legal Entity authorized to submit
-on behalf of the copyright owner. For the purposes of this definition,
-"submitted" means any form of electronic, verbal, or written communication sent
-to the Licensor or its representatives, including but not limited to
-communication on electronic mailing lists, source code control systems, and
-issue tracking systems that are managed by, or on behalf of, the Licensor for
-the purpose of discussing and improving the Work, but excluding communication
-that is conspicuously marked or otherwise designated in writing by the copyright
-owner as "Not a Contribution."
-
-"Contributor" shall mean Licensor and any individual or Legal Entity on behalf
-of whom a Contribution has been received by Licensor and subsequently
-incorporated within the Work.
-
-2. Grant of Copyright License.
-
-Subject to the terms and conditions of this License, each Contributor hereby
-grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
-irrevocable copyright license to reproduce, prepare Derivative Works of,
-publicly display, publicly perform, sublicense, and distribute the Work and such
-Derivative Works in Source or Object form.
-
-3. Grant of Patent License.
-
-Subject to the terms and conditions of this License, each Contributor hereby
-grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
-irrevocable (except as stated in this section) patent license to make, have
-made, use, offer to sell, sell, import, and otherwise transfer the Work, where
-such license applies only to those patent claims licensable by such Contributor
-that are necessarily infringed by their Contribution(s) alone or by combination
-of their Contribution(s) with the Work to which such Contribution(s) was
-submitted. If You institute patent litigation against any entity (including a
-cross-claim or counterclaim in a lawsuit) alleging that the Work or a
-Contribution incorporated within the Work constitutes direct or contributory
-patent infringement, then any patent licenses granted to You under this License
-for that Work shall terminate as of the date such litigation is filed.
-
-4. Redistribution.
-
-You may reproduce and distribute copies of the Work or Derivative Works thereof
-in any medium, with or without modifications, and in Source or Object form,
-provided that You meet the following conditions:
-
-You must give any other recipients of the Work or Derivative Works a copy of
-this License; and
-You must cause any modified files to carry prominent notices stating that You
-changed the files; and
-You must retain, in the Source form of any Derivative Works that You distribute,
-all copyright, patent, trademark, and attribution notices from the Source form
-of the Work, excluding those notices that do not pertain to any part of the
-Derivative Works; and
-If the Work includes a "NOTICE" text file as part of its distribution, then any
-Derivative Works that You distribute must include a readable copy of the
-attribution notices contained within such NOTICE file, excluding those notices
-that do not pertain to any part of the Derivative Works, in at least one of the
-following places: within a NOTICE text file distributed as part of the
-Derivative Works; within the Source form or documentation, if provided along
-with the Derivative Works; or, within a display generated by the Derivative
-Works, if and wherever such third-party notices normally appear. The contents of
-the NOTICE file are for informational purposes only and do not modify the
-License. You may add Your own attribution notices within Derivative Works that
-You distribute, alongside or as an addendum to the NOTICE text from the Work,
-provided that such additional attribution notices cannot be construed as
-modifying the License.
-You may add Your own copyright statement to Your modifications and may provide
-additional or different license terms and conditions for use, reproduction, or
-distribution of Your modifications, or for any such Derivative Works as a whole,
-provided Your use, reproduction, and distribution of the Work otherwise complies
-with the conditions stated in this License.
-
-5. Submission of Contributions.
-
-Unless You explicitly state otherwise, any Contribution intentionally submitted
-for inclusion in the Work by You to the Licensor shall be under the terms and
-conditions of this License, without any additional terms or conditions.
-Notwithstanding the above, nothing herein shall supersede or modify the terms of
-any separate license agreement you may have executed with Licensor regarding
-such Contributions.
-
-6. Trademarks.
-
-This License does not grant permission to use the trade names, trademarks,
-service marks, or product names of the Licensor, except as required for
-reasonable and customary use in describing the origin of the Work and
-reproducing the content of the NOTICE file.
-
-7. Disclaimer of Warranty.
-
-Unless required by applicable law or agreed to in writing, Licensor provides the
-Work (and each Contributor provides its Contributions) on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
-including, without limitation, any warranties or conditions of TITLE,
-NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are
-solely responsible for determining the appropriateness of using or
-redistributing the Work and assume any risks associated with Your exercise of
-permissions under this License.
-
-8. Limitation of Liability.
-
-In no event and under no legal theory, whether in tort (including negligence),
-contract, or otherwise, unless required by applicable law (such as deliberate
-and grossly negligent acts) or agreed to in writing, shall any Contributor be
-liable to You for damages, including any direct, indirect, special, incidental,
-or consequential damages of any character arising as a result of this License or
-out of the use or inability to use the Work (including but not limited to
-damages for loss of goodwill, work stoppage, computer failure or malfunction, or
-any and all other commercial damages or losses), even if such Contributor has
-been advised of the possibility of such damages.
-
-9. Accepting Warranty or Additional Liability.
-
-While redistributing the Work or Derivative Works thereof, You may choose to
-offer, and charge a fee for, acceptance of support, warranty, indemnity, or
-other liability obligations and/or rights consistent with this License. However,
-in accepting such obligations, You may act only on Your own behalf and on Your
-sole responsibility, not on behalf of any other Contributor, and only if You
-agree to indemnify, defend, and hold each Contributor harmless for any liability
-incurred by, or claims asserted against, such Contributor by reason of your
-accepting any such warranty or additional liability.
-
-END OF TERMS AND CONDITIONS
-
-APPENDIX: How to apply the Apache License to your work
-
-To apply the Apache License to your work, attach the following boilerplate
-notice, with the fields enclosed by brackets "[]" replaced with your own
-identifying information. (Don't include the brackets!) The text should be
-enclosed in the appropriate comment syntax for the file format. We also
-recommend that a file or class name and description of purpose be included on
-the same "printed page" as the copyright notice for easier identification within
-third-party archives.
-
-    Copyright [yyyy] [name of copyright owner]
-
-    Licensed under the Apache License, Version 2.0 (the "License");
-    you may not use this file except in compliance with the License.
-    You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software
-    distributed under the License is distributed on an "AS IS" BASIS,
-    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-    See the License for the specific language governing permissions and
-    limitations under the License.
\ No newline at end of file
diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
index dae6107db4d92..ff0ccffced0f3 100644
--- a/asv_bench/benchmarks/io/csv.py
+++ b/asv_bench/benchmarks/io/csv.py
@@ -445,16 +445,6 @@ def setup(self, engine):
         data = data.format(*two_cols)
         self.StringIO_input = StringIO(data)
 
-    def time_multiple_date(self, engine):
-        read_csv(
-            self.data(self.StringIO_input),
-            engine=engine,
-            sep=",",
-            header=None,
-            names=list(string.digits[:9]),
-            parse_dates=[[1, 2], [1, 3]],
-        )
-
     def time_baseline(self, engine):
         read_csv(
             self.data(self.StringIO_input),
diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py
index 1078837a8e395..d3fd5075a4707 100644
--- a/asv_bench/benchmarks/io/parsers.py
+++ b/asv_bench/benchmarks/io/parsers.py
@@ -1,10 +1,5 @@
-import numpy as np
-
 try:
-    from pandas._libs.tslibs.parsing import (
-        _does_string_look_like_datetime,
-        concat_date_cols,
-    )
+    from pandas._libs.tslibs.parsing import _does_string_look_like_datetime
 except ImportError:
     # Avoid whole benchmark suite import failure on asv (currently 0.4)
     pass
@@ -20,21 +15,3 @@ def setup(self, value):
     def time_check_datetimes(self, value):
         for obj in self.objects:
             _does_string_look_like_datetime(obj)
-
-
-class ConcatDateCols:
-    params = ([1234567890, "AAAA"], [1, 2])
-    param_names = ["value", "dim"]
-
-    def setup(self, value, dim):
-        count_elem = 10000
-        if dim == 1:
-            self.object = (np.array([value] * count_elem),)
-        if dim == 2:
-            self.object = (
-                np.array([value] * count_elem),
-                np.array([value] * count_elem),
-            )
-
-    def time_check_concat(self, value, dim):
-        concat_date_cols(self.object)
diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
index b021af4694d7d..85d34cac5a7bf 100644
--- a/asv_bench/benchmarks/series_methods.py
+++ b/asv_bench/benchmarks/series_methods.py
@@ -148,10 +148,14 @@ def time_searchsorted(self, dtype):
 
 
 class Map:
-    params = (["dict", "Series", "lambda"], ["object", "category", "int"])
-    param_names = "mapper"
-
-    def setup(self, mapper, dtype):
+    params = (
+        ["dict", "Series", "lambda"],
+        ["object", "category", "int"],
+        [None, "ignore"],
+    )
+    param_names = ["mapper", "dtype", "na_action"]
+
+    def setup(self, mapper, dtype, na_action):
         map_size = 1000
         map_data = Series(map_size - np.arange(map_size), dtype=dtype)
 
@@ -168,8 +172,8 @@ def setup(self, mapper, dtype):
 
         self.s = Series(np.random.randint(0, map_size, 10000), dtype=dtype)
 
-    def time_map(self, mapper, *args, **kwargs):
-        self.s.map(self.map_data)
+    def time_map(self, mapper, dtype, na_action):
+        self.s.map(self.map_data, na_action=na_action)
 
 
 class Clip:
diff --git a/asv_bench/benchmarks/tslibs/fields.py b/asv_bench/benchmarks/tslibs/fields.py
index 3a2baec54109a..fe31879e67a67 100644
--- a/asv_bench/benchmarks/tslibs/fields.py
+++ b/asv_bench/benchmarks/tslibs/fields.py
@@ -19,10 +19,15 @@ class TimeGetTimedeltaField:
     def setup(self, size, field):
         arr = np.random.randint(0, 10, size=size, dtype="i8")
         self.i8data = arr
+        arr = np.random.randint(-86400 * 1_000_000_000, 0, size=size, dtype="i8")
+        self.i8data_negative = arr
 
     def time_get_timedelta_field(self, size, field):
         get_timedelta_field(self.i8data, field)
 
+    def time_get_timedelta_field_negative_td(self, size, field):
+        get_timedelta_field(self.i8data_negative, field)
+
 
 class TimeGetDateField:
     params = [
@@ -72,3 +77,6 @@ def setup(self, size, side, period, freqstr, month_kw):
 
     def time_get_start_end_field(self, size, side, period, freqstr, month_kw):
         get_start_end_field(self.i8data, self.attrname, freqstr, month_kw=month_kw)
+
+
+from ..pandas_vb_common import setup  # noqa: F401 isort:skip
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index cde9f9dd43280..c4f143e9be2f4 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -71,71 +71,32 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
     -i ES01 `# For now it is ok if docstrings are missing the extended summary` \
     -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \
     -i "pandas.DataFrame.max RT03" \
-    -i "pandas.DataFrame.mean RT03,SA01" \
-    -i "pandas.DataFrame.median RT03,SA01" \
+    -i "pandas.DataFrame.mean RT03" \
+    -i "pandas.DataFrame.median RT03" \
     -i "pandas.DataFrame.min RT03" \
-    -i "pandas.DataFrame.plot PR02,SA01" \
-    -i "pandas.DataFrame.sem PR01,RT03,SA01" \
-    -i "pandas.DataFrame.std PR01,RT03,SA01" \
-    -i "pandas.DataFrame.sum RT03" \
-    -i "pandas.DataFrame.swaplevel SA01" \
-    -i "pandas.DataFrame.to_markdown SA01" \
-    -i "pandas.DataFrame.var PR01,RT03,SA01" \
+    -i "pandas.DataFrame.plot PR02" \
     -i "pandas.Grouper PR02" \
-    -i "pandas.Index PR07" \
-    -i "pandas.Index.get_loc PR07,RT03,SA01" \
-    -i "pandas.Index.join PR07,RT03,SA01" \
-    -i "pandas.Index.names GL08" \
-    -i "pandas.Index.ravel PR01,RT03" \
-    -i "pandas.Index.str PR01,SA01" \
-    -i "pandas.Interval PR02" \
-    -i "pandas.Interval.closed SA01" \
-    -i "pandas.Interval.left SA01" \
-    -i "pandas.Interval.mid SA01" \
-    -i "pandas.Interval.right SA01" \
-    -i "pandas.IntervalDtype PR01,SA01" \
-    -i "pandas.IntervalDtype.subtype SA01" \
-    -i "pandas.IntervalIndex.closed SA01" \
-    -i "pandas.IntervalIndex.contains RT03" \
-    -i "pandas.IntervalIndex.get_loc PR07,RT03,SA01" \
-    -i "pandas.IntervalIndex.is_non_overlapping_monotonic SA01" \
-    -i "pandas.IntervalIndex.left GL08" \
-    -i "pandas.IntervalIndex.length GL08" \
-    -i "pandas.IntervalIndex.mid GL08" \
-    -i "pandas.IntervalIndex.right GL08" \
-    -i "pandas.IntervalIndex.set_closed RT03,SA01" \
-    -i "pandas.IntervalIndex.to_tuples RT03,SA01" \
-    -i "pandas.MultiIndex PR01" \
     -i "pandas.MultiIndex.append PR07,SA01" \
     -i "pandas.MultiIndex.copy PR07,RT03,SA01" \
     -i "pandas.MultiIndex.drop PR07,RT03,SA01" \
-    -i "pandas.MultiIndex.dtypes SA01" \
     -i "pandas.MultiIndex.get_level_values SA01" \
     -i "pandas.MultiIndex.get_loc PR07" \
     -i "pandas.MultiIndex.get_loc_level PR07" \
-    -i "pandas.MultiIndex.levels SA01" \
     -i "pandas.MultiIndex.levshape SA01" \
     -i "pandas.MultiIndex.names SA01" \
-    -i "pandas.MultiIndex.nlevels SA01" \
     -i "pandas.MultiIndex.remove_unused_levels RT03,SA01" \
     -i "pandas.MultiIndex.reorder_levels RT03,SA01" \
-    -i "pandas.MultiIndex.set_codes SA01" \
     -i "pandas.MultiIndex.set_levels RT03,SA01" \
     -i "pandas.MultiIndex.sortlevel PR07,SA01" \
     -i "pandas.MultiIndex.to_frame RT03" \
-    -i "pandas.MultiIndex.truncate SA01" \
     -i "pandas.NA SA01" \
     -i "pandas.NaT SA01" \
-    -i "pandas.NamedAgg SA01" \
-    -i "pandas.Period SA01" \
     -i "pandas.Period.asfreq SA01" \
     -i "pandas.Period.freq GL08" \
     -i "pandas.Period.freqstr SA01" \
-    -i "pandas.Period.is_leap_year SA01" \
     -i "pandas.Period.month SA01" \
     -i "pandas.Period.now SA01" \
     -i "pandas.Period.ordinal GL08" \
-    -i "pandas.Period.quarter SA01" \
     -i "pandas.Period.strftime PR01,SA01" \
     -i "pandas.Period.to_timestamp SA01" \
     -i "pandas.Period.year SA01" \
@@ -167,29 +128,17 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
     -i "pandas.RangeIndex.start SA01" \
     -i "pandas.RangeIndex.step SA01" \
     -i "pandas.RangeIndex.stop SA01" \
-    -i "pandas.Series SA01" \
-    -i "pandas.Series.__iter__ RT03,SA01" \
-    -i "pandas.Series.add PR07" \
-    -i "pandas.Series.backfill PR01,SA01" \
-    -i "pandas.Series.case_when RT03" \
-    -i "pandas.Series.cat PR07,SA01" \
     -i "pandas.Series.cat.add_categories PR01,PR02" \
     -i "pandas.Series.cat.as_ordered PR01" \
     -i "pandas.Series.cat.as_unordered PR01" \
-    -i "pandas.Series.cat.codes SA01" \
     -i "pandas.Series.cat.remove_categories PR01,PR02" \
     -i "pandas.Series.cat.remove_unused_categories PR01" \
     -i "pandas.Series.cat.rename_categories PR01,PR02" \
     -i "pandas.Series.cat.reorder_categories PR01,PR02" \
     -i "pandas.Series.cat.set_categories PR01,PR02" \
-    -i "pandas.Series.div PR07" \
     -i "pandas.Series.dt.as_unit PR01,PR02" \
     -i "pandas.Series.dt.ceil PR01,PR02" \
-    -i "pandas.Series.dt.components SA01" \
     -i "pandas.Series.dt.day_name PR01,PR02" \
-    -i "pandas.Series.dt.days SA01" \
-    -i "pandas.Series.dt.days_in_month SA01" \
-    -i "pandas.Series.dt.daysinmonth SA01" \
     -i "pandas.Series.dt.floor PR01,PR02" \
     -i "pandas.Series.dt.freq GL08" \
     -i "pandas.Series.dt.microseconds SA01" \
@@ -205,47 +154,20 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
     -i "pandas.Series.dt.tz_convert PR01,PR02" \
     -i "pandas.Series.dt.tz_localize PR01,PR02" \
     -i "pandas.Series.dt.unit GL08" \
-    -i "pandas.Series.dtype SA01" \
-    -i "pandas.Series.eq PR07,SA01" \
-    -i "pandas.Series.floordiv PR07" \
-    -i "pandas.Series.ge PR07,SA01" \
-    -i "pandas.Series.gt PR07,SA01" \
-    -i "pandas.Series.hasnans SA01" \
-    -i "pandas.Series.is_monotonic_decreasing SA01" \
-    -i "pandas.Series.is_monotonic_increasing SA01" \
-    -i "pandas.Series.is_unique SA01" \
-    -i "pandas.Series.kurt RT03,SA01" \
-    -i "pandas.Series.kurtosis RT03,SA01" \
-    -i "pandas.Series.le PR07,SA01" \
+    -i "pandas.Series.ge SA01" \
+    -i "pandas.Series.gt SA01" \
     -i "pandas.Series.list.__getitem__ SA01" \
     -i "pandas.Series.list.flatten SA01" \
     -i "pandas.Series.list.len SA01" \
-    -i "pandas.Series.lt PR07,SA01" \
-    -i "pandas.Series.max RT03" \
-    -i "pandas.Series.mean RT03,SA01" \
-    -i "pandas.Series.median RT03,SA01" \
-    -i "pandas.Series.min RT03" \
-    -i "pandas.Series.mod PR07" \
-    -i "pandas.Series.mode SA01" \
-    -i "pandas.Series.mul PR07" \
-    -i "pandas.Series.ne PR07,SA01" \
+    -i "pandas.Series.lt SA01" \
+    -i "pandas.Series.ne SA01" \
     -i "pandas.Series.pad PR01,SA01" \
-    -i "pandas.Series.plot PR02,SA01" \
+    -i "pandas.Series.plot PR02" \
     -i "pandas.Series.pop RT03,SA01" \
-    -i "pandas.Series.pow PR07" \
     -i "pandas.Series.prod RT03" \
     -i "pandas.Series.product RT03" \
-    -i "pandas.Series.radd PR07" \
-    -i "pandas.Series.rdiv PR07" \
     -i "pandas.Series.reorder_levels RT03,SA01" \
-    -i "pandas.Series.rfloordiv PR07" \
-    -i "pandas.Series.rmod PR07" \
-    -i "pandas.Series.rmul PR07" \
-    -i "pandas.Series.rpow PR07" \
-    -i "pandas.Series.rsub PR07" \
-    -i "pandas.Series.rtruediv PR07" \
     -i "pandas.Series.sem PR01,RT03,SA01" \
-    -i "pandas.Series.shape SA01" \
     -i "pandas.Series.skew RT03,SA01" \
     -i "pandas.Series.sparse PR01,SA01" \
     -i "pandas.Series.sparse.density SA01" \
@@ -255,7 +177,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
     -i "pandas.Series.sparse.sp_values SA01" \
     -i "pandas.Series.sparse.to_coo PR07,RT03,SA01" \
     -i "pandas.Series.std PR01,RT03,SA01" \
-    -i "pandas.Series.str PR01,SA01" \
     -i "pandas.Series.str.capitalize RT03" \
     -i "pandas.Series.str.casefold RT03" \
     -i "pandas.Series.str.center RT03,SA01" \
@@ -281,32 +202,22 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
     -i "pandas.Series.str.strip RT03" \
     -i "pandas.Series.str.swapcase RT03" \
     -i "pandas.Series.str.title RT03" \
-    -i "pandas.Series.str.translate RT03,SA01" \
     -i "pandas.Series.str.upper RT03" \
     -i "pandas.Series.str.wrap RT03,SA01" \
     -i "pandas.Series.str.zfill RT03" \
     -i "pandas.Series.struct.dtypes SA01" \
-    -i "pandas.Series.sub PR07" \
-    -i "pandas.Series.sum RT03" \
-    -i "pandas.Series.swaplevel SA01" \
     -i "pandas.Series.to_dict SA01" \
     -i "pandas.Series.to_frame SA01" \
     -i "pandas.Series.to_markdown SA01" \
-    -i "pandas.Series.to_string SA01" \
-    -i "pandas.Series.truediv PR07" \
     -i "pandas.Series.update PR07,SA01" \
-    -i "pandas.Series.var PR01,RT03,SA01" \
-    -i "pandas.SparseDtype SA01" \
-    -i "pandas.Timedelta PR07,SA01" \
-    -i "pandas.Timedelta.as_unit SA01" \
     -i "pandas.Timedelta.asm8 SA01" \
     -i "pandas.Timedelta.ceil SA01" \
     -i "pandas.Timedelta.components SA01" \
     -i "pandas.Timedelta.days SA01" \
     -i "pandas.Timedelta.floor SA01" \
-    -i "pandas.Timedelta.max PR02,PR07,SA01" \
-    -i "pandas.Timedelta.min PR02,PR07,SA01" \
-    -i "pandas.Timedelta.resolution PR02,PR07,SA01" \
+    -i "pandas.Timedelta.max PR02" \
+    -i "pandas.Timedelta.min PR02" \
+    -i "pandas.Timedelta.resolution PR02" \
     -i "pandas.Timedelta.round SA01" \
     -i "pandas.Timedelta.to_numpy PR01" \
     -i "pandas.Timedelta.to_timedelta64 SA01" \
@@ -314,40 +225,23 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
     -i "pandas.Timedelta.view SA01" \
     -i "pandas.TimedeltaIndex.as_unit RT03,SA01" \
     -i "pandas.TimedeltaIndex.components SA01" \
-    -i "pandas.TimedeltaIndex.days SA01" \
     -i "pandas.TimedeltaIndex.microseconds SA01" \
     -i "pandas.TimedeltaIndex.nanoseconds SA01" \
     -i "pandas.TimedeltaIndex.seconds SA01" \
     -i "pandas.TimedeltaIndex.to_pytimedelta RT03,SA01" \
-    -i "pandas.Timestamp PR07,SA01" \
-    -i "pandas.Timestamp.as_unit SA01" \
-    -i "pandas.Timestamp.asm8 SA01" \
-    -i "pandas.Timestamp.astimezone SA01" \
-    -i "pandas.Timestamp.ceil SA01" \
     -i "pandas.Timestamp.combine PR01,SA01" \
     -i "pandas.Timestamp.ctime SA01" \
     -i "pandas.Timestamp.date SA01" \
     -i "pandas.Timestamp.day GL08" \
-    -i "pandas.Timestamp.day_name SA01" \
-    -i "pandas.Timestamp.day_of_week SA01" \
-    -i "pandas.Timestamp.day_of_year SA01" \
-    -i "pandas.Timestamp.dayofweek SA01" \
-    -i "pandas.Timestamp.dayofyear SA01" \
-    -i "pandas.Timestamp.days_in_month SA01" \
-    -i "pandas.Timestamp.daysinmonth SA01" \
-    -i "pandas.Timestamp.dst SA01" \
     -i "pandas.Timestamp.floor SA01" \
     -i "pandas.Timestamp.fold GL08" \
     -i "pandas.Timestamp.fromordinal SA01" \
     -i "pandas.Timestamp.fromtimestamp PR01,SA01" \
     -i "pandas.Timestamp.hour GL08" \
-    -i "pandas.Timestamp.is_leap_year SA01" \
-    -i "pandas.Timestamp.isocalendar SA01" \
-    -i "pandas.Timestamp.isoformat SA01" \
     -i "pandas.Timestamp.isoweekday SA01" \
-    -i "pandas.Timestamp.max PR02,PR07,SA01" \
+    -i "pandas.Timestamp.max PR02" \
     -i "pandas.Timestamp.microsecond GL08" \
-    -i "pandas.Timestamp.min PR02,PR07,SA01" \
+    -i "pandas.Timestamp.min PR02" \
     -i "pandas.Timestamp.minute GL08" \
     -i "pandas.Timestamp.month GL08" \
     -i "pandas.Timestamp.month_name SA01" \
@@ -356,10 +250,8 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
     -i "pandas.Timestamp.now SA01" \
     -i "pandas.Timestamp.quarter SA01" \
     -i "pandas.Timestamp.replace PR07,SA01" \
-    -i "pandas.Timestamp.resolution PR02,PR07,SA01" \
-    -i "pandas.Timestamp.round SA01" \
+    -i "pandas.Timestamp.resolution PR02" \
     -i "pandas.Timestamp.second GL08" \
-    -i "pandas.Timestamp.strftime SA01" \
     -i "pandas.Timestamp.strptime PR01,SA01" \
     -i "pandas.Timestamp.time SA01" \
     -i "pandas.Timestamp.timestamp SA01" \
@@ -369,40 +261,25 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
     -i "pandas.Timestamp.to_julian_date SA01" \
     -i "pandas.Timestamp.to_numpy PR01" \
     -i "pandas.Timestamp.to_period PR01,SA01" \
-    -i "pandas.Timestamp.to_pydatetime PR01,SA01" \
     -i "pandas.Timestamp.today SA01" \
     -i "pandas.Timestamp.toordinal SA01" \
     -i "pandas.Timestamp.tz SA01" \
-    -i "pandas.Timestamp.tz_convert SA01" \
     -i "pandas.Timestamp.tz_localize SA01" \
     -i "pandas.Timestamp.tzinfo GL08" \
     -i "pandas.Timestamp.tzname SA01" \
     -i "pandas.Timestamp.unit SA01" \
     -i "pandas.Timestamp.utcfromtimestamp PR01,SA01" \
-    -i "pandas.Timestamp.utcnow SA01" \
     -i "pandas.Timestamp.utcoffset SA01" \
     -i "pandas.Timestamp.utctimetuple SA01" \
     -i "pandas.Timestamp.value GL08" \
-    -i "pandas.Timestamp.week SA01" \
-    -i "pandas.Timestamp.weekday SA01" \
-    -i "pandas.Timestamp.weekofyear SA01" \
     -i "pandas.Timestamp.year GL08" \
-    -i "pandas.api.extensions.ExtensionArray SA01" \
-    -i "pandas.api.extensions.ExtensionArray._accumulate RT03,SA01" \
-    -i "pandas.api.extensions.ExtensionArray._concat_same_type PR07,SA01" \
-    -i "pandas.api.extensions.ExtensionArray._formatter SA01" \
-    -i "pandas.api.extensions.ExtensionArray._from_sequence SA01" \
-    -i "pandas.api.extensions.ExtensionArray._from_sequence_of_strings SA01" \
-    -i "pandas.api.extensions.ExtensionArray._hash_pandas_object RT03,SA01" \
     -i "pandas.api.extensions.ExtensionArray._pad_or_backfill PR01,RT03,SA01" \
     -i "pandas.api.extensions.ExtensionArray._reduce RT03,SA01" \
     -i "pandas.api.extensions.ExtensionArray._values_for_factorize SA01" \
     -i "pandas.api.extensions.ExtensionArray.astype SA01" \
-    -i "pandas.api.extensions.ExtensionArray.copy RT03,SA01" \
     -i "pandas.api.extensions.ExtensionArray.dropna RT03,SA01" \
     -i "pandas.api.extensions.ExtensionArray.dtype SA01" \
     -i "pandas.api.extensions.ExtensionArray.duplicated RT03,SA01" \
-    -i "pandas.api.extensions.ExtensionArray.equals SA01" \
     -i "pandas.api.extensions.ExtensionArray.fillna SA01" \
     -i "pandas.api.extensions.ExtensionArray.insert PR07,RT03,SA01" \
     -i "pandas.api.extensions.ExtensionArray.interpolate PR01,SA01" \
@@ -411,25 +288,18 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
     -i "pandas.api.extensions.ExtensionArray.nbytes SA01" \
     -i "pandas.api.extensions.ExtensionArray.ndim SA01" \
     -i "pandas.api.extensions.ExtensionArray.ravel RT03,SA01" \
-    -i "pandas.api.extensions.ExtensionArray.shape SA01" \
-    -i "pandas.api.extensions.ExtensionArray.shift SA01" \
     -i "pandas.api.extensions.ExtensionArray.take RT03" \
     -i "pandas.api.extensions.ExtensionArray.tolist RT03,SA01" \
     -i "pandas.api.extensions.ExtensionArray.unique RT03,SA01" \
     -i "pandas.api.extensions.ExtensionArray.view SA01" \
-    -i "pandas.api.extensions.register_extension_dtype SA01" \
-    -i "pandas.api.indexers.BaseIndexer PR01,SA01" \
-    -i "pandas.api.indexers.FixedForwardWindowIndexer PR01,SA01" \
     -i "pandas.api.indexers.VariableOffsetWindowIndexer PR01,SA01" \
     -i "pandas.api.interchange.from_dataframe RT03,SA01" \
-    -i "pandas.api.types.infer_dtype PR07,SA01" \
     -i "pandas.api.types.is_any_real_numeric_dtype SA01" \
     -i "pandas.api.types.is_bool PR01,SA01" \
     -i "pandas.api.types.is_bool_dtype SA01" \
     -i "pandas.api.types.is_categorical_dtype SA01" \
     -i "pandas.api.types.is_complex PR01,SA01" \
     -i "pandas.api.types.is_complex_dtype SA01" \
-    -i "pandas.api.types.is_datetime64_any_dtype SA01" \
     -i "pandas.api.types.is_datetime64_dtype SA01" \
     -i "pandas.api.types.is_datetime64_ns_dtype SA01" \
     -i "pandas.api.types.is_datetime64tz_dtype SA01" \
@@ -451,11 +321,9 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
     -i "pandas.api.types.is_period_dtype SA01" \
     -i "pandas.api.types.is_re PR07,SA01" \
     -i "pandas.api.types.is_re_compilable PR07,SA01" \
-    -i "pandas.api.types.is_scalar SA01" \
     -i "pandas.api.types.is_signed_integer_dtype SA01" \
     -i "pandas.api.types.is_sparse SA01" \
     -i "pandas.api.types.is_string_dtype SA01" \
-    -i "pandas.api.types.is_timedelta64_dtype SA01" \
     -i "pandas.api.types.is_timedelta64_ns_dtype SA01" \
     -i "pandas.api.types.is_unsigned_integer_dtype SA01" \
     -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \
@@ -465,19 +333,13 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
     -i "pandas.arrays.DatetimeArray SA01" \
     -i "pandas.arrays.FloatingArray SA01" \
     -i "pandas.arrays.IntegerArray SA01" \
-    -i "pandas.arrays.IntervalArray.closed SA01" \
-    -i "pandas.arrays.IntervalArray.contains RT03" \
-    -i "pandas.arrays.IntervalArray.is_non_overlapping_monotonic SA01" \
     -i "pandas.arrays.IntervalArray.left SA01" \
     -i "pandas.arrays.IntervalArray.length SA01" \
     -i "pandas.arrays.IntervalArray.mid SA01" \
     -i "pandas.arrays.IntervalArray.right SA01" \
-    -i "pandas.arrays.IntervalArray.set_closed RT03,SA01" \
-    -i "pandas.arrays.IntervalArray.to_tuples RT03,SA01" \
     -i "pandas.arrays.NumpyExtensionArray SA01" \
     -i "pandas.arrays.SparseArray PR07,SA01" \
     -i "pandas.arrays.TimedeltaArray PR07,SA01" \
-    -i "pandas.bdate_range RT03,SA01" \
     -i "pandas.core.groupby.DataFrameGroupBy.__iter__ RT03,SA01" \
     -i "pandas.core.groupby.DataFrameGroupBy.agg RT03" \
     -i "pandas.core.groupby.DataFrameGroupBy.aggregate RT03" \
@@ -493,7 +355,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
     -i "pandas.core.groupby.DataFrameGroupBy.nth PR02" \
     -i "pandas.core.groupby.DataFrameGroupBy.nunique SA01" \
     -i "pandas.core.groupby.DataFrameGroupBy.ohlc SA01" \
-    -i "pandas.core.groupby.DataFrameGroupBy.plot PR02,SA01" \
+    -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \
     -i "pandas.core.groupby.DataFrameGroupBy.prod SA01" \
     -i "pandas.core.groupby.DataFrameGroupBy.sem SA01" \
     -i "pandas.core.groupby.DataFrameGroupBy.sum SA01" \
@@ -511,7 +373,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
     -i "pandas.core.groupby.SeriesGroupBy.min SA01" \
     -i "pandas.core.groupby.SeriesGroupBy.nth PR02" \
     -i "pandas.core.groupby.SeriesGroupBy.ohlc SA01" \
-    -i "pandas.core.groupby.SeriesGroupBy.plot PR02,SA01" \
+    -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \
     -i "pandas.core.groupby.SeriesGroupBy.prod SA01" \
     -i "pandas.core.groupby.SeriesGroupBy.sem SA01" \
     -i "pandas.core.groupby.SeriesGroupBy.sum SA01" \
@@ -538,7 +400,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
     -i "pandas.core.window.rolling.Window.std PR01" \
     -i "pandas.core.window.rolling.Window.var PR01" \
     -i "pandas.date_range RT03" \
-    -i "pandas.describe_option SA01" \
     -i "pandas.errors.AbstractMethodError PR01,SA01" \
     -i "pandas.errors.AttributeConflictWarning SA01" \
     -i "pandas.errors.CSSWarning SA01" \
@@ -566,9 +427,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
     -i "pandas.errors.UnsortedIndexError SA01" \
     -i "pandas.errors.UnsupportedFunctionCall SA01" \
     -i "pandas.errors.ValueLabelTypeMismatch SA01" \
-    -i "pandas.get_option SA01" \
     -i "pandas.infer_freq SA01" \
-    -i "pandas.interval_range RT03" \
     -i "pandas.io.formats.style.Styler.apply RT03" \
     -i "pandas.io.formats.style.Styler.apply_index RT03" \
     -i "pandas.io.formats.style.Styler.background_gradient RT03" \
@@ -603,127 +462,74 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
     -i "pandas.io.stata.StataReader.variable_labels RT03,SA01" \
     -i "pandas.io.stata.StataWriter.write_file SA01" \
     -i "pandas.json_normalize RT03,SA01" \
-    -i "pandas.merge PR07" \
-    -i "pandas.merge_asof PR07,RT03" \
-    -i "pandas.merge_ordered PR07" \
-    -i "pandas.option_context SA01" \
     -i "pandas.period_range RT03,SA01" \
-    -i "pandas.pivot PR07" \
-    -i "pandas.pivot_table PR07" \
     -i "pandas.plotting.andrews_curves RT03,SA01" \
-    -i "pandas.plotting.autocorrelation_plot RT03,SA01" \
     -i "pandas.plotting.lag_plot RT03,SA01" \
-    -i "pandas.plotting.parallel_coordinates PR07,RT03,SA01" \
-    -i "pandas.plotting.plot_params SA01" \
     -i "pandas.plotting.scatter_matrix PR07,SA01" \
-    -i "pandas.plotting.table PR07,RT03,SA01" \
-    -i "pandas.qcut PR07,SA01" \
-    -i "pandas.read_feather SA01" \
-    -i "pandas.read_orc SA01" \
-    -i "pandas.read_sas SA01" \
-    -i "pandas.read_spss SA01" \
-    -i "pandas.reset_option SA01" \
     -i "pandas.set_eng_float_format RT03,SA01" \
-    -i "pandas.set_option SA01" \
-    -i "pandas.show_versions SA01" \
-    -i "pandas.test SA01" \
     -i "pandas.testing.assert_extension_array_equal SA01" \
-    -i "pandas.testing.assert_index_equal PR07,SA01" \
-    -i "pandas.testing.assert_series_equal PR07,SA01" \
-    -i "pandas.timedelta_range SA01" \
-    -i "pandas.tseries.api.guess_datetime_format SA01" \
     -i "pandas.tseries.offsets.BDay PR02,SA01" \
-    -i "pandas.tseries.offsets.BMonthBegin PR02" \
-    -i "pandas.tseries.offsets.BMonthEnd PR02" \
     -i "pandas.tseries.offsets.BQuarterBegin PR02" \
-    -i "pandas.tseries.offsets.BQuarterBegin.copy SA01" \
     -i "pandas.tseries.offsets.BQuarterBegin.freqstr SA01" \
     -i "pandas.tseries.offsets.BQuarterBegin.is_on_offset GL08" \
-    -i "pandas.tseries.offsets.BQuarterBegin.kwds SA01" \
     -i "pandas.tseries.offsets.BQuarterBegin.n GL08" \
-    -i "pandas.tseries.offsets.BQuarterBegin.name SA01" \
     -i "pandas.tseries.offsets.BQuarterBegin.nanos GL08" \
     -i "pandas.tseries.offsets.BQuarterBegin.normalize GL08" \
     -i "pandas.tseries.offsets.BQuarterBegin.rule_code GL08" \
     -i "pandas.tseries.offsets.BQuarterBegin.startingMonth GL08" \
-    -i "pandas.tseries.offsets.BQuarterEnd PR02" \
-    -i "pandas.tseries.offsets.BQuarterEnd.copy SA01" \
     -i "pandas.tseries.offsets.BQuarterEnd.freqstr SA01" \
     -i "pandas.tseries.offsets.BQuarterEnd.is_on_offset GL08" \
-    -i "pandas.tseries.offsets.BQuarterEnd.kwds SA01" \
     -i "pandas.tseries.offsets.BQuarterEnd.n GL08" \
-    -i "pandas.tseries.offsets.BQuarterEnd.name SA01" \
     -i "pandas.tseries.offsets.BQuarterEnd.nanos GL08" \
     -i "pandas.tseries.offsets.BQuarterEnd.normalize GL08" \
     -i "pandas.tseries.offsets.BQuarterEnd.rule_code GL08" \
     -i "pandas.tseries.offsets.BQuarterEnd.startingMonth GL08" \
-    -i "pandas.tseries.offsets.BYearBegin PR02" \
-    -i "pandas.tseries.offsets.BYearBegin.copy SA01" \
     -i "pandas.tseries.offsets.BYearBegin.freqstr SA01" \
     -i "pandas.tseries.offsets.BYearBegin.is_on_offset GL08" \
-    -i "pandas.tseries.offsets.BYearBegin.kwds SA01" \
     -i "pandas.tseries.offsets.BYearBegin.month GL08" \
     -i "pandas.tseries.offsets.BYearBegin.n GL08" \
-    -i "pandas.tseries.offsets.BYearBegin.name SA01" \
     -i "pandas.tseries.offsets.BYearBegin.nanos GL08" \
     -i "pandas.tseries.offsets.BYearBegin.normalize GL08" \
     -i "pandas.tseries.offsets.BYearBegin.rule_code GL08" \
     -i "pandas.tseries.offsets.BYearEnd PR02" \
-    -i "pandas.tseries.offsets.BYearEnd.copy SA01" \
     -i "pandas.tseries.offsets.BYearEnd.freqstr SA01" \
     -i "pandas.tseries.offsets.BYearEnd.is_on_offset GL08" \
-    -i "pandas.tseries.offsets.BYearEnd.kwds SA01" \
     -i "pandas.tseries.offsets.BYearEnd.month GL08" \
     -i "pandas.tseries.offsets.BYearEnd.n GL08" \
-    -i "pandas.tseries.offsets.BYearEnd.name SA01" \
     -i "pandas.tseries.offsets.BYearEnd.nanos GL08" \
     -i "pandas.tseries.offsets.BYearEnd.normalize GL08" \
     -i "pandas.tseries.offsets.BYearEnd.rule_code GL08" \
     -i "pandas.tseries.offsets.BusinessDay PR02,SA01" \
     -i "pandas.tseries.offsets.BusinessDay.calendar GL08" \
-    -i "pandas.tseries.offsets.BusinessDay.copy SA01" \
     -i "pandas.tseries.offsets.BusinessDay.freqstr SA01" \
     -i "pandas.tseries.offsets.BusinessDay.holidays GL08" \
     -i "pandas.tseries.offsets.BusinessDay.is_on_offset GL08" \
-    -i "pandas.tseries.offsets.BusinessDay.kwds SA01" \
     -i "pandas.tseries.offsets.BusinessDay.n GL08" \
-    -i "pandas.tseries.offsets.BusinessDay.name SA01" \
     -i "pandas.tseries.offsets.BusinessDay.nanos GL08" \
     -i "pandas.tseries.offsets.BusinessDay.normalize GL08" \
     -i "pandas.tseries.offsets.BusinessDay.rule_code GL08" \
     -i "pandas.tseries.offsets.BusinessDay.weekmask GL08" \
     -i "pandas.tseries.offsets.BusinessHour PR02,SA01" \
     -i "pandas.tseries.offsets.BusinessHour.calendar GL08" \
-    -i "pandas.tseries.offsets.BusinessHour.copy SA01" \
     -i "pandas.tseries.offsets.BusinessHour.end GL08" \
     -i "pandas.tseries.offsets.BusinessHour.freqstr SA01" \
     -i "pandas.tseries.offsets.BusinessHour.holidays GL08" \
     -i "pandas.tseries.offsets.BusinessHour.is_on_offset GL08" \
-    -i "pandas.tseries.offsets.BusinessHour.kwds SA01" \
     -i "pandas.tseries.offsets.BusinessHour.n GL08" \
-    -i "pandas.tseries.offsets.BusinessHour.name SA01" \
     -i "pandas.tseries.offsets.BusinessHour.nanos GL08" \
     -i "pandas.tseries.offsets.BusinessHour.normalize GL08" \
     -i "pandas.tseries.offsets.BusinessHour.rule_code GL08" \
     -i "pandas.tseries.offsets.BusinessHour.start GL08" \
     -i "pandas.tseries.offsets.BusinessHour.weekmask GL08" \
-    -i "pandas.tseries.offsets.BusinessMonthBegin PR02" \
-    -i "pandas.tseries.offsets.BusinessMonthBegin.copy SA01" \
     -i "pandas.tseries.offsets.BusinessMonthBegin.freqstr SA01" \
     -i "pandas.tseries.offsets.BusinessMonthBegin.is_on_offset GL08" \
-    -i "pandas.tseries.offsets.BusinessMonthBegin.kwds SA01" \
     -i "pandas.tseries.offsets.BusinessMonthBegin.n GL08" \
-    -i "pandas.tseries.offsets.BusinessMonthBegin.name SA01" \
     -i "pandas.tseries.offsets.BusinessMonthBegin.nanos GL08" \
     -i "pandas.tseries.offsets.BusinessMonthBegin.normalize GL08" \
     -i "pandas.tseries.offsets.BusinessMonthBegin.rule_code GL08" \
-    -i "pandas.tseries.offsets.BusinessMonthEnd PR02" \
-    -i "pandas.tseries.offsets.BusinessMonthEnd.copy SA01" \
     -i "pandas.tseries.offsets.BusinessMonthEnd.freqstr SA01" \
     -i "pandas.tseries.offsets.BusinessMonthEnd.is_on_offset GL08" \
-    -i "pandas.tseries.offsets.BusinessMonthEnd.kwds SA01" \
     -i "pandas.tseries.offsets.BusinessMonthEnd.n GL08" \
-    -i "pandas.tseries.offsets.BusinessMonthEnd.name SA01" \
     -i "pandas.tseries.offsets.BusinessMonthEnd.nanos GL08" \
     -i "pandas.tseries.offsets.BusinessMonthEnd.normalize GL08" \
     -i "pandas.tseries.offsets.BusinessMonthEnd.rule_code GL08" \
@@ -732,27 +538,21 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
     -i "pandas.tseries.offsets.CDay PR02,SA01" \
     -i "pandas.tseries.offsets.CustomBusinessDay PR02,SA01" \
     -i "pandas.tseries.offsets.CustomBusinessDay.calendar GL08" \
-    -i "pandas.tseries.offsets.CustomBusinessDay.copy SA01" \
     -i "pandas.tseries.offsets.CustomBusinessDay.freqstr SA01" \
     -i "pandas.tseries.offsets.CustomBusinessDay.holidays GL08" \
     -i "pandas.tseries.offsets.CustomBusinessDay.is_on_offset GL08" \
-    -i "pandas.tseries.offsets.CustomBusinessDay.kwds SA01" \
     -i "pandas.tseries.offsets.CustomBusinessDay.n GL08" \
-    -i "pandas.tseries.offsets.CustomBusinessDay.name SA01" \
     -i "pandas.tseries.offsets.CustomBusinessDay.nanos GL08" \
     -i "pandas.tseries.offsets.CustomBusinessDay.normalize GL08" \
     -i "pandas.tseries.offsets.CustomBusinessDay.rule_code GL08" \
     -i "pandas.tseries.offsets.CustomBusinessDay.weekmask GL08" \
     -i "pandas.tseries.offsets.CustomBusinessHour PR02,SA01" \
     -i "pandas.tseries.offsets.CustomBusinessHour.calendar GL08" \
-    -i "pandas.tseries.offsets.CustomBusinessHour.copy SA01" \
     -i "pandas.tseries.offsets.CustomBusinessHour.end GL08" \
     -i "pandas.tseries.offsets.CustomBusinessHour.freqstr SA01" \
     -i "pandas.tseries.offsets.CustomBusinessHour.holidays GL08" \
     -i "pandas.tseries.offsets.CustomBusinessHour.is_on_offset GL08" \
-    -i "pandas.tseries.offsets.CustomBusinessHour.kwds SA01" \
     -i "pandas.tseries.offsets.CustomBusinessHour.n GL08" \
-    -i "pandas.tseries.offsets.CustomBusinessHour.name SA01" \
     -i "pandas.tseries.offsets.CustomBusinessHour.nanos GL08" \
     -i "pandas.tseries.offsets.CustomBusinessHour.normalize GL08" \
     -i "pandas.tseries.offsets.CustomBusinessHour.rule_code GL08" \
@@ -760,71 +560,52 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
     -i "pandas.tseries.offsets.CustomBusinessHour.weekmask GL08" \
     -i "pandas.tseries.offsets.CustomBusinessMonthBegin PR02" \
     -i "pandas.tseries.offsets.CustomBusinessMonthBegin.calendar GL08" \
-    -i "pandas.tseries.offsets.CustomBusinessMonthBegin.copy SA01" \
     -i "pandas.tseries.offsets.CustomBusinessMonthBegin.freqstr SA01" \
     -i "pandas.tseries.offsets.CustomBusinessMonthBegin.holidays GL08" \
     -i "pandas.tseries.offsets.CustomBusinessMonthBegin.is_on_offset SA01" \
-    -i "pandas.tseries.offsets.CustomBusinessMonthBegin.kwds SA01" \
     -i "pandas.tseries.offsets.CustomBusinessMonthBegin.m_offset GL08" \
     -i "pandas.tseries.offsets.CustomBusinessMonthBegin.n GL08" \
-    -i "pandas.tseries.offsets.CustomBusinessMonthBegin.name SA01" \
     -i "pandas.tseries.offsets.CustomBusinessMonthBegin.nanos GL08" \
     -i "pandas.tseries.offsets.CustomBusinessMonthBegin.normalize GL08" \
     -i "pandas.tseries.offsets.CustomBusinessMonthBegin.rule_code GL08" \
     -i "pandas.tseries.offsets.CustomBusinessMonthBegin.weekmask GL08" \
     -i "pandas.tseries.offsets.CustomBusinessMonthEnd PR02" \
     -i "pandas.tseries.offsets.CustomBusinessMonthEnd.calendar GL08" \
-    -i "pandas.tseries.offsets.CustomBusinessMonthEnd.copy SA01" \
     -i "pandas.tseries.offsets.CustomBusinessMonthEnd.freqstr SA01" \
     -i "pandas.tseries.offsets.CustomBusinessMonthEnd.holidays GL08" \
     -i "pandas.tseries.offsets.CustomBusinessMonthEnd.is_on_offset SA01" \
-    -i "pandas.tseries.offsets.CustomBusinessMonthEnd.kwds SA01" \
     -i "pandas.tseries.offsets.CustomBusinessMonthEnd.m_offset GL08" \
     -i "pandas.tseries.offsets.CustomBusinessMonthEnd.n GL08" \
-    -i "pandas.tseries.offsets.CustomBusinessMonthEnd.name SA01" \
     -i "pandas.tseries.offsets.CustomBusinessMonthEnd.nanos GL08" \
     -i "pandas.tseries.offsets.CustomBusinessMonthEnd.normalize GL08" \
     -i "pandas.tseries.offsets.CustomBusinessMonthEnd.rule_code GL08" \
     -i "pandas.tseries.offsets.CustomBusinessMonthEnd.weekmask GL08" \
     -i "pandas.tseries.offsets.DateOffset PR02" \
-    -i "pandas.tseries.offsets.DateOffset.copy SA01" \
     -i "pandas.tseries.offsets.DateOffset.freqstr SA01" \
     -i "pandas.tseries.offsets.DateOffset.is_on_offset GL08" \
-    -i "pandas.tseries.offsets.DateOffset.kwds SA01" \
     -i "pandas.tseries.offsets.DateOffset.n GL08" \
-    -i "pandas.tseries.offsets.DateOffset.name SA01" \
     -i "pandas.tseries.offsets.DateOffset.nanos GL08" \
     -i "pandas.tseries.offsets.DateOffset.normalize GL08" \
     -i "pandas.tseries.offsets.DateOffset.rule_code GL08" \
-    -i "pandas.tseries.offsets.Day PR02" \
-    -i "pandas.tseries.offsets.Day.copy SA01" \
     -i "pandas.tseries.offsets.Day.freqstr SA01" \
     -i "pandas.tseries.offsets.Day.is_on_offset GL08" \
-    -i "pandas.tseries.offsets.Day.kwds SA01" \
     -i "pandas.tseries.offsets.Day.n GL08" \
-    -i "pandas.tseries.offsets.Day.name SA01" \
     -i "pandas.tseries.offsets.Day.nanos SA01" \
     -i "pandas.tseries.offsets.Day.normalize GL08" \
     -i "pandas.tseries.offsets.Day.rule_code GL08" \
     -i "pandas.tseries.offsets.Easter PR02" \
-    -i "pandas.tseries.offsets.Easter.copy SA01" \
     -i "pandas.tseries.offsets.Easter.freqstr SA01" \
     -i "pandas.tseries.offsets.Easter.is_on_offset GL08" \
-    -i "pandas.tseries.offsets.Easter.kwds SA01" \
     -i "pandas.tseries.offsets.Easter.n GL08" \
-    -i "pandas.tseries.offsets.Easter.name SA01" \
     -i "pandas.tseries.offsets.Easter.nanos GL08" \
     -i "pandas.tseries.offsets.Easter.normalize GL08" \
     -i "pandas.tseries.offsets.Easter.rule_code GL08" \
     -i "pandas.tseries.offsets.FY5253 PR02" \
-    -i "pandas.tseries.offsets.FY5253.copy SA01" \
     -i "pandas.tseries.offsets.FY5253.freqstr SA01" \
     -i "pandas.tseries.offsets.FY5253.get_rule_code_suffix GL08" \
     -i "pandas.tseries.offsets.FY5253.get_year_end GL08" \
     -i "pandas.tseries.offsets.FY5253.is_on_offset GL08" \
-    -i "pandas.tseries.offsets.FY5253.kwds SA01" \
     -i "pandas.tseries.offsets.FY5253.n GL08" \
-    -i "pandas.tseries.offsets.FY5253.name SA01" \
     -i "pandas.tseries.offsets.FY5253.nanos GL08" \
     -i "pandas.tseries.offsets.FY5253.normalize GL08" \
     -i "pandas.tseries.offsets.FY5253.rule_code GL08" \
@@ -832,14 +613,11 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
     -i "pandas.tseries.offsets.FY5253.variation GL08" \
     -i "pandas.tseries.offsets.FY5253.weekday GL08" \
     -i "pandas.tseries.offsets.FY5253Quarter PR02" \
-    -i "pandas.tseries.offsets.FY5253Quarter.copy SA01" \
     -i "pandas.tseries.offsets.FY5253Quarter.freqstr SA01" \
     -i "pandas.tseries.offsets.FY5253Quarter.get_rule_code_suffix GL08" \
     -i "pandas.tseries.offsets.FY5253Quarter.get_weeks GL08" \
     -i "pandas.tseries.offsets.FY5253Quarter.is_on_offset GL08" \
-    -i "pandas.tseries.offsets.FY5253Quarter.kwds SA01" \
     -i "pandas.tseries.offsets.FY5253Quarter.n GL08" \
-    -i "pandas.tseries.offsets.FY5253Quarter.name SA01" \
     -i "pandas.tseries.offsets.FY5253Quarter.nanos GL08" \
     -i "pandas.tseries.offsets.FY5253Quarter.normalize GL08" \
     -i "pandas.tseries.offsets.FY5253Quarter.qtr_with_extra_week GL08" \
@@ -849,198 +627,138 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
     -i "pandas.tseries.offsets.FY5253Quarter.weekday GL08" \
     -i "pandas.tseries.offsets.FY5253Quarter.year_has_extra_week GL08" \
     -i "pandas.tseries.offsets.Hour PR02" \
-    -i "pandas.tseries.offsets.Hour.copy SA01" \
     -i "pandas.tseries.offsets.Hour.freqstr SA01" \
     -i "pandas.tseries.offsets.Hour.is_on_offset GL08" \
-    -i "pandas.tseries.offsets.Hour.kwds SA01" \
     -i "pandas.tseries.offsets.Hour.n GL08" \
-    -i "pandas.tseries.offsets.Hour.name SA01" \
     -i "pandas.tseries.offsets.Hour.nanos SA01" \
     -i "pandas.tseries.offsets.Hour.normalize GL08" \
     -i "pandas.tseries.offsets.Hour.rule_code GL08" \
     -i "pandas.tseries.offsets.LastWeekOfMonth PR02,SA01" \
-    -i "pandas.tseries.offsets.LastWeekOfMonth.copy SA01" \
     -i "pandas.tseries.offsets.LastWeekOfMonth.freqstr SA01" \
     -i "pandas.tseries.offsets.LastWeekOfMonth.is_on_offset GL08" \
-    -i "pandas.tseries.offsets.LastWeekOfMonth.kwds SA01" \
     -i "pandas.tseries.offsets.LastWeekOfMonth.n GL08" \
-    -i "pandas.tseries.offsets.LastWeekOfMonth.name SA01" \
     -i "pandas.tseries.offsets.LastWeekOfMonth.nanos GL08" \
     -i "pandas.tseries.offsets.LastWeekOfMonth.normalize GL08" \
     -i "pandas.tseries.offsets.LastWeekOfMonth.rule_code GL08" \
     -i "pandas.tseries.offsets.LastWeekOfMonth.week GL08" \
     -i "pandas.tseries.offsets.LastWeekOfMonth.weekday GL08" \
     -i "pandas.tseries.offsets.Micro PR02" \
-    -i "pandas.tseries.offsets.Micro.copy SA01" \
     -i "pandas.tseries.offsets.Micro.freqstr SA01" \
     -i "pandas.tseries.offsets.Micro.is_on_offset GL08" \
-    -i "pandas.tseries.offsets.Micro.kwds SA01" \
     -i "pandas.tseries.offsets.Micro.n GL08" \
-    -i "pandas.tseries.offsets.Micro.name SA01" \
     -i "pandas.tseries.offsets.Micro.nanos SA01" \
     -i "pandas.tseries.offsets.Micro.normalize GL08" \
     -i "pandas.tseries.offsets.Micro.rule_code GL08" \
     -i "pandas.tseries.offsets.Milli PR02" \
-    -i "pandas.tseries.offsets.Milli.copy SA01" \
     -i "pandas.tseries.offsets.Milli.freqstr SA01" \
     -i "pandas.tseries.offsets.Milli.is_on_offset GL08" \
-    -i "pandas.tseries.offsets.Milli.kwds SA01" \
     -i "pandas.tseries.offsets.Milli.n GL08" \
-    -i "pandas.tseries.offsets.Milli.name SA01" \
     -i "pandas.tseries.offsets.Milli.nanos SA01" \
     -i "pandas.tseries.offsets.Milli.normalize GL08" \
     -i "pandas.tseries.offsets.Milli.rule_code GL08" \
     -i "pandas.tseries.offsets.Minute PR02" \
-    -i "pandas.tseries.offsets.Minute.copy SA01" \
     -i "pandas.tseries.offsets.Minute.freqstr SA01" \
     -i "pandas.tseries.offsets.Minute.is_on_offset GL08" \
-    -i "pandas.tseries.offsets.Minute.kwds SA01" \
     -i "pandas.tseries.offsets.Minute.n GL08" \
-    -i "pandas.tseries.offsets.Minute.name SA01" \
     -i "pandas.tseries.offsets.Minute.nanos SA01" \
     -i "pandas.tseries.offsets.Minute.normalize GL08" \
     -i "pandas.tseries.offsets.Minute.rule_code GL08" \
     -i "pandas.tseries.offsets.MonthBegin PR02" \
-    -i "pandas.tseries.offsets.MonthBegin.copy SA01" \
     -i "pandas.tseries.offsets.MonthBegin.freqstr SA01" \
     -i "pandas.tseries.offsets.MonthBegin.is_on_offset GL08" \
-    -i "pandas.tseries.offsets.MonthBegin.kwds SA01" \
     -i "pandas.tseries.offsets.MonthBegin.n GL08" \
-    -i "pandas.tseries.offsets.MonthBegin.name SA01" \
     -i "pandas.tseries.offsets.MonthBegin.nanos GL08" \
     -i "pandas.tseries.offsets.MonthBegin.normalize GL08" \
     -i "pandas.tseries.offsets.MonthBegin.rule_code GL08" \
-    -i "pandas.tseries.offsets.MonthEnd PR02" \
-    -i "pandas.tseries.offsets.MonthEnd.copy SA01" \
     -i "pandas.tseries.offsets.MonthEnd.freqstr SA01" \
     -i "pandas.tseries.offsets.MonthEnd.is_on_offset GL08" \
-    -i "pandas.tseries.offsets.MonthEnd.kwds SA01" \
     -i "pandas.tseries.offsets.MonthEnd.n GL08" \
-    -i "pandas.tseries.offsets.MonthEnd.name SA01" \
     -i "pandas.tseries.offsets.MonthEnd.nanos GL08" \
     -i "pandas.tseries.offsets.MonthEnd.normalize GL08" \
     -i "pandas.tseries.offsets.MonthEnd.rule_code GL08" \
     -i "pandas.tseries.offsets.Nano PR02" \
-    -i "pandas.tseries.offsets.Nano.copy SA01" \
     -i "pandas.tseries.offsets.Nano.freqstr SA01" \
     -i "pandas.tseries.offsets.Nano.is_on_offset GL08" \
-    -i "pandas.tseries.offsets.Nano.kwds SA01" \
     -i "pandas.tseries.offsets.Nano.n GL08" \
-    -i "pandas.tseries.offsets.Nano.name SA01" \
     -i "pandas.tseries.offsets.Nano.nanos SA01" \
     -i "pandas.tseries.offsets.Nano.normalize GL08" \
     -i "pandas.tseries.offsets.Nano.rule_code GL08" \
     -i "pandas.tseries.offsets.QuarterBegin PR02" \
-    -i "pandas.tseries.offsets.QuarterBegin.copy SA01" \
     -i "pandas.tseries.offsets.QuarterBegin.freqstr SA01" \
     -i "pandas.tseries.offsets.QuarterBegin.is_on_offset GL08" \
-    -i "pandas.tseries.offsets.QuarterBegin.kwds SA01" \
     -i "pandas.tseries.offsets.QuarterBegin.n GL08" \
-    -i "pandas.tseries.offsets.QuarterBegin.name SA01" \
     -i "pandas.tseries.offsets.QuarterBegin.nanos GL08" \
     -i "pandas.tseries.offsets.QuarterBegin.normalize GL08" \
     -i "pandas.tseries.offsets.QuarterBegin.rule_code GL08" \
     -i "pandas.tseries.offsets.QuarterBegin.startingMonth GL08" \
-    -i "pandas.tseries.offsets.QuarterEnd PR02" \
-    -i "pandas.tseries.offsets.QuarterEnd.copy SA01" \
     -i "pandas.tseries.offsets.QuarterEnd.freqstr SA01" \
     -i "pandas.tseries.offsets.QuarterEnd.is_on_offset GL08" \
-    -i "pandas.tseries.offsets.QuarterEnd.kwds SA01" \
     -i "pandas.tseries.offsets.QuarterEnd.n GL08" \
-    -i "pandas.tseries.offsets.QuarterEnd.name SA01" \
     -i "pandas.tseries.offsets.QuarterEnd.nanos GL08" \
     -i "pandas.tseries.offsets.QuarterEnd.normalize GL08" \
     -i "pandas.tseries.offsets.QuarterEnd.rule_code GL08" \
     -i "pandas.tseries.offsets.QuarterEnd.startingMonth GL08" \
     -i "pandas.tseries.offsets.Second PR02" \
-    -i "pandas.tseries.offsets.Second.copy SA01" \
     -i "pandas.tseries.offsets.Second.freqstr SA01" \
     -i "pandas.tseries.offsets.Second.is_on_offset GL08" \
-    -i "pandas.tseries.offsets.Second.kwds SA01" \
     -i "pandas.tseries.offsets.Second.n GL08" \
-    -i "pandas.tseries.offsets.Second.name SA01" \
     -i "pandas.tseries.offsets.Second.nanos SA01" \
     -i "pandas.tseries.offsets.Second.normalize GL08" \
     -i "pandas.tseries.offsets.Second.rule_code GL08" \
     -i "pandas.tseries.offsets.SemiMonthBegin PR02,SA01" \
-    -i "pandas.tseries.offsets.SemiMonthBegin.copy SA01" \
     -i "pandas.tseries.offsets.SemiMonthBegin.day_of_month GL08" \
     -i "pandas.tseries.offsets.SemiMonthBegin.freqstr SA01" \
     -i "pandas.tseries.offsets.SemiMonthBegin.is_on_offset GL08" \
-    -i "pandas.tseries.offsets.SemiMonthBegin.kwds SA01" \
     -i "pandas.tseries.offsets.SemiMonthBegin.n GL08" \
-    -i "pandas.tseries.offsets.SemiMonthBegin.name SA01" \
     -i "pandas.tseries.offsets.SemiMonthBegin.nanos GL08" \
     -i "pandas.tseries.offsets.SemiMonthBegin.normalize GL08" \
     -i "pandas.tseries.offsets.SemiMonthBegin.rule_code GL08" \
-    -i "pandas.tseries.offsets.SemiMonthEnd PR02,SA01" \
-    -i "pandas.tseries.offsets.SemiMonthEnd.copy SA01" \
+    -i "pandas.tseries.offsets.SemiMonthEnd SA01" \
     -i "pandas.tseries.offsets.SemiMonthEnd.day_of_month GL08" \
     -i "pandas.tseries.offsets.SemiMonthEnd.freqstr SA01" \
     -i "pandas.tseries.offsets.SemiMonthEnd.is_on_offset GL08" \
-    -i "pandas.tseries.offsets.SemiMonthEnd.kwds SA01" \
     -i "pandas.tseries.offsets.SemiMonthEnd.n GL08" \
-    -i "pandas.tseries.offsets.SemiMonthEnd.name SA01" \
     -i "pandas.tseries.offsets.SemiMonthEnd.nanos GL08" \
     -i "pandas.tseries.offsets.SemiMonthEnd.normalize GL08" \
     -i "pandas.tseries.offsets.SemiMonthEnd.rule_code GL08" \
     -i "pandas.tseries.offsets.Tick GL08" \
-    -i "pandas.tseries.offsets.Tick.copy SA01" \
     -i "pandas.tseries.offsets.Tick.freqstr SA01" \
     -i "pandas.tseries.offsets.Tick.is_on_offset GL08" \
-    -i "pandas.tseries.offsets.Tick.kwds SA01" \
     -i "pandas.tseries.offsets.Tick.n GL08" \
-    -i "pandas.tseries.offsets.Tick.name SA01" \
     -i "pandas.tseries.offsets.Tick.nanos SA01" \
     -i "pandas.tseries.offsets.Tick.normalize GL08" \
     -i "pandas.tseries.offsets.Tick.rule_code GL08" \
     -i "pandas.tseries.offsets.Week PR02" \
-    -i "pandas.tseries.offsets.Week.copy SA01" \
     -i "pandas.tseries.offsets.Week.freqstr SA01" \
     -i
"pandas.tseries.offsets.Week.is_on_offset GL08" \ - -i "pandas.tseries.offsets.Week.kwds SA01" \ -i "pandas.tseries.offsets.Week.n GL08" \ - -i "pandas.tseries.offsets.Week.name SA01" \ -i "pandas.tseries.offsets.Week.nanos GL08" \ -i "pandas.tseries.offsets.Week.normalize GL08" \ -i "pandas.tseries.offsets.Week.rule_code GL08" \ -i "pandas.tseries.offsets.Week.weekday GL08" \ -i "pandas.tseries.offsets.WeekOfMonth PR02,SA01" \ - -i "pandas.tseries.offsets.WeekOfMonth.copy SA01" \ -i "pandas.tseries.offsets.WeekOfMonth.freqstr SA01" \ -i "pandas.tseries.offsets.WeekOfMonth.is_on_offset GL08" \ - -i "pandas.tseries.offsets.WeekOfMonth.kwds SA01" \ -i "pandas.tseries.offsets.WeekOfMonth.n GL08" \ - -i "pandas.tseries.offsets.WeekOfMonth.name SA01" \ -i "pandas.tseries.offsets.WeekOfMonth.nanos GL08" \ -i "pandas.tseries.offsets.WeekOfMonth.normalize GL08" \ -i "pandas.tseries.offsets.WeekOfMonth.rule_code GL08" \ -i "pandas.tseries.offsets.WeekOfMonth.week GL08" \ -i "pandas.tseries.offsets.WeekOfMonth.weekday GL08" \ - -i "pandas.tseries.offsets.YearBegin PR02" \ - -i "pandas.tseries.offsets.YearBegin.copy SA01" \ -i "pandas.tseries.offsets.YearBegin.freqstr SA01" \ -i "pandas.tseries.offsets.YearBegin.is_on_offset GL08" \ - -i "pandas.tseries.offsets.YearBegin.kwds SA01" \ -i "pandas.tseries.offsets.YearBegin.month GL08" \ -i "pandas.tseries.offsets.YearBegin.n GL08" \ - -i "pandas.tseries.offsets.YearBegin.name SA01" \ -i "pandas.tseries.offsets.YearBegin.nanos GL08" \ -i "pandas.tseries.offsets.YearBegin.normalize GL08" \ -i "pandas.tseries.offsets.YearBegin.rule_code GL08" \ - -i "pandas.tseries.offsets.YearEnd PR02" \ - -i "pandas.tseries.offsets.YearEnd.copy SA01" \ -i "pandas.tseries.offsets.YearEnd.freqstr SA01" \ -i "pandas.tseries.offsets.YearEnd.is_on_offset GL08" \ - -i "pandas.tseries.offsets.YearEnd.kwds SA01" \ -i "pandas.tseries.offsets.YearEnd.month GL08" \ -i "pandas.tseries.offsets.YearEnd.n GL08" \ - -i "pandas.tseries.offsets.YearEnd.name SA01" \ -i "pandas.tseries.offsets.YearEnd.nanos GL08" \ -i "pandas.tseries.offsets.YearEnd.normalize GL08" \ -i "pandas.tseries.offsets.YearEnd.rule_code GL08" \ - -i "pandas.unique PR07" \ - -i "pandas.util.hash_array PR07,SA01" \ -i "pandas.util.hash_pandas_object PR07,SA01" # There should be no backslash in the final line, please keep this comment in the last ignored function RET=$(($RET + $?)) ; echo $MSG "DONE" diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-311-arm64.yaml similarity index 98% rename from ci/deps/circle-310-arm64.yaml rename to ci/deps/circle-311-arm64.yaml index ed4d139714e71..1c31d353699f8 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-311-arm64.yaml @@ -2,7 +2,7 @@ name: pandas-dev channels: - conda-forge dependencies: - - python=3.10 + - python=3.11 # build dependencies - versioneer[toml] diff --git a/doc/data/titanic.csv b/doc/data/titanic.csv index 5cc466e97cf12..0f7d184728a17 100644 --- a/doc/data/titanic.csv +++ b/doc/data/titanic.csv @@ -1,93 +1,93 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S 2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38,1,0,PC 17599,71.2833,C85,C -3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S +3,1,3,"Heikkinen, Miss Laina",female,26,0,0,STON/O2. 3101282,7.925,,S 4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S 5,0,3,"Allen, Mr. 
William Henry",male,35,0,0,373450,8.05,,S 6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q 7,0,1,"McCarthy, Mr. Timothy J",male,54,0,0,17463,51.8625,E46,S -8,0,3,"Palsson, Master. Gosta Leonard",male,2,3,1,349909,21.075,,S +8,0,3,"Palsson, Master Gosta Leonard",male,2,3,1,349909,21.075,,S 9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27,0,2,347742,11.1333,,S 10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14,1,0,237736,30.0708,,C -11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4,1,1,PP 9549,16.7,G6,S -12,1,1,"Bonnell, Miss. Elizabeth",female,58,0,0,113783,26.55,C103,S +11,1,3,"Sandstrom, Miss Marguerite Rut",female,4,1,1,PP 9549,16.7,G6,S +12,1,1,"Bonnell, Miss Elizabeth",female,58,0,0,113783,26.55,C103,S 13,0,3,"Saundercock, Mr. William Henry",male,20,0,0,A/5. 2151,8.05,,S 14,0,3,"Andersson, Mr. Anders Johan",male,39,1,5,347082,31.275,,S -15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14,0,0,350406,7.8542,,S +15,0,3,"Vestrom, Miss Hulda Amanda Adolfina",female,14,0,0,350406,7.8542,,S 16,1,2,"Hewlett, Mrs. (Mary D Kingcome) ",female,55,0,0,248706,16,,S -17,0,3,"Rice, Master. Eugene",male,2,4,1,382652,29.125,,Q +17,0,3,"Rice, Master Eugene",male,2,4,1,382652,29.125,,Q 18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13,,S 19,0,3,"Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)",female,31,1,0,345763,18,,S 20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.225,,C 21,0,2,"Fynney, Mr. Joseph J",male,35,0,0,239865,26,,S 22,1,2,"Beesley, Mr. Lawrence",male,34,0,0,248698,13,D56,S -23,1,3,"McGowan, Miss. Anna ""Annie""",female,15,0,0,330923,8.0292,,Q +23,1,3,"McGowan, Miss Anna ""Annie""",female,15,0,0,330923,8.0292,,Q 24,1,1,"Sloper, Mr. William Thompson",male,28,0,0,113788,35.5,A6,S -25,0,3,"Palsson, Miss. Torborg Danira",female,8,3,1,349909,21.075,,S +25,0,3,"Palsson, Miss Torborg Danira",female,8,3,1,349909,21.075,,S 26,1,3,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)",female,38,1,5,347077,31.3875,,S 27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.225,,C 28,0,1,"Fortune, Mr. Charles Alexander",male,19,3,2,19950,263,C23 C25 C27,S -29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q +29,1,3,"O'Dwyer, Miss Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q 30,0,3,"Todoroff, Mr. Lalio",male,,0,0,349216,7.8958,,S 31,0,1,"Uruchurtu, Don. Manuel E",male,40,0,0,PC 17601,27.7208,,C 32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C -33,1,3,"Glynn, Miss. Mary Agatha",female,,0,0,335677,7.75,,Q +33,1,3,"Glynn, Miss Mary Agatha",female,,0,0,335677,7.75,,Q 34,0,2,"Wheadon, Mr. Edward H",male,66,0,0,C.A. 24579,10.5,,S 35,0,1,"Meyer, Mr. Edgar Joseph",male,28,1,0,PC 17604,82.1708,,C 36,0,1,"Holverson, Mr. Alexander Oskar",male,42,1,0,113789,52,,S 37,1,3,"Mamee, Mr. Hanna",male,,0,0,2677,7.2292,,C 38,0,3,"Cann, Mr. Ernest Charles",male,21,0,0,A./5. 2152,8.05,,S -39,0,3,"Vander Planke, Miss. Augusta Maria",female,18,2,0,345764,18,,S -40,1,3,"Nicola-Yarred, Miss. Jamila",female,14,1,0,2651,11.2417,,C +39,0,3,"Vander Planke, Miss Augusta Maria",female,18,2,0,345764,18,,S +40,1,3,"Nicola-Yarred, Miss Jamila",female,14,1,0,2651,11.2417,,C 41,0,3,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",female,40,1,0,7546,9.475,,S 42,0,2,"Turpin, Mrs. William John Robert (Dorothy Ann Wonnacott)",female,27,1,0,11668,21,,S 43,0,3,"Kraeff, Mr. Theodor",male,,0,0,349253,7.8958,,C -44,1,2,"Laroche, Miss. Simonne Marie Anne Andree",female,3,1,2,SC/Paris 2123,41.5792,,C -45,1,3,"Devaney, Miss. 
Margaret Delia",female,19,0,0,330958,7.8792,,Q +44,1,2,"Laroche, Miss Simonne Marie Anne Andree",female,3,1,2,SC/Paris 2123,41.5792,,C +45,1,3,"Devaney, Miss Margaret Delia",female,19,0,0,330958,7.8792,,Q 46,0,3,"Rogers, Mr. William John",male,,0,0,S.C./A.4. 23567,8.05,,S 47,0,3,"Lennon, Mr. Denis",male,,1,0,370371,15.5,,Q -48,1,3,"O'Driscoll, Miss. Bridget",female,,0,0,14311,7.75,,Q +48,1,3,"O'Driscoll, Miss Bridget",female,,0,0,14311,7.75,,Q 49,0,3,"Samaan, Mr. Youssef",male,,2,0,2662,21.6792,,C 50,0,3,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",female,18,1,0,349237,17.8,,S -51,0,3,"Panula, Master. Juha Niilo",male,7,4,1,3101295,39.6875,,S +51,0,3,"Panula, Master Juha Niilo",male,7,4,1,3101295,39.6875,,S 52,0,3,"Nosworthy, Mr. Richard Cater",male,21,0,0,A/4. 39886,7.8,,S 53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49,1,0,PC 17572,76.7292,D33,C 54,1,2,"Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkinson)",female,29,1,0,2926,26,,S 55,0,1,"Ostby, Mr. Engelhart Cornelius",male,65,0,1,113509,61.9792,B30,C 56,1,1,"Woolner, Mr. Hugh",male,,0,0,19947,35.5,C52,S -57,1,2,"Rugg, Miss. Emily",female,21,0,0,C.A. 31026,10.5,,S +57,1,2,"Rugg, Miss Emily",female,21,0,0,C.A. 31026,10.5,,S 58,0,3,"Novel, Mr. Mansouer",male,28.5,0,0,2697,7.2292,,C -59,1,2,"West, Miss. Constance Mirium",female,5,1,2,C.A. 34651,27.75,,S -60,0,3,"Goodwin, Master. William Frederick",male,11,5,2,CA 2144,46.9,,S +59,1,2,"West, Miss Constance Mirium",female,5,1,2,C.A. 34651,27.75,,S +60,0,3,"Goodwin, Master William Frederick",male,11,5,2,CA 2144,46.9,,S 61,0,3,"Sirayanian, Mr. Orsen",male,22,0,0,2669,7.2292,,C -62,1,1,"Icard, Miss. Amelie",female,38,0,0,113572,80,B28, +62,1,1,"Icard, Miss Amelie",female,38,0,0,113572,80,B28, 63,0,1,"Harris, Mr. Henry Birkhardt",male,45,1,0,36973,83.475,C83,S -64,0,3,"Skoog, Master. Harald",male,4,3,2,347088,27.9,,S +64,0,3,"Skoog, Master Harald",male,4,3,2,347088,27.9,,S 65,0,1,"Stewart, Mr. Albert A",male,,0,0,PC 17605,27.7208,,C -66,1,3,"Moubarek, Master. Gerios",male,,1,1,2661,15.2458,,C +66,1,3,"Moubarek, Master Gerios",male,,1,1,2661,15.2458,,C 67,1,2,"Nye, Mrs. (Elizabeth Ramell)",female,29,0,0,C.A. 29395,10.5,F33,S 68,0,3,"Crease, Mr. Ernest James",male,19,0,0,S.P. 3464,8.1583,,S -69,1,3,"Andersson, Miss. Erna Alexandra",female,17,4,2,3101281,7.925,,S +69,1,3,"Andersson, Miss Erna Alexandra",female,17,4,2,3101281,7.925,,S 70,0,3,"Kink, Mr. Vincenz",male,26,2,0,315151,8.6625,,S 71,0,2,"Jenkin, Mr. Stephen Curnow",male,32,0,0,C.A. 33111,10.5,,S -72,0,3,"Goodwin, Miss. Lillian Amy",female,16,5,2,CA 2144,46.9,,S +72,0,3,"Goodwin, Miss Lillian Amy",female,16,5,2,CA 2144,46.9,,S 73,0,2,"Hood, Mr. Ambrose Jr",male,21,0,0,S.O.C. 14879,73.5,,S 74,0,3,"Chronopoulos, Mr. Apostolos",male,26,1,0,2680,14.4542,,C 75,1,3,"Bing, Mr. Lee",male,32,0,0,1601,56.4958,,S 76,0,3,"Moen, Mr. Sigurd Hansen",male,25,0,0,348123,7.65,F G73,S 77,0,3,"Staneff, Mr. Ivan",male,,0,0,349208,7.8958,,S 78,0,3,"Moutal, Mr. Rahamin Haim",male,,0,0,374746,8.05,,S -79,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29,,S -80,1,3,"Dowdell, Miss. Elizabeth",female,30,0,0,364516,12.475,,S +79,1,2,"Caldwell, Master Alden Gates",male,0.83,0,2,248738,29,,S +80,1,3,"Dowdell, Miss Elizabeth",female,30,0,0,364516,12.475,,S 81,0,3,"Waelens, Mr. Achille",male,22,0,0,345767,9,,S 82,1,3,"Sheerlinck, Mr. Jan Baptist",male,29,0,0,345779,9.5,,S -83,1,3,"McDermott, Miss. Brigdet Delia",female,,0,0,330932,7.7875,,Q +83,1,3,"McDermott, Miss Brigdet Delia",female,,0,0,330932,7.7875,,Q 84,0,1,"Carrau, Mr. 
Francisco M",male,28,0,0,113059,47.1,,S -85,1,2,"Ilett, Miss. Bertha",female,17,0,0,SO/C 14885,10.5,,S +85,1,2,"Ilett, Miss Bertha",female,17,0,0,SO/C 14885,10.5,,S 86,1,3,"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gustafsson)",female,33,3,0,3101278,15.85,,S 87,0,3,"Ford, Mr. William Neal",male,16,1,3,W./C. 6608,34.375,,S 88,0,3,"Slocovski, Mr. Selman Francis",male,,0,0,SOTON/OQ 392086,8.05,,S -89,1,1,"Fortune, Miss. Mabel Helen",female,23,3,2,19950,263,C23 C25 C27,S +89,1,1,"Fortune, Miss Mabel Helen",female,23,3,2,19950,263,C23 C25 C27,S 90,0,3,"Celotti, Mr. Francesco",male,24,0,0,343275,8.05,,S 91,0,3,"Christmann, Mr. Emil",male,29,0,0,343276,8.05,,S 92,0,3,"Andreasson, Mr. Paul Edvin",male,20,0,0,347466,7.8542,,S @@ -99,35 +99,35 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 98,1,1,"Greenfield, Mr. William Bertram",male,23,0,1,PC 17759,63.3583,D10 D12,C 99,1,2,"Doling, Mrs. John T (Ada Julia Bone)",female,34,0,1,231919,23,,S 100,0,2,"Kantor, Mr. Sinai",male,34,1,0,244367,26,,S -101,0,3,"Petranec, Miss. Matilda",female,28,0,0,349245,7.8958,,S +101,0,3,"Petranec, Miss Matilda",female,28,0,0,349245,7.8958,,S 102,0,3,"Petroff, Mr. Pastcho (""Pentcho"")",male,,0,0,349215,7.8958,,S 103,0,1,"White, Mr. Richard Frasar",male,21,0,1,35281,77.2875,D26,S 104,0,3,"Johansson, Mr. Gustaf Joel",male,33,0,0,7540,8.6542,,S 105,0,3,"Gustafsson, Mr. Anders Vilhelm",male,37,2,0,3101276,7.925,,S 106,0,3,"Mionoff, Mr. Stoytcho",male,28,0,0,349207,7.8958,,S -107,1,3,"Salkjelsvik, Miss. Anna Kristine",female,21,0,0,343120,7.65,,S +107,1,3,"Salkjelsvik, Miss Anna Kristine",female,21,0,0,343120,7.65,,S 108,1,3,"Moss, Mr. Albert Johan",male,,0,0,312991,7.775,,S 109,0,3,"Rekic, Mr. Tido",male,38,0,0,349249,7.8958,,S -110,1,3,"Moran, Miss. Bertha",female,,1,0,371110,24.15,,Q +110,1,3,"Moran, Miss Bertha",female,,1,0,371110,24.15,,Q 111,0,1,"Porter, Mr. Walter Chamberlain",male,47,0,0,110465,52,C110,S -112,0,3,"Zabour, Miss. Hileni",female,14.5,1,0,2665,14.4542,,C +112,0,3,"Zabour, Miss Hileni",female,14.5,1,0,2665,14.4542,,C 113,0,3,"Barton, Mr. David John",male,22,0,0,324669,8.05,,S -114,0,3,"Jussila, Miss. Katriina",female,20,1,0,4136,9.825,,S -115,0,3,"Attalah, Miss. Malake",female,17,0,0,2627,14.4583,,C +114,0,3,"Jussila, Miss Katriina",female,20,1,0,4136,9.825,,S +115,0,3,"Attalah, Miss Malake",female,17,0,0,2627,14.4583,,C 116,0,3,"Pekoniemi, Mr. Edvard",male,21,0,0,STON/O 2. 3101294,7.925,,S 117,0,3,"Connors, Mr. Patrick",male,70.5,0,0,370369,7.75,,Q 118,0,2,"Turpin, Mr. William John Robert",male,29,1,0,11668,21,,S 119,0,1,"Baxter, Mr. Quigg Edmond",male,24,0,1,PC 17558,247.5208,B58 B60,C -120,0,3,"Andersson, Miss. Ellis Anna Maria",female,2,4,2,347082,31.275,,S +120,0,3,"Andersson, Miss Ellis Anna Maria",female,2,4,2,347082,31.275,,S 121,0,2,"Hickman, Mr. Stanley George",male,21,2,0,S.O.C. 14879,73.5,,S 122,0,3,"Moore, Mr. Leonard Charles",male,,0,0,A4. 54510,8.05,,S 123,0,2,"Nasser, Mr. Nicholas",male,32.5,1,0,237736,30.0708,,C -124,1,2,"Webber, Miss. Susan",female,32.5,0,0,27267,13,E101,S +124,1,2,"Webber, Miss Susan",female,32.5,0,0,27267,13,E101,S 125,0,1,"White, Mr. Percival Wayland",male,54,0,1,35281,77.2875,D26,S -126,1,3,"Nicola-Yarred, Master. Elias",male,12,1,0,2651,11.2417,,C +126,1,3,"Nicola-Yarred, Master Elias",male,12,1,0,2651,11.2417,,C 127,0,3,"McMahon, Mr. Martin",male,,0,0,370372,7.75,,Q 128,1,3,"Madsen, Mr. Fridtjof Arne",male,24,0,0,C 17369,7.1417,,S -129,1,3,"Peter, Miss. 
Anna",female,,1,1,2668,22.3583,F E69,C +129,1,3,"Peter, Miss Anna",female,,1,1,2668,22.3583,F E69,C 130,0,3,"Ekstrom, Mr. Johan",male,45,0,0,347061,6.975,,S 131,0,3,"Drazenoic, Mr. Jozef",male,33,0,0,349241,7.8958,,C 132,0,3,"Coelho, Mr. Domingos Fernandeo",male,20,0,0,SOTON/O.Q. 3101307,7.05,,S @@ -135,18 +135,18 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 134,1,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29,1,0,228414,26,,S 135,0,2,"Sobey, Mr. Samuel James Hayden",male,25,0,0,C.A. 29178,13,,S 136,0,2,"Richard, Mr. Emile",male,23,0,0,SC/PARIS 2133,15.0458,,C -137,1,1,"Newsom, Miss. Helen Monypeny",female,19,0,2,11752,26.2833,D47,S +137,1,1,"Newsom, Miss Helen Monypeny",female,19,0,2,11752,26.2833,D47,S 138,0,1,"Futrelle, Mr. Jacques Heath",male,37,1,0,113803,53.1,C123,S 139,0,3,"Osen, Mr. Olaf Elon",male,16,0,0,7534,9.2167,,S 140,0,1,"Giglio, Mr. Victor",male,24,0,0,PC 17593,79.2,B86,C 141,0,3,"Boulos, Mrs. Joseph (Sultana)",female,,0,2,2678,15.2458,,C -142,1,3,"Nysten, Miss. Anna Sofia",female,22,0,0,347081,7.75,,S +142,1,3,"Nysten, Miss Anna Sofia",female,22,0,0,347081,7.75,,S 143,1,3,"Hakkarainen, Mrs. Pekka Pietari (Elin Matilda Dolck)",female,24,1,0,STON/O2. 3101279,15.85,,S 144,0,3,"Burke, Mr. Jeremiah",male,19,0,0,365222,6.75,,Q 145,0,2,"Andrew, Mr. Edgardo Samuel",male,18,0,0,231945,11.5,,S 146,0,2,"Nicholls, Mr. Joseph Charles",male,19,1,1,C.A. 33112,36.75,,S 147,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27,0,0,350043,7.7958,,S -148,0,3,"Ford, Miss. Robina Maggie ""Ruby""",female,9,2,2,W./C. 6608,34.375,,S +148,0,3,"Ford, Miss Robina Maggie ""Ruby""",female,9,2,2,W./C. 6608,34.375,,S 149,0,2,"Navratil, Mr. Michel (""Louis M Hoffman"")",male,36.5,0,2,230080,26,F2,S 150,0,2,"Byles, Rev. Thomas Roussel Davids",male,42,0,0,244310,13,,S 151,0,2,"Bateman, Rev. Robert James",male,51,0,0,S.O.P. 1166,12.525,,S @@ -155,35 +155,35 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 154,0,3,"van Billiard, Mr. Austin Blyler",male,40.5,0,2,A/5. 851,14.5,,S 155,0,3,"Olsen, Mr. Ole Martin",male,,0,0,Fa 265302,7.3125,,S 156,0,1,"Williams, Mr. Charles Duane",male,51,0,1,PC 17597,61.3792,,C -157,1,3,"Gilnagh, Miss. Katherine ""Katie""",female,16,0,0,35851,7.7333,,Q +157,1,3,"Gilnagh, Miss Katherine ""Katie""",female,16,0,0,35851,7.7333,,Q 158,0,3,"Corn, Mr. Harry",male,30,0,0,SOTON/OQ 392090,8.05,,S 159,0,3,"Smiljanic, Mr. Mile",male,,0,0,315037,8.6625,,S -160,0,3,"Sage, Master. Thomas Henry",male,,8,2,CA. 2343,69.55,,S +160,0,3,"Sage, Master Thomas Henry",male,,8,2,CA. 2343,69.55,,S 161,0,3,"Cribb, Mr. John Hatfield",male,44,0,1,371362,16.1,,S 162,1,2,"Watt, Mrs. James (Elizabeth ""Bessie"" Inglis Milne)",female,40,0,0,C.A. 33595,15.75,,S 163,0,3,"Bengtsson, Mr. John Viktor",male,26,0,0,347068,7.775,,S 164,0,3,"Calic, Mr. Jovo",male,17,0,0,315093,8.6625,,S -165,0,3,"Panula, Master. Eino Viljami",male,1,4,1,3101295,39.6875,,S -166,1,3,"Goldsmith, Master. Frank John William ""Frankie""",male,9,0,2,363291,20.525,,S +165,0,3,"Panula, Master Eino Viljami",male,1,4,1,3101295,39.6875,,S +166,1,3,"Goldsmith, Master Frank John William ""Frankie""",male,9,0,2,363291,20.525,,S 167,1,1,"Chibnall, Mrs. (Edith Martha Bowerman)",female,,0,1,113505,55,E33,S 168,0,3,"Skoog, Mrs. William (Anna Bernhardina Karlsson)",female,45,1,4,347088,27.9,,S 169,0,1,"Baumann, Mr. John D",male,,0,0,PC 17318,25.925,,S 170,0,3,"Ling, Mr. Lee",male,28,0,0,1601,56.4958,,S 171,0,1,"Van der hoef, Mr. 
Wyckoff",male,61,0,0,111240,33.5,B19,S -172,0,3,"Rice, Master. Arthur",male,4,4,1,382652,29.125,,Q -173,1,3,"Johnson, Miss. Eleanor Ileen",female,1,1,1,347742,11.1333,,S +172,0,3,"Rice, Master Arthur",male,4,4,1,382652,29.125,,Q +173,1,3,"Johnson, Miss Eleanor Ileen",female,1,1,1,347742,11.1333,,S 174,0,3,"Sivola, Mr. Antti Wilhelm",male,21,0,0,STON/O 2. 3101280,7.925,,S 175,0,1,"Smith, Mr. James Clinch",male,56,0,0,17764,30.6958,A7,C 176,0,3,"Klasen, Mr. Klas Albin",male,18,1,1,350404,7.8542,,S -177,0,3,"Lefebre, Master. Henry Forbes",male,,3,1,4133,25.4667,,S -178,0,1,"Isham, Miss. Ann Elizabeth",female,50,0,0,PC 17595,28.7125,C49,C +177,0,3,"Lefebre, Master Henry Forbes",male,,3,1,4133,25.4667,,S +178,0,1,"Isham, Miss Ann Elizabeth",female,50,0,0,PC 17595,28.7125,C49,C 179,0,2,"Hale, Mr. Reginald",male,30,0,0,250653,13,,S 180,0,3,"Leonard, Mr. Lionel",male,36,0,0,LINE,0,,S -181,0,3,"Sage, Miss. Constance Gladys",female,,8,2,CA. 2343,69.55,,S +181,0,3,"Sage, Miss Constance Gladys",female,,8,2,CA. 2343,69.55,,S 182,0,2,"Pernot, Mr. Rene",male,,0,0,SC/PARIS 2131,15.05,,C -183,0,3,"Asplund, Master. Clarence Gustaf Hugo",male,9,4,2,347077,31.3875,,S -184,1,2,"Becker, Master. Richard F",male,1,2,1,230136,39,F4,S -185,1,3,"Kink-Heilmann, Miss. Luise Gretchen",female,4,0,2,315153,22.025,,S +183,0,3,"Asplund, Master Clarence Gustaf Hugo",male,9,4,2,347077,31.3875,,S +184,1,2,"Becker, Master Richard F",male,1,2,1,230136,39,F4,S +185,1,3,"Kink-Heilmann, Miss Luise Gretchen",female,4,0,2,315153,22.025,,S 186,0,1,"Rood, Mr. Hugh Roscoe",male,,0,0,113767,50,A32,S 187,1,3,"O'Brien, Mrs. Thomas (Johanna ""Hannah"" Godfrey)",female,,1,0,370365,15.5,,Q 188,1,1,"Romaine, Mr. Charles Hallace (""Mr C Rolmane"")",male,45,0,0,111428,26.55,,S @@ -191,33 +191,33 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 190,0,3,"Turcin, Mr. Stjepan",male,36,0,0,349247,7.8958,,S 191,1,2,"Pinsky, Mrs. (Rosa)",female,32,0,0,234604,13,,S 192,0,2,"Carbines, Mr. William",male,19,0,0,28424,13,,S -193,1,3,"Andersen-Jensen, Miss. Carla Christine Nielsine",female,19,1,0,350046,7.8542,,S -194,1,2,"Navratil, Master. Michel M",male,3,1,1,230080,26,F2,S +193,1,3,"Andersen-Jensen, Miss Carla Christine Nielsine",female,19,1,0,350046,7.8542,,S +194,1,2,"Navratil, Master Michel M",male,3,1,1,230080,26,F2,S 195,1,1,"Brown, Mrs. James Joseph (Margaret Tobin)",female,44,0,0,PC 17610,27.7208,B4,C -196,1,1,"Lurette, Miss. Elise",female,58,0,0,PC 17569,146.5208,B80,C +196,1,1,"Lurette, Miss Elise",female,58,0,0,PC 17569,146.5208,B80,C 197,0,3,"Mernagh, Mr. Robert",male,,0,0,368703,7.75,,Q 198,0,3,"Olsen, Mr. Karl Siegwart Andreas",male,42,0,1,4579,8.4042,,S -199,1,3,"Madigan, Miss. Margaret ""Maggie""",female,,0,0,370370,7.75,,Q -200,0,2,"Yrois, Miss. Henriette (""Mrs Harbeck"")",female,24,0,0,248747,13,,S +199,1,3,"Madigan, Miss Margaret ""Maggie""",female,,0,0,370370,7.75,,Q +200,0,2,"Yrois, Miss Henriette (""Mrs Harbeck"")",female,24,0,0,248747,13,,S 201,0,3,"Vande Walle, Mr. Nestor Cyriel",male,28,0,0,345770,9.5,,S 202,0,3,"Sage, Mr. Frederick",male,,8,2,CA. 2343,69.55,,S 203,0,3,"Johanson, Mr. Jakob Alfred",male,34,0,0,3101264,6.4958,,S 204,0,3,"Youseff, Mr. Gerious",male,45.5,0,0,2628,7.225,,C 205,1,3,"Cohen, Mr. Gurshon ""Gus""",male,18,0,0,A/5 3540,8.05,,S -206,0,3,"Strom, Miss. Telma Matilda",female,2,0,1,347054,10.4625,G6,S +206,0,3,"Strom, Miss Telma Matilda",female,2,0,1,347054,10.4625,G6,S 207,0,3,"Backstrom, Mr. Karl Alfred",male,32,1,0,3101278,15.85,,S 208,1,3,"Albimona, Mr. 
Nassef Cassem",male,26,0,0,2699,18.7875,,C -209,1,3,"Carr, Miss. Helen ""Ellen""",female,16,0,0,367231,7.75,,Q +209,1,3,"Carr, Miss Helen ""Ellen""",female,16,0,0,367231,7.75,,Q 210,1,1,"Blank, Mr. Henry",male,40,0,0,112277,31,A31,C 211,0,3,"Ali, Mr. Ahmed",male,24,0,0,SOTON/O.Q. 3101311,7.05,,S -212,1,2,"Cameron, Miss. Clear Annie",female,35,0,0,F.C.C. 13528,21,,S +212,1,2,"Cameron, Miss Clear Annie",female,35,0,0,F.C.C. 13528,21,,S 213,0,3,"Perkin, Mr. John Henry",male,22,0,0,A/5 21174,7.25,,S 214,0,2,"Givard, Mr. Hans Kristensen",male,30,0,0,250646,13,,S 215,0,3,"Kiernan, Mr. Philip",male,,1,0,367229,7.75,,Q -216,1,1,"Newell, Miss. Madeleine",female,31,1,0,35273,113.275,D36,C -217,1,3,"Honkanen, Miss. Eliina",female,27,0,0,STON/O2. 3101283,7.925,,S +216,1,1,"Newell, Miss Madeleine",female,31,1,0,35273,113.275,D36,C +217,1,3,"Honkanen, Miss Eliina",female,27,0,0,STON/O2. 3101283,7.925,,S 218,0,2,"Jacobsohn, Mr. Sidney Samuel",male,42,1,0,243847,27,,S -219,1,1,"Bazzani, Miss. Albina",female,32,0,0,11813,76.2917,D15,C +219,1,1,"Bazzani, Miss Albina",female,32,0,0,11813,76.2917,D15,C 220,0,2,"Harris, Mr. Walter",male,30,0,0,W/C 14208,10.5,,S 221,1,3,"Sunderland, Mr. Victor Francis",male,16,0,0,SOTON/OQ 392089,8.05,,S 222,0,2,"Bracken, Mr. James H",male,27,0,0,220367,13,,S @@ -228,24 +228,24 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 227,1,2,"Mellors, Mr. William John",male,19,0,0,SW/PP 751,10.5,,S 228,0,3,"Lovell, Mr. John Hall (""Henry"")",male,20.5,0,0,A/5 21173,7.25,,S 229,0,2,"Fahlstrom, Mr. Arne Jonas",male,18,0,0,236171,13,,S -230,0,3,"Lefebre, Miss. Mathilde",female,,3,1,4133,25.4667,,S +230,0,3,"Lefebre, Miss Mathilde",female,,3,1,4133,25.4667,,S 231,1,1,"Harris, Mrs. Henry Birkhardt (Irene Wallach)",female,35,1,0,36973,83.475,C83,S 232,0,3,"Larsson, Mr. Bengt Edvin",male,29,0,0,347067,7.775,,S 233,0,2,"Sjostedt, Mr. Ernst Adolf",male,59,0,0,237442,13.5,,S -234,1,3,"Asplund, Miss. Lillian Gertrud",female,5,4,2,347077,31.3875,,S +234,1,3,"Asplund, Miss Lillian Gertrud",female,5,4,2,347077,31.3875,,S 235,0,2,"Leyson, Mr. Robert William Norman",male,24,0,0,C.A. 29566,10.5,,S -236,0,3,"Harknett, Miss. Alice Phoebe",female,,0,0,W./C. 6609,7.55,,S +236,0,3,"Harknett, Miss Alice Phoebe",female,,0,0,W./C. 6609,7.55,,S 237,0,2,"Hold, Mr. Stephen",male,44,1,0,26707,26,,S -238,1,2,"Collyer, Miss. Marjorie ""Lottie""",female,8,0,2,C.A. 31921,26.25,,S +238,1,2,"Collyer, Miss Marjorie ""Lottie""",female,8,0,2,C.A. 31921,26.25,,S 239,0,2,"Pengelly, Mr. Frederick William",male,19,0,0,28665,10.5,,S 240,0,2,"Hunt, Mr. George Henry",male,33,0,0,SCO/W 1585,12.275,,S -241,0,3,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C -242,1,3,"Murphy, Miss. Katherine ""Kate""",female,,1,0,367230,15.5,,Q +241,0,3,"Zabour, Miss Thamine",female,,1,0,2665,14.4542,,C +242,1,3,"Murphy, Miss Katherine ""Kate""",female,,1,0,367230,15.5,,Q 243,0,2,"Coleridge, Mr. Reginald Charles",male,29,0,0,W./C. 14263,10.5,,S 244,0,3,"Maenpaa, Mr. Matti Alexanteri",male,22,0,0,STON/O 2. 3101275,7.125,,S 245,0,3,"Attalah, Mr. Sleiman",male,30,0,0,2694,7.225,,C 246,0,1,"Minahan, Dr. William Edward",male,44,2,0,19928,90,C78,Q -247,0,3,"Lindahl, Miss. Agda Thorilda Viktoria",female,25,0,0,347071,7.775,,S +247,0,3,"Lindahl, Miss Agda Thorilda Viktoria",female,25,0,0,347071,7.775,,S 248,1,2,"Hamalainen, Mrs. William (Anna)",female,24,0,2,250649,14.5,,S 249,1,1,"Beckwith, Mr. Richard Leonard",male,37,1,1,11751,52.5542,D35,S 250,0,2,"Carter, Rev. 
Ernest Courtenay",male,54,1,0,244252,26,,S @@ -256,28 +256,28 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 255,0,3,"Rosblom, Mrs. Viktor (Helena Wilhelmina)",female,41,0,2,370129,20.2125,,S 256,1,3,"Touma, Mrs. Darwis (Hanne Youssef Razi)",female,29,0,2,2650,15.2458,,C 257,1,1,"Thorne, Mrs. Gertrude Maybelle",female,,0,0,PC 17585,79.2,,C -258,1,1,"Cherry, Miss. Gladys",female,30,0,0,110152,86.5,B77,S -259,1,1,"Ward, Miss. Anna",female,35,0,0,PC 17755,512.3292,,C +258,1,1,"Cherry, Miss Gladys",female,30,0,0,110152,86.5,B77,S +259,1,1,"Ward, Miss Anna",female,35,0,0,PC 17755,512.3292,,C 260,1,2,"Parrish, Mrs. (Lutie Davis)",female,50,0,1,230433,26,,S 261,0,3,"Smith, Mr. Thomas",male,,0,0,384461,7.75,,Q -262,1,3,"Asplund, Master. Edvin Rojj Felix",male,3,4,2,347077,31.3875,,S +262,1,3,"Asplund, Master Edvin Rojj Felix",male,3,4,2,347077,31.3875,,S 263,0,1,"Taussig, Mr. Emil",male,52,1,1,110413,79.65,E67,S 264,0,1,"Harrison, Mr. William",male,40,0,0,112059,0,B94,S -265,0,3,"Henry, Miss. Delia",female,,0,0,382649,7.75,,Q +265,0,3,"Henry, Miss Delia",female,,0,0,382649,7.75,,Q 266,0,2,"Reeves, Mr. David",male,36,0,0,C.A. 17248,10.5,,S 267,0,3,"Panula, Mr. Ernesti Arvid",male,16,4,1,3101295,39.6875,,S 268,1,3,"Persson, Mr. Ernst Ulrik",male,25,1,0,347083,7.775,,S 269,1,1,"Graham, Mrs. William Thompson (Edith Junkins)",female,58,0,1,PC 17582,153.4625,C125,S -270,1,1,"Bissette, Miss. Amelia",female,35,0,0,PC 17760,135.6333,C99,S +270,1,1,"Bissette, Miss Amelia",female,35,0,0,PC 17760,135.6333,C99,S 271,0,1,"Cairns, Mr. Alexander",male,,0,0,113798,31,,S 272,1,3,"Tornquist, Mr. William Henry",male,25,0,0,LINE,0,,S 273,1,2,"Mellinger, Mrs. (Elizabeth Anne Maidment)",female,41,0,1,250644,19.5,,S 274,0,1,"Natsch, Mr. Charles H",male,37,0,1,PC 17596,29.7,C118,C -275,1,3,"Healy, Miss. Hanora ""Nora""",female,,0,0,370375,7.75,,Q -276,1,1,"Andrews, Miss. Kornelia Theodosia",female,63,1,0,13502,77.9583,D7,S -277,0,3,"Lindblom, Miss. Augusta Charlotta",female,45,0,0,347073,7.75,,S +275,1,3,"Healy, Miss Hanora ""Nora""",female,,0,0,370375,7.75,,Q +276,1,1,"Andrews, Miss Kornelia Theodosia",female,63,1,0,13502,77.9583,D7,S +277,0,3,"Lindblom, Miss Augusta Charlotta",female,45,0,0,347073,7.75,,S 278,0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,239853,0,,S -279,0,3,"Rice, Master. Eric",male,7,4,1,382652,29.125,,Q +279,0,3,"Rice, Master Eric",male,7,4,1,382652,29.125,,Q 280,1,3,"Abbott, Mrs. Stanton (Rosa Hunt)",female,35,1,1,C.A. 2673,20.25,,S 281,0,3,"Duane, Mr. Frank",male,65,0,0,336439,7.75,,Q 282,0,3,"Olsson, Mr. Nils Johan Goransson",male,28,0,0,347464,7.8542,,S @@ -288,66 +288,66 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 287,1,3,"de Mulder, Mr. Theodore",male,30,0,0,345774,9.5,,S 288,0,3,"Naidenoff, Mr. Penko",male,22,0,0,349206,7.8958,,S 289,1,2,"Hosono, Mr. Masabumi",male,42,0,0,237798,13,,S -290,1,3,"Connolly, Miss. Kate",female,22,0,0,370373,7.75,,Q -291,1,1,"Barber, Miss. Ellen ""Nellie""",female,26,0,0,19877,78.85,,S +290,1,3,"Connolly, Miss Kate",female,22,0,0,370373,7.75,,Q +291,1,1,"Barber, Miss Ellen ""Nellie""",female,26,0,0,19877,78.85,,S 292,1,1,"Bishop, Mrs. Dickinson H (Helen Walton)",female,19,1,0,11967,91.0792,B49,C 293,0,2,"Levy, Mr. Rene Jacques",male,36,0,0,SC/Paris 2163,12.875,D,C -294,0,3,"Haas, Miss. Aloisia",female,24,0,0,349236,8.85,,S +294,0,3,"Haas, Miss Aloisia",female,24,0,0,349236,8.85,,S 295,0,3,"Mineff, Mr. Ivan",male,24,0,0,349233,7.8958,,S 296,0,1,"Lewy, Mr. 
Ervin G",male,,0,0,PC 17612,27.7208,,C 297,0,3,"Hanna, Mr. Mansour",male,23.5,0,0,2693,7.2292,,C -298,0,1,"Allison, Miss. Helen Loraine",female,2,1,2,113781,151.55,C22 C26,S +298,0,1,"Allison, Miss Helen Loraine",female,2,1,2,113781,151.55,C22 C26,S 299,1,1,"Saalfeld, Mr. Adolphe",male,,0,0,19988,30.5,C106,S 300,1,1,"Baxter, Mrs. James (Helene DeLaudeniere Chaput)",female,50,0,1,PC 17558,247.5208,B58 B60,C -301,1,3,"Kelly, Miss. Anna Katherine ""Annie Kate""",female,,0,0,9234,7.75,,Q +301,1,3,"Kelly, Miss Anna Katherine ""Annie Kate""",female,,0,0,9234,7.75,,Q 302,1,3,"McCoy, Mr. Bernard",male,,2,0,367226,23.25,,Q 303,0,3,"Johnson, Mr. William Cahoone Jr",male,19,0,0,LINE,0,,S -304,1,2,"Keane, Miss. Nora A",female,,0,0,226593,12.35,E101,Q +304,1,2,"Keane, Miss Nora A",female,,0,0,226593,12.35,E101,Q 305,0,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.05,,S -306,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S -307,1,1,"Fleming, Miss. Margaret",female,,0,0,17421,110.8833,,C +306,1,1,"Allison, Master Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S +307,1,1,"Fleming, Miss Margaret",female,,0,0,17421,110.8833,,C 308,1,1,"Penasco y Castellana, Mrs. Victor de Satode (Maria Josefa Perez de Soto y Vallejo)",female,17,1,0,PC 17758,108.9,C65,C 309,0,2,"Abelson, Mr. Samuel",male,30,1,0,P/PP 3381,24,,C -310,1,1,"Francatelli, Miss. Laura Mabel",female,30,0,0,PC 17485,56.9292,E36,C -311,1,1,"Hays, Miss. Margaret Bechstein",female,24,0,0,11767,83.1583,C54,C -312,1,1,"Ryerson, Miss. Emily Borie",female,18,2,2,PC 17608,262.375,B57 B59 B63 B66,C +310,1,1,"Francatelli, Miss Laura Mabel",female,30,0,0,PC 17485,56.9292,E36,C +311,1,1,"Hays, Miss Margaret Bechstein",female,24,0,0,11767,83.1583,C54,C +312,1,1,"Ryerson, Miss Emily Borie",female,18,2,2,PC 17608,262.375,B57 B59 B63 B66,C 313,0,2,"Lahtinen, Mrs. William (Anna Sylfven)",female,26,1,1,250651,26,,S 314,0,3,"Hendekovic, Mr. Ignjac",male,28,0,0,349243,7.8958,,S 315,0,2,"Hart, Mr. Benjamin",male,43,1,1,F.C.C. 13529,26.25,,S -316,1,3,"Nilsson, Miss. Helmina Josefina",female,26,0,0,347470,7.8542,,S +316,1,3,"Nilsson, Miss Helmina Josefina",female,26,0,0,347470,7.8542,,S 317,1,2,"Kantor, Mrs. Sinai (Miriam Sternin)",female,24,1,0,244367,26,,S 318,0,2,"Moraweck, Dr. Ernest",male,54,0,0,29011,14,,S -319,1,1,"Wick, Miss. Mary Natalie",female,31,0,2,36928,164.8667,C7,S +319,1,1,"Wick, Miss Mary Natalie",female,31,0,2,36928,164.8667,C7,S 320,1,1,"Spedden, Mrs. Frederic Oakley (Margaretta Corning Stone)",female,40,1,1,16966,134.5,E34,C 321,0,3,"Dennis, Mr. Samuel",male,22,0,0,A/5 21172,7.25,,S 322,0,3,"Danoff, Mr. Yoto",male,27,0,0,349219,7.8958,,S -323,1,2,"Slayter, Miss. Hilda Mary",female,30,0,0,234818,12.35,,Q +323,1,2,"Slayter, Miss Hilda Mary",female,30,0,0,234818,12.35,,Q 324,1,2,"Caldwell, Mrs. Albert Francis (Sylvia Mae Harbaugh)",female,22,1,1,248738,29,,S 325,0,3,"Sage, Mr. George John Jr",male,,8,2,CA. 2343,69.55,,S -326,1,1,"Young, Miss. Marie Grice",female,36,0,0,PC 17760,135.6333,C32,C +326,1,1,"Young, Miss Marie Grice",female,36,0,0,PC 17760,135.6333,C32,C 327,0,3,"Nysveen, Mr. Johan Hansen",male,61,0,0,345364,6.2375,,S 328,1,2,"Ball, Mrs. (Ada E Hall)",female,36,0,0,28551,13,D,S 329,1,3,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",female,31,1,1,363291,20.525,,S -330,1,1,"Hippach, Miss. Jean Gertrude",female,16,0,1,111361,57.9792,B18,C -331,1,3,"McCoy, Miss. 
Agnes",female,,2,0,367226,23.25,,Q +330,1,1,"Hippach, Miss Jean Gertrude",female,16,0,1,111361,57.9792,B18,C +331,1,3,"McCoy, Miss Agnes",female,,2,0,367226,23.25,,Q 332,0,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5,C124,S 333,0,1,"Graham, Mr. George Edward",male,38,0,1,PC 17582,153.4625,C91,S 334,0,3,"Vander Planke, Mr. Leo Edmondus",male,16,2,0,345764,18,,S 335,1,1,"Frauenthal, Mrs. Henry William (Clara Heinsheimer)",female,,1,0,PC 17611,133.65,,S 336,0,3,"Denkoff, Mr. Mitto",male,,0,0,349225,7.8958,,S 337,0,1,"Pears, Mr. Thomas Clinton",male,29,1,0,113776,66.6,C2,S -338,1,1,"Burns, Miss. Elizabeth Margaret",female,41,0,0,16966,134.5,E40,C +338,1,1,"Burns, Miss Elizabeth Margaret",female,41,0,0,16966,134.5,E40,C 339,1,3,"Dahl, Mr. Karl Edwart",male,45,0,0,7598,8.05,,S 340,0,1,"Blackwell, Mr. Stephen Weart",male,45,0,0,113784,35.5,T,S -341,1,2,"Navratil, Master. Edmond Roger",male,2,1,1,230080,26,F2,S -342,1,1,"Fortune, Miss. Alice Elizabeth",female,24,3,2,19950,263,C23 C25 C27,S +341,1,2,"Navratil, Master Edmond Roger",male,2,1,1,230080,26,F2,S +342,1,1,"Fortune, Miss Alice Elizabeth",female,24,3,2,19950,263,C23 C25 C27,S 343,0,2,"Collander, Mr. Erik Gustaf",male,28,0,0,248740,13,,S 344,0,2,"Sedgwick, Mr. Charles Frederick Waddington",male,25,0,0,244361,13,,S 345,0,2,"Fox, Mr. Stanley Hubert",male,36,0,0,229236,13,,S -346,1,2,"Brown, Miss. Amelia ""Mildred""",female,24,0,0,248733,13,F33,S -347,1,2,"Smith, Miss. Marion Elsie",female,40,0,0,31418,13,,S +346,1,2,"Brown, Miss Amelia ""Mildred""",female,24,0,0,248733,13,F33,S +347,1,2,"Smith, Miss Marion Elsie",female,40,0,0,31418,13,,S 348,1,3,"Davison, Mrs. Thomas Henry (Mary E Finck)",female,,1,0,386525,16.1,,S -349,1,3,"Coutts, Master. William Loch ""William""",male,3,1,1,C.A. 37671,15.9,,S +349,1,3,"Coutts, Master William Loch ""William""",male,3,1,1,C.A. 37671,15.9,,S 350,0,3,"Dimic, Mr. Jovan",male,42,0,0,315088,8.6625,,S 351,0,3,"Odahl, Mr. Nils Martin",male,23,0,0,7267,9.225,,S 352,0,1,"Williams-Lambert, Mr. Fletcher Fellows",male,,0,0,113510,35,C128,S @@ -355,10 +355,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 354,0,3,"Arnold-Franchi, Mr. Josef",male,25,1,0,349237,17.8,,S 355,0,3,"Yousif, Mr. Wazli",male,,0,0,2647,7.225,,C 356,0,3,"Vanden Steen, Mr. Leo Peter",male,28,0,0,345783,9.5,,S -357,1,1,"Bowerman, Miss. Elsie Edith",female,22,0,1,113505,55,E33,S -358,0,2,"Funk, Miss. Annie Clemmer",female,38,0,0,237671,13,,S -359,1,3,"McGovern, Miss. Mary",female,,0,0,330931,7.8792,,Q -360,1,3,"Mockler, Miss. Helen Mary ""Ellie""",female,,0,0,330980,7.8792,,Q +357,1,1,"Bowerman, Miss Elsie Edith",female,22,0,1,113505,55,E33,S +358,0,2,"Funk, Miss Annie Clemmer",female,38,0,0,237671,13,,S +359,1,3,"McGovern, Miss Mary",female,,0,0,330931,7.8792,,Q +360,1,3,"Mockler, Miss Helen Mary ""Ellie""",female,,0,0,330980,7.8792,,Q 361,0,3,"Skoog, Mr. Wilhelm",male,40,1,4,347088,27.9,,S 362,0,2,"del Carlo, Mr. Sebastiano",male,29,1,0,SC/PARIS 2167,27.7208,,C 363,0,3,"Barbara, Mrs. (Catherine David)",female,45,0,1,2691,14.4542,,C @@ -367,58 +367,58 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 366,0,3,"Adahl, Mr. Mauritz Nils Martin",male,30,0,0,C 7076,7.25,,S 367,1,1,"Warren, Mrs. Frank Manley (Anna Sophia Atkinson)",female,60,1,0,110813,75.25,D37,C 368,1,3,"Moussa, Mrs. (Mantoura Boulos)",female,,0,0,2626,7.2292,,C -369,1,3,"Jermyn, Miss. Annie",female,,0,0,14313,7.75,,Q +369,1,3,"Jermyn, Miss Annie",female,,0,0,14313,7.75,,Q 370,1,1,"Aubart, Mme. 
Leontine Pauline",female,24,0,0,PC 17477,69.3,B35,C 371,1,1,"Harder, Mr. George Achilles",male,25,1,0,11765,55.4417,E50,C 372,0,3,"Wiklund, Mr. Jakob Alfred",male,18,1,0,3101267,6.4958,,S 373,0,3,"Beavan, Mr. William Thomas",male,19,0,0,323951,8.05,,S 374,0,1,"Ringhini, Mr. Sante",male,22,0,0,PC 17760,135.6333,,C -375,0,3,"Palsson, Miss. Stina Viola",female,3,3,1,349909,21.075,,S +375,0,3,"Palsson, Miss Stina Viola",female,3,3,1,349909,21.075,,S 376,1,1,"Meyer, Mrs. Edgar Joseph (Leila Saks)",female,,1,0,PC 17604,82.1708,,C -377,1,3,"Landergren, Miss. Aurora Adelia",female,22,0,0,C 7077,7.25,,S +377,1,3,"Landergren, Miss Aurora Adelia",female,22,0,0,C 7077,7.25,,S 378,0,1,"Widener, Mr. Harry Elkins",male,27,0,2,113503,211.5,C82,C 379,0,3,"Betros, Mr. Tannous",male,20,0,0,2648,4.0125,,C 380,0,3,"Gustafsson, Mr. Karl Gideon",male,19,0,0,347069,7.775,,S -381,1,1,"Bidois, Miss. Rosalie",female,42,0,0,PC 17757,227.525,,C -382,1,3,"Nakid, Miss. Maria (""Mary"")",female,1,0,2,2653,15.7417,,C +381,1,1,"Bidois, Miss Rosalie",female,42,0,0,PC 17757,227.525,,C +382,1,3,"Nakid, Miss Maria (""Mary"")",female,1,0,2,2653,15.7417,,C 383,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,,S 384,1,1,"Holverson, Mrs. Alexander Oskar (Mary Aline Towner)",female,35,1,0,113789,52,,S 385,0,3,"Plotcharsky, Mr. Vasil",male,,0,0,349227,7.8958,,S 386,0,2,"Davies, Mr. Charles Henry",male,18,0,0,S.O.C. 14879,73.5,,S -387,0,3,"Goodwin, Master. Sidney Leonard",male,1,5,2,CA 2144,46.9,,S -388,1,2,"Buss, Miss. Kate",female,36,0,0,27849,13,,S +387,0,3,"Goodwin, Master Sidney Leonard",male,1,5,2,CA 2144,46.9,,S +388,1,2,"Buss, Miss Kate",female,36,0,0,27849,13,,S 389,0,3,"Sadlier, Mr. Matthew",male,,0,0,367655,7.7292,,Q -390,1,2,"Lehmann, Miss. Bertha",female,17,0,0,SC 1748,12,,C +390,1,2,"Lehmann, Miss Bertha",female,17,0,0,SC 1748,12,,C 391,1,1,"Carter, Mr. William Ernest",male,36,1,2,113760,120,B96 B98,S 392,1,3,"Jansson, Mr. Carl Olof",male,21,0,0,350034,7.7958,,S 393,0,3,"Gustafsson, Mr. Johan Birger",male,28,2,0,3101277,7.925,,S -394,1,1,"Newell, Miss. Marjorie",female,23,1,0,35273,113.275,D36,C +394,1,1,"Newell, Miss Marjorie",female,23,1,0,35273,113.275,D36,C 395,1,3,"Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengtsson)",female,24,0,2,PP 9549,16.7,G6,S 396,0,3,"Johansson, Mr. Erik",male,22,0,0,350052,7.7958,,S -397,0,3,"Olsson, Miss. Elina",female,31,0,0,350407,7.8542,,S +397,0,3,"Olsson, Miss Elina",female,31,0,0,350407,7.8542,,S 398,0,2,"McKane, Mr. Peter David",male,46,0,0,28403,26,,S 399,0,2,"Pain, Dr. Alfred",male,23,0,0,244278,10.5,,S 400,1,2,"Trout, Mrs. William H (Jessie L)",female,28,0,0,240929,12.65,,S 401,1,3,"Niskanen, Mr. Juha",male,39,0,0,STON/O 2. 3101289,7.925,,S 402,0,3,"Adams, Mr. John",male,26,0,0,341826,8.05,,S -403,0,3,"Jussila, Miss. Mari Aina",female,21,1,0,4137,9.825,,S +403,0,3,"Jussila, Miss Mari Aina",female,21,1,0,4137,9.825,,S 404,0,3,"Hakkarainen, Mr. Pekka Pietari",male,28,1,0,STON/O2. 3101279,15.85,,S -405,0,3,"Oreskovic, Miss. Marija",female,20,0,0,315096,8.6625,,S +405,0,3,"Oreskovic, Miss Marija",female,20,0,0,315096,8.6625,,S 406,0,2,"Gale, Mr. Shadrach",male,34,1,0,28664,21,,S 407,0,3,"Widegren, Mr. Carl/Charles Peter",male,51,0,0,347064,7.75,,S -408,1,2,"Richards, Master. William Rowe",male,3,1,1,29106,18.75,,S +408,1,2,"Richards, Master William Rowe",male,3,1,1,29106,18.75,,S 409,0,3,"Birkeland, Mr. Hans Martin Monsen",male,21,0,0,312992,7.775,,S -410,0,3,"Lefebre, Miss. 
Ida",female,,3,1,4133,25.4667,,S +410,0,3,"Lefebre, Miss Ida",female,,3,1,4133,25.4667,,S 411,0,3,"Sdycoff, Mr. Todor",male,,0,0,349222,7.8958,,S 412,0,3,"Hart, Mr. Henry",male,,0,0,394140,6.8583,,Q -413,1,1,"Minahan, Miss. Daisy E",female,33,1,0,19928,90,C78,Q +413,1,1,"Minahan, Miss Daisy E",female,33,1,0,19928,90,C78,Q 414,0,2,"Cunningham, Mr. Alfred Fleming",male,,0,0,239853,0,,S 415,1,3,"Sundman, Mr. Johan Julian",male,44,0,0,STON/O 2. 3101269,7.925,,S 416,0,3,"Meek, Mrs. Thomas (Annie Louise Rowley)",female,,0,0,343095,8.05,,S 417,1,2,"Drew, Mrs. James Vivian (Lulu Thorne Christian)",female,34,1,1,28220,32.5,,S -418,1,2,"Silven, Miss. Lyyli Karoliina",female,18,0,2,250652,13,,S +418,1,2,"Silven, Miss Lyyli Karoliina",female,18,0,2,250652,13,,S 419,0,2,"Matthews, Mr. William John",male,30,0,0,28228,13,,S -420,0,3,"Van Impe, Miss. Catharina",female,10,0,2,345773,24.15,,S +420,0,3,"Van Impe, Miss Catharina",female,10,0,2,345773,24.15,,S 421,0,3,"Gheorgheff, Mr. Stanio",male,,0,0,349254,7.8958,,C 422,0,3,"Charters, Mr. David",male,21,0,0,A/5. 13032,7.7333,,Q 423,0,3,"Zimmerman, Mr. Leo",male,29,0,0,315082,7.875,,S @@ -426,7 +426,7 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 425,0,3,"Rosblom, Mr. Viktor Richard",male,18,1,1,370129,20.2125,,S 426,0,3,"Wiseman, Mr. Phillippe",male,,0,0,A/4. 34244,7.25,,S 427,1,2,"Clarke, Mrs. Charles V (Ada Maria Winfield)",female,28,1,0,2003,26,,S -428,1,2,"Phillips, Miss. Kate Florence (""Mrs Kate Louise Phillips Marshall"")",female,19,0,0,250655,26,,S +428,1,2,"Phillips, Miss Kate Florence (""Mrs Kate Louise Phillips Marshall"")",female,19,0,0,250655,26,,S 429,0,3,"Flynn, Mr. James",male,,0,0,364851,7.75,,Q 430,1,3,"Pickard, Mr. Berk (Berk Trembisky)",male,32,0,0,SOTON/O.Q. 392078,8.05,E10,S 431,1,1,"Bjornstrom-Steffansson, Mr. Mauritz Hakan",male,28,0,0,110564,26.55,C52,S @@ -434,8 +434,8 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 433,1,2,"Louch, Mrs. Charles Alexander (Alice Adelaide Slow)",female,42,1,0,SC/AH 3085,26,,S 434,0,3,"Kallio, Mr. Nikolai Erland",male,17,0,0,STON/O 2. 3101274,7.125,,S 435,0,1,"Silvey, Mr. William Baird",male,50,1,0,13507,55.9,E44,S -436,1,1,"Carter, Miss. Lucile Polk",female,14,1,2,113760,120,B96 B98,S -437,0,3,"Ford, Miss. Doolina Margaret ""Daisy""",female,21,2,2,W./C. 6608,34.375,,S +436,1,1,"Carter, Miss Lucile Polk",female,14,1,2,113760,120,B96 B98,S +437,0,3,"Ford, Miss Doolina Margaret ""Daisy""",female,21,2,2,W./C. 6608,34.375,,S 438,1,2,"Richards, Mrs. Sidney (Emily Hocking)",female,24,2,3,29106,18.75,,S 439,0,1,"Fortune, Mr. Mark",male,64,1,4,19950,263,C23 C25 C27,S 440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31,0,0,C.A. 18723,10.5,,S @@ -444,10 +444,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 443,0,3,"Petterson, Mr. Johan Emil",male,25,1,0,347076,7.775,,S 444,1,2,"Reynaldo, Ms. Encarnacion",female,28,0,0,230434,13,,S 445,1,3,"Johannesen-Bratthammer, Mr. Bernt",male,,0,0,65306,8.1125,,S -446,1,1,"Dodge, Master. Washington",male,4,0,2,33638,81.8583,A34,S -447,1,2,"Mellinger, Miss. Madeleine Violet",female,13,0,1,250644,19.5,,S +446,1,1,"Dodge, Master Washington",male,4,0,2,33638,81.8583,A34,S +447,1,2,"Mellinger, Miss Madeleine Violet",female,13,0,1,250644,19.5,,S 448,1,1,"Seward, Mr. Frederic Kimber",male,34,0,0,113794,26.55,,S -449,1,3,"Baclini, Miss. 
Marie Catherine",female,5,2,1,2666,19.2583,,C +449,1,3,"Baclini, Miss Marie Catherine",female,5,2,1,2666,19.2583,,C 450,1,1,"Peuchen, Major. Arthur Godfrey",male,52,0,0,113786,30.5,C104,S 451,0,2,"West, Mr. Edwy Arthur",male,36,1,2,C.A. 34651,27.75,,S 452,0,3,"Hagland, Mr. Ingvald Olai Olsen",male,,1,0,65303,19.9667,,S @@ -457,7 +457,7 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 456,1,3,"Jalsevac, Mr. Ivan",male,29,0,0,349240,7.8958,,C 457,0,1,"Millet, Mr. Francis Davis",male,65,0,0,13509,26.55,E38,S 458,1,1,"Kenyon, Mrs. Frederick R (Marion)",female,,1,0,17464,51.8625,D21,S -459,1,2,"Toomey, Miss. Ellen",female,50,0,0,F.C.C. 13531,10.5,,S +459,1,2,"Toomey, Miss Ellen",female,50,0,0,F.C.C. 13531,10.5,,S 460,0,3,"O'Connor, Mr. Maurice",male,,0,0,371060,7.75,,Q 461,1,1,"Anderson, Mr. Harry",male,48,0,0,19952,26.55,E12,S 462,0,3,"Morley, Mr. William",male,34,0,0,364506,8.05,,S @@ -468,42 +468,42 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 467,0,2,"Campbell, Mr. William",male,,0,0,239853,0,,S 468,0,1,"Smart, Mr. John Montgomery",male,56,0,0,113792,26.55,,S 469,0,3,"Scanlan, Mr. James",male,,0,0,36209,7.725,,Q -470,1,3,"Baclini, Miss. Helene Barbara",female,0.75,2,1,2666,19.2583,,C +470,1,3,"Baclini, Miss Helene Barbara",female,0.75,2,1,2666,19.2583,,C 471,0,3,"Keefe, Mr. Arthur",male,,0,0,323592,7.25,,S 472,0,3,"Cacic, Mr. Luka",male,38,0,0,315089,8.6625,,S 473,1,2,"West, Mrs. Edwy Arthur (Ada Mary Worth)",female,33,1,2,C.A. 34651,27.75,,S 474,1,2,"Jerwan, Mrs. Amin S (Marie Marthe Thuillard)",female,23,0,0,SC/AH Basle 541,13.7917,D,C -475,0,3,"Strandberg, Miss. Ida Sofia",female,22,0,0,7553,9.8375,,S +475,0,3,"Strandberg, Miss Ida Sofia",female,22,0,0,7553,9.8375,,S 476,0,1,"Clifford, Mr. George Quincy",male,,0,0,110465,52,A14,S 477,0,2,"Renouf, Mr. Peter Henry",male,34,1,0,31027,21,,S 478,0,3,"Braund, Mr. Lewis Richard",male,29,1,0,3460,7.0458,,S 479,0,3,"Karlsson, Mr. Nils August",male,22,0,0,350060,7.5208,,S -480,1,3,"Hirvonen, Miss. Hildur E",female,2,0,1,3101298,12.2875,,S -481,0,3,"Goodwin, Master. Harold Victor",male,9,5,2,CA 2144,46.9,,S +480,1,3,"Hirvonen, Miss Hildur E",female,2,0,1,3101298,12.2875,,S +481,0,3,"Goodwin, Master Harold Victor",male,9,5,2,CA 2144,46.9,,S 482,0,2,"Frost, Mr. Anthony Wood ""Archie""",male,,0,0,239854,0,,S 483,0,3,"Rouse, Mr. Richard Henry",male,50,0,0,A/5 3594,8.05,,S 484,1,3,"Turkula, Mrs. (Hedwig)",female,63,0,0,4134,9.5875,,S 485,1,1,"Bishop, Mr. Dickinson H",male,25,1,0,11967,91.0792,B49,C -486,0,3,"Lefebre, Miss. Jeannie",female,,3,1,4133,25.4667,,S +486,0,3,"Lefebre, Miss Jeannie",female,,3,1,4133,25.4667,,S 487,1,1,"Hoyt, Mrs. Frederick Maxfield (Jane Anne Forby)",female,35,1,0,19943,90,C93,S 488,0,1,"Kent, Mr. Edward Austin",male,58,0,0,11771,29.7,B37,C 489,0,3,"Somerton, Mr. Francis William",male,30,0,0,A.5. 18509,8.05,,S -490,1,3,"Coutts, Master. Eden Leslie ""Neville""",male,9,1,1,C.A. 37671,15.9,,S +490,1,3,"Coutts, Master Eden Leslie ""Neville""",male,9,1,1,C.A. 37671,15.9,,S 491,0,3,"Hagland, Mr. Konrad Mathias Reiersen",male,,1,0,65304,19.9667,,S 492,0,3,"Windelov, Mr. Einar",male,21,0,0,SOTON/OQ 3101317,7.25,,S 493,0,1,"Molson, Mr. Harry Markland",male,55,0,0,113787,30.5,C30,S 494,0,1,"Artagaveytia, Mr. Ramon",male,71,0,0,PC 17609,49.5042,,C 495,0,3,"Stanley, Mr. Edward Roland",male,21,0,0,A/4 45380,8.05,,S 496,0,3,"Yousseff, Mr. Gerious",male,,0,0,2627,14.4583,,C -497,1,1,"Eustis, Miss. 
Elizabeth Mussey",female,54,1,0,36947,78.2667,D20,C +497,1,1,"Eustis, Miss Elizabeth Mussey",female,54,1,0,36947,78.2667,D20,C 498,0,3,"Shellard, Mr. Frederick William",male,,0,0,C.A. 6212,15.1,,S 499,0,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25,1,2,113781,151.55,C22 C26,S 500,0,3,"Svensson, Mr. Olof",male,24,0,0,350035,7.7958,,S 501,0,3,"Calic, Mr. Petar",male,17,0,0,315086,8.6625,,S -502,0,3,"Canavan, Miss. Mary",female,21,0,0,364846,7.75,,Q -503,0,3,"O'Sullivan, Miss. Bridget Mary",female,,0,0,330909,7.6292,,Q -504,0,3,"Laitinen, Miss. Kristina Sofia",female,37,0,0,4135,9.5875,,S -505,1,1,"Maioni, Miss. Roberta",female,16,0,0,110152,86.5,B79,S +502,0,3,"Canavan, Miss Mary",female,21,0,0,364846,7.75,,Q +503,0,3,"O'Sullivan, Miss Bridget Mary",female,,0,0,330909,7.6292,,Q +504,0,3,"Laitinen, Miss Kristina Sofia",female,37,0,0,4135,9.5875,,S +505,1,1,"Maioni, Miss Roberta",female,16,0,0,110152,86.5,B79,S 506,0,1,"Penasco y Castellana, Mr. Victor de Satode",male,18,1,0,PC 17758,108.9,C65,C 507,1,2,"Quick, Mrs. Frederick Charles (Jane Richards)",female,33,0,2,26360,26,,S 508,1,1,"Bradley, Mr. George (""George Arthur Brayton"")",male,,0,0,111427,26.55,,S @@ -519,41 +519,41 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 518,0,3,"Ryan, Mr. Patrick",male,,0,0,371110,24.15,,Q 519,1,2,"Angle, Mrs. William A (Florence ""Mary"" Agnes Hughes)",female,36,1,0,226875,26,,S 520,0,3,"Pavlovic, Mr. Stefo",male,32,0,0,349242,7.8958,,S -521,1,1,"Perreault, Miss. Anne",female,30,0,0,12749,93.5,B73,S +521,1,1,"Perreault, Miss Anne",female,30,0,0,12749,93.5,B73,S 522,0,3,"Vovk, Mr. Janko",male,22,0,0,349252,7.8958,,S 523,0,3,"Lahoud, Mr. Sarkis",male,,0,0,2624,7.225,,C 524,1,1,"Hippach, Mrs. Louis Albert (Ida Sophia Fischer)",female,44,0,1,111361,57.9792,B18,C 525,0,3,"Kassem, Mr. Fared",male,,0,0,2700,7.2292,,C 526,0,3,"Farrell, Mr. James",male,40.5,0,0,367232,7.75,,Q -527,1,2,"Ridsdale, Miss. Lucy",female,50,0,0,W./C. 14258,10.5,,S +527,1,2,"Ridsdale, Miss Lucy",female,50,0,0,W./C. 14258,10.5,,S 528,0,1,"Farthing, Mr. John",male,,0,0,PC 17483,221.7792,C95,S 529,0,3,"Salonen, Mr. Johan Werner",male,39,0,0,3101296,7.925,,S 530,0,2,"Hocking, Mr. Richard George",male,23,2,1,29104,11.5,,S -531,1,2,"Quick, Miss. Phyllis May",female,2,1,1,26360,26,,S +531,1,2,"Quick, Miss Phyllis May",female,2,1,1,26360,26,,S 532,0,3,"Toufik, Mr. Nakli",male,,0,0,2641,7.2292,,C 533,0,3,"Elias, Mr. Joseph Jr",male,17,1,1,2690,7.2292,,C 534,1,3,"Peter, Mrs. Catherine (Catherine Rizk)",female,,0,2,2668,22.3583,,C -535,0,3,"Cacic, Miss. Marija",female,30,0,0,315084,8.6625,,S -536,1,2,"Hart, Miss. Eva Miriam",female,7,0,2,F.C.C. 13529,26.25,,S +535,0,3,"Cacic, Miss Marija",female,30,0,0,315084,8.6625,,S +536,1,2,"Hart, Miss Eva Miriam",female,7,0,2,F.C.C. 13529,26.25,,S 537,0,1,"Butt, Major. Archibald Willingham",male,45,0,0,113050,26.55,B38,S -538,1,1,"LeRoy, Miss. Bertha",female,30,0,0,PC 17761,106.425,,C +538,1,1,"LeRoy, Miss Bertha",female,30,0,0,PC 17761,106.425,,C 539,0,3,"Risien, Mr. Samuel Beard",male,,0,0,364498,14.5,,S -540,1,1,"Frolicher, Miss. Hedwig Margaritha",female,22,0,2,13568,49.5,B39,C -541,1,1,"Crosby, Miss. Harriet R",female,36,0,2,WE/P 5735,71,B22,S -542,0,3,"Andersson, Miss. Ingeborg Constanzia",female,9,4,2,347082,31.275,,S -543,0,3,"Andersson, Miss. 
Sigrid Elisabeth",female,11,4,2,347082,31.275,,S +540,1,1,"Frolicher, Miss Hedwig Margaritha",female,22,0,2,13568,49.5,B39,C +541,1,1,"Crosby, Miss Harriet R",female,36,0,2,WE/P 5735,71,B22,S +542,0,3,"Andersson, Miss Ingeborg Constanzia",female,9,4,2,347082,31.275,,S +543,0,3,"Andersson, Miss Sigrid Elisabeth",female,11,4,2,347082,31.275,,S 544,1,2,"Beane, Mr. Edward",male,32,1,0,2908,26,,S 545,0,1,"Douglas, Mr. Walter Donald",male,50,1,0,PC 17761,106.425,C86,C 546,0,1,"Nicholson, Mr. Arthur Ernest",male,64,0,0,693,26,,S 547,1,2,"Beane, Mrs. Edward (Ethel Clarke)",female,19,1,0,2908,26,,S 548,1,2,"Padro y Manent, Mr. Julian",male,,0,0,SC/PARIS 2146,13.8625,,C 549,0,3,"Goldsmith, Mr. Frank John",male,33,1,1,363291,20.525,,S -550,1,2,"Davies, Master. John Morgan Jr",male,8,1,1,C.A. 33112,36.75,,S +550,1,2,"Davies, Master John Morgan Jr",male,8,1,1,C.A. 33112,36.75,,S 551,1,1,"Thayer, Mr. John Borland Jr",male,17,0,2,17421,110.8833,C70,C 552,0,2,"Sharp, Mr. Percival James R",male,27,0,0,244358,26,,S 553,0,3,"O'Brien, Mr. Timothy",male,,0,0,330979,7.8292,,Q 554,1,3,"Leeni, Mr. Fahim (""Philip Zenni"")",male,22,0,0,2620,7.225,,C -555,1,3,"Ohman, Miss. Velin",female,22,0,0,347085,7.775,,S +555,1,3,"Ohman, Miss Velin",female,22,0,0,347085,7.775,,S 556,0,1,"Wright, Mr. George",male,62,0,0,113807,26.55,,S 557,1,1,"Duff Gordon, Lady. (Lucille Christiana Sutherland) (""Mrs Morgan"")",female,48,1,0,11755,39.6,A16,C 558,0,1,"Robbins, Mr. Victor",male,,0,0,PC 17757,227.525,,C @@ -563,7 +563,7 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 562,0,3,"Sivic, Mr. Husein",male,40,0,0,349251,7.8958,,S 563,0,2,"Norman, Mr. Robert Douglas",male,28,0,0,218629,13.5,,S 564,0,3,"Simmons, Mr. John",male,,0,0,SOTON/OQ 392082,8.05,,S -565,0,3,"Meanwell, Miss. (Marion Ogden)",female,,0,0,SOTON/O.Q. 392087,8.05,,S +565,0,3,"Meanwell, Miss (Marion Ogden)",female,,0,0,SOTON/O.Q. 392087,8.05,,S 566,0,3,"Davies, Mr. Alfred J",male,24,2,0,A/4 48871,24.15,,S 567,0,3,"Stoytcheff, Mr. Ilia",male,19,0,0,349205,7.8958,,S 568,0,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29,0,4,349909,21.075,,S @@ -572,19 +572,19 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 571,1,2,"Harris, Mr. George",male,62,0,0,S.W./PP 752,10.5,,S 572,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53,2,0,11769,51.4792,C101,S 573,1,1,"Flynn, Mr. John Irwin (""Irving"")",male,36,0,0,PC 17474,26.3875,E25,S -574,1,3,"Kelly, Miss. Mary",female,,0,0,14312,7.75,,Q +574,1,3,"Kelly, Miss Mary",female,,0,0,14312,7.75,,Q 575,0,3,"Rush, Mr. Alfred George John",male,16,0,0,A/4. 20589,8.05,,S 576,0,3,"Patchett, Mr. George",male,19,0,0,358585,14.5,,S -577,1,2,"Garside, Miss. Ethel",female,34,0,0,243880,13,,S +577,1,2,"Garside, Miss Ethel",female,34,0,0,243880,13,,S 578,1,1,"Silvey, Mrs. William Baird (Alice Munger)",female,39,1,0,13507,55.9,E44,S 579,0,3,"Caram, Mrs. Joseph (Maria Elias)",female,,1,0,2689,14.4583,,C 580,1,3,"Jussila, Mr. Eiriik",male,32,0,0,STON/O 2. 3101286,7.925,,S -581,1,2,"Christy, Miss. Julie Rachel",female,25,1,1,237789,30,,S +581,1,2,"Christy, Miss Julie Rachel",female,25,1,1,237789,30,,S 582,1,1,"Thayer, Mrs. John Borland (Marian Longstreth Morris)",female,39,1,1,17421,110.8833,C68,C 583,0,2,"Downton, Mr. William James",male,54,0,0,28403,26,,S 584,0,1,"Ross, Mr. John Hugo",male,36,0,0,13049,40.125,A10,C 585,0,3,"Paulner, Mr. Uscher",male,,0,0,3411,8.7125,,C -586,1,1,"Taussig, Miss. 
Ruth",female,18,0,2,110413,79.65,E68,S +586,1,1,"Taussig, Miss Ruth",female,18,0,2,110413,79.65,E68,S 587,0,2,"Jarvis, Mr. John Denzil",male,47,0,0,237565,15,,S 588,1,1,"Frolicher-Stehli, Mr. Maxmillian",male,60,1,1,13567,79.2,B41,C 589,0,3,"Gilinski, Mr. Eliezer",male,22,0,0,14973,8.05,,S @@ -592,10 +592,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 591,0,3,"Rintamaki, Mr. Matti",male,35,0,0,STON/O 2. 3101273,7.125,,S 592,1,1,"Stephenson, Mrs. Walter Bertram (Martha Eustis)",female,52,1,0,36947,78.2667,D20,C 593,0,3,"Elsbury, Mr. William James",male,47,0,0,A/5 3902,7.25,,S -594,0,3,"Bourke, Miss. Mary",female,,0,2,364848,7.75,,Q +594,0,3,"Bourke, Miss Mary",female,,0,2,364848,7.75,,Q 595,0,2,"Chapman, Mr. John Henry",male,37,1,0,SC/AH 29037,26,,S 596,0,3,"Van Impe, Mr. Jean Baptiste",male,36,1,1,345773,24.15,,S -597,1,2,"Leitch, Miss. Jessie Wills",female,,0,0,248727,33,,S +597,1,2,"Leitch, Miss Jessie Wills",female,,0,0,248727,33,,S 598,0,3,"Johnson, Mr. Alfred",male,49,0,0,LINE,0,,S 599,0,3,"Boulos, Mr. Hanna",male,,0,0,2664,7.225,,C 600,1,1,"Duff Gordon, Sir. Cosmo Edmund (""Mr Morgan"")",male,49,1,0,PC 17485,56.9292,A20,C @@ -608,16 +608,16 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 607,0,3,"Karaic, Mr. Milan",male,30,0,0,349246,7.8958,,S 608,1,1,"Daniel, Mr. Robert Williams",male,27,0,0,113804,30.5,,S 609,1,2,"Laroche, Mrs. Joseph (Juliette Marie Louise Lafargue)",female,22,1,2,SC/Paris 2123,41.5792,,C -610,1,1,"Shutes, Miss. Elizabeth W",female,40,0,0,PC 17582,153.4625,C125,S +610,1,1,"Shutes, Miss Elizabeth W",female,40,0,0,PC 17582,153.4625,C125,S 611,0,3,"Andersson, Mrs. Anders Johan (Alfrida Konstantia Brogren)",female,39,1,5,347082,31.275,,S 612,0,3,"Jardin, Mr. Jose Neto",male,,0,0,SOTON/O.Q. 3101305,7.05,,S -613,1,3,"Murphy, Miss. Margaret Jane",female,,1,0,367230,15.5,,Q +613,1,3,"Murphy, Miss Margaret Jane",female,,1,0,367230,15.5,,Q 614,0,3,"Horgan, Mr. John",male,,0,0,370377,7.75,,Q 615,0,3,"Brocklebank, Mr. William Alfred",male,35,0,0,364512,8.05,,S -616,1,2,"Herman, Miss. Alice",female,24,1,2,220845,65,,S +616,1,2,"Herman, Miss Alice",female,24,1,2,220845,65,,S 617,0,3,"Danbom, Mr. Ernst Gilbert",male,34,1,1,347080,14.4,,S 618,0,3,"Lobb, Mrs. William Arthur (Cordelia K Stanlick)",female,26,1,0,A/5. 3336,16.1,,S -619,1,2,"Becker, Miss. Marion Louise",female,4,2,1,230136,39,F4,S +619,1,2,"Becker, Miss Marion Louise",female,4,2,1,230136,39,F4,S 620,0,2,"Gavey, Mr. Lawrence",male,26,0,0,31028,10.5,,S 621,0,3,"Yasbeck, Mr. Antoni",male,27,1,0,2659,14.4542,,C 622,1,1,"Kimball, Mr. Edwin Nelson Jr",male,42,1,0,11753,52.5542,D19,S @@ -626,34 +626,34 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 625,0,3,"Bowen, Mr. David John ""Dai""",male,21,0,0,54636,16.1,,S 626,0,1,"Sutton, Mr. Frederick",male,61,0,0,36963,32.3208,D50,S 627,0,2,"Kirkland, Rev. Charles Leonard",male,57,0,0,219533,12.35,,Q -628,1,1,"Longley, Miss. Gretchen Fiske",female,21,0,0,13502,77.9583,D9,S +628,1,1,"Longley, Miss Gretchen Fiske",female,21,0,0,13502,77.9583,D9,S 629,0,3,"Bostandyeff, Mr. Guentcho",male,26,0,0,349224,7.8958,,S 630,0,3,"O'Connell, Mr. Patrick D",male,,0,0,334912,7.7333,,Q 631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80,0,0,27042,30,A23,S 632,0,3,"Lundahl, Mr. Johan Svensson",male,51,0,0,347743,7.0542,,S 633,1,1,"Stahelin-Maeglin, Dr. Max",male,32,0,0,13214,30.5,B50,C 634,0,1,"Parr, Mr. William Henry Marsh",male,,0,0,112052,0,,S -635,0,3,"Skoog, Miss. 
Mabel",female,9,3,2,347088,27.9,,S -636,1,2,"Davis, Miss. Mary",female,28,0,0,237668,13,,S +635,0,3,"Skoog, Miss Mabel",female,9,3,2,347088,27.9,,S +636,1,2,"Davis, Miss Mary",female,28,0,0,237668,13,,S 637,0,3,"Leinonen, Mr. Antti Gustaf",male,32,0,0,STON/O 2. 3101292,7.925,,S 638,0,2,"Collyer, Mr. Harvey",male,31,1,1,C.A. 31921,26.25,,S 639,0,3,"Panula, Mrs. Juha (Maria Emilia Ojala)",female,41,0,5,3101295,39.6875,,S 640,0,3,"Thorneycroft, Mr. Percival",male,,1,0,376564,16.1,,S 641,0,3,"Jensen, Mr. Hans Peder",male,20,0,0,350050,7.8542,,S 642,1,1,"Sagesser, Mlle. Emma",female,24,0,0,PC 17477,69.3,B35,C -643,0,3,"Skoog, Miss. Margit Elizabeth",female,2,3,2,347088,27.9,,S +643,0,3,"Skoog, Miss Margit Elizabeth",female,2,3,2,347088,27.9,,S 644,1,3,"Foo, Mr. Choong",male,,0,0,1601,56.4958,,S -645,1,3,"Baclini, Miss. Eugenie",female,0.75,2,1,2666,19.2583,,C +645,1,3,"Baclini, Miss Eugenie",female,0.75,2,1,2666,19.2583,,C 646,1,1,"Harper, Mr. Henry Sleeper",male,48,1,0,PC 17572,76.7292,D33,C 647,0,3,"Cor, Mr. Liudevit",male,19,0,0,349231,7.8958,,S 648,1,1,"Simonius-Blumer, Col. Oberst Alfons",male,56,0,0,13213,35.5,A26,C 649,0,3,"Willey, Mr. Edward",male,,0,0,S.O./P.P. 751,7.55,,S -650,1,3,"Stanley, Miss. Amy Zillah Elsie",female,23,0,0,CA. 2314,7.55,,S +650,1,3,"Stanley, Miss Amy Zillah Elsie",female,23,0,0,CA. 2314,7.55,,S 651,0,3,"Mitkoff, Mr. Mito",male,,0,0,349221,7.8958,,S -652,1,2,"Doling, Miss. Elsie",female,18,0,1,231919,23,,S +652,1,2,"Doling, Miss Elsie",female,18,0,1,231919,23,,S 653,0,3,"Kalvik, Mr. Johannes Halvorsen",male,21,0,0,8475,8.4333,,S -654,1,3,"O'Leary, Miss. Hanora ""Norah""",female,,0,0,330919,7.8292,,Q -655,0,3,"Hegarty, Miss. Hanora ""Nora""",female,18,0,0,365226,6.75,,Q +654,1,3,"O'Leary, Miss Hanora ""Norah""",female,,0,0,330919,7.8292,,Q +655,0,3,"Hegarty, Miss Hanora ""Nora""",female,18,0,0,365226,6.75,,Q 656,0,2,"Hickman, Mr. Leonard Mark",male,24,2,0,S.O.C. 14879,73.5,,S 657,0,3,"Radeff, Mr. Alexander",male,,0,0,349223,7.8958,,S 658,0,3,"Bourke, Mrs. John (Catherine)",female,32,1,1,364849,15.5,,Q @@ -676,10 +676,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 675,0,2,"Watson, Mr. Ennis Hastings",male,,0,0,239856,0,,S 676,0,3,"Edvardsson, Mr. Gustaf Hjalmar",male,18,0,0,349912,7.775,,S 677,0,3,"Sawyer, Mr. Frederick Charles",male,24.5,0,0,342826,8.05,,S -678,1,3,"Turja, Miss. Anna Sofia",female,18,0,0,4138,9.8417,,S +678,1,3,"Turja, Miss Anna Sofia",female,18,0,0,4138,9.8417,,S 679,0,3,"Goodwin, Mrs. Frederick (Augusta Tyler)",female,43,1,6,CA 2144,46.9,,S 680,1,1,"Cardeza, Mr. Thomas Drake Martinez",male,36,0,1,PC 17755,512.3292,B51 B53 B55,C -681,0,3,"Peters, Miss. Katie",female,,0,0,330935,8.1375,,Q +681,0,3,"Peters, Miss Katie",female,,0,0,330935,8.1375,,Q 682,1,1,"Hassab, Mr. Hammad",male,27,0,0,PC 17572,76.7292,D49,C 683,0,3,"Olsvigen, Mr. Thor Anderson",male,20,0,0,6563,9.225,,S 684,0,3,"Goodwin, Mr. Charles Edward",male,14,5,2,CA 2144,46.9,,S @@ -688,48 +688,48 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 687,0,3,"Panula, Mr. Jaako Arnold",male,14,4,1,3101295,39.6875,,S 688,0,3,"Dakic, Mr. Branko",male,19,0,0,349228,10.1708,,S 689,0,3,"Fischer, Mr. Eberhard Thelander",male,18,0,0,350036,7.7958,,S -690,1,1,"Madill, Miss. Georgette Alexandra",female,15,0,1,24160,211.3375,B5,S +690,1,1,"Madill, Miss Georgette Alexandra",female,15,0,1,24160,211.3375,B5,S 691,1,1,"Dick, Mr. Albert Adrian",male,31,1,0,17474,57,B20,S -692,1,3,"Karun, Miss. 
Manca",female,4,0,1,349256,13.4167,,C +692,1,3,"Karun, Miss Manca",female,4,0,1,349256,13.4167,,C 693,1,3,"Lam, Mr. Ali",male,,0,0,1601,56.4958,,S 694,0,3,"Saad, Mr. Khalil",male,25,0,0,2672,7.225,,C 695,0,1,"Weir, Col. John",male,60,0,0,113800,26.55,,S 696,0,2,"Chapman, Mr. Charles Henry",male,52,0,0,248731,13.5,,S 697,0,3,"Kelly, Mr. James",male,44,0,0,363592,8.05,,S -698,1,3,"Mullens, Miss. Katherine ""Katie""",female,,0,0,35852,7.7333,,Q +698,1,3,"Mullens, Miss Katherine ""Katie""",female,,0,0,35852,7.7333,,Q 699,0,1,"Thayer, Mr. John Borland",male,49,1,1,17421,110.8833,C68,C 700,0,3,"Humblen, Mr. Adolf Mathias Nicolai Olsen",male,42,0,0,348121,7.65,F G63,S 701,1,1,"Astor, Mrs. John Jacob (Madeleine Talmadge Force)",female,18,1,0,PC 17757,227.525,C62 C64,C 702,1,1,"Silverthorne, Mr. Spencer Victor",male,35,0,0,PC 17475,26.2875,E24,S -703,0,3,"Barbara, Miss. Saiide",female,18,0,1,2691,14.4542,,C +703,0,3,"Barbara, Miss Saiide",female,18,0,1,2691,14.4542,,C 704,0,3,"Gallagher, Mr. Martin",male,25,0,0,36864,7.7417,,Q 705,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,,S 706,0,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39,0,0,250655,26,,S 707,1,2,"Kelly, Mrs. Florence ""Fannie""",female,45,0,0,223596,13.5,,S 708,1,1,"Calderhead, Mr. Edward Pennington",male,42,0,0,PC 17476,26.2875,E24,S -709,1,1,"Cleaver, Miss. Alice",female,22,0,0,113781,151.55,,S -710,1,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C +709,1,1,"Cleaver, Miss Alice",female,22,0,0,113781,151.55,,S +710,1,3,"Moubarek, Master Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C 711,1,1,"Mayne, Mlle. Berthe Antonine (""Mrs de Villiers"")",female,24,0,0,PC 17482,49.5042,C90,C 712,0,1,"Klaber, Mr. Herman",male,,0,0,113028,26.55,C124,S 713,1,1,"Taylor, Mr. Elmer Zebley",male,48,1,0,19996,52,C126,S 714,0,3,"Larsson, Mr. August Viktor",male,29,0,0,7545,9.4833,,S 715,0,2,"Greenberg, Mr. Samuel",male,52,0,0,250647,13,,S 716,0,3,"Soholt, Mr. Peter Andreas Lauritz Andersen",male,19,0,0,348124,7.65,F G73,S -717,1,1,"Endres, Miss. Caroline Louise",female,38,0,0,PC 17757,227.525,C45,C -718,1,2,"Troutt, Miss. Edwina Celia ""Winnie""",female,27,0,0,34218,10.5,E101,S +717,1,1,"Endres, Miss Caroline Louise",female,38,0,0,PC 17757,227.525,C45,C +718,1,2,"Troutt, Miss Edwina Celia ""Winnie""",female,27,0,0,34218,10.5,E101,S 719,0,3,"McEvoy, Mr. Michael",male,,0,0,36568,15.5,,Q 720,0,3,"Johnson, Mr. Malkolm Joackim",male,33,0,0,347062,7.775,,S -721,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6,0,1,248727,33,,S +721,1,2,"Harper, Miss Annie Jessie ""Nina""",female,6,0,1,248727,33,,S 722,0,3,"Jensen, Mr. Svend Lauritz",male,17,1,0,350048,7.0542,,S 723,0,2,"Gillespie, Mr. William Henry",male,34,0,0,12233,13,,S 724,0,2,"Hodges, Mr. Henry Price",male,50,0,0,250643,13,,S 725,1,1,"Chambers, Mr. Norman Campbell",male,27,1,0,113806,53.1,E8,S 726,0,3,"Oreskovic, Mr. Luka",male,20,0,0,315094,8.6625,,S 727,1,2,"Renouf, Mrs. Peter Henry (Lillian Jefferys)",female,30,3,0,31027,21,,S -728,1,3,"Mannion, Miss. Margareth",female,,0,0,36866,7.7375,,Q +728,1,3,"Mannion, Miss Margareth",female,,0,0,36866,7.7375,,Q 729,0,2,"Bryhl, Mr. Kurt Arnold Gottfrid",male,25,1,0,236853,26,,S -730,0,3,"Ilmakangas, Miss. Pieta Sofia",female,25,1,0,STON/O2. 3101271,7.925,,S -731,1,1,"Allen, Miss. Elisabeth Walton",female,29,0,0,24160,211.3375,B5,S +730,0,3,"Ilmakangas, Miss Pieta Sofia",female,25,1,0,STON/O2. 
3101271,7.925,,S +731,1,1,"Allen, Miss Elisabeth Walton",female,29,0,0,24160,211.3375,B5,S 732,0,3,"Hassan, Mr. Houssein G N",male,11,0,0,2699,18.7875,,C 733,0,2,"Knight, Mr. Robert J",male,,0,0,239855,0,,S 734,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13,,S @@ -741,20 +741,20 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 740,0,3,"Nankoff, Mr. Minko",male,,0,0,349218,7.8958,,S 741,1,1,"Hawksford, Mr. Walter James",male,,0,0,16988,30,D45,S 742,0,1,"Cavendish, Mr. Tyrell William",male,36,1,0,19877,78.85,C46,S -743,1,1,"Ryerson, Miss. Susan Parker ""Suzette""",female,21,2,2,PC 17608,262.375,B57 B59 B63 B66,C +743,1,1,"Ryerson, Miss Susan Parker ""Suzette""",female,21,2,2,PC 17608,262.375,B57 B59 B63 B66,C 744,0,3,"McNamee, Mr. Neal",male,24,1,0,376566,16.1,,S 745,1,3,"Stranden, Mr. Juho",male,31,0,0,STON/O 2. 3101288,7.925,,S 746,0,1,"Crosby, Capt. Edward Gifford",male,70,1,1,WE/P 5735,71,B22,S 747,0,3,"Abbott, Mr. Rossmore Edward",male,16,1,1,C.A. 2673,20.25,,S -748,1,2,"Sinkkonen, Miss. Anna",female,30,0,0,250648,13,,S +748,1,2,"Sinkkonen, Miss Anna",female,30,0,0,250648,13,,S 749,0,1,"Marvin, Mr. Daniel Warner",male,19,1,0,113773,53.1,D30,S 750,0,3,"Connaghton, Mr. Michael",male,31,0,0,335097,7.75,,Q -751,1,2,"Wells, Miss. Joan",female,4,1,1,29103,23,,S -752,1,3,"Moor, Master. Meier",male,6,0,1,392096,12.475,E121,S +751,1,2,"Wells, Miss Joan",female,4,1,1,29103,23,,S +752,1,3,"Moor, Master Meier",male,6,0,1,392096,12.475,E121,S 753,0,3,"Vande Velde, Mr. Johannes Joseph",male,33,0,0,345780,9.5,,S 754,0,3,"Jonkoff, Mr. Lalio",male,23,0,0,349204,7.8958,,S 755,1,2,"Herman, Mrs. Samuel (Jane Laver)",female,48,1,2,220845,65,,S -756,1,2,"Hamalainen, Master. Viljo",male,0.67,1,1,250649,14.5,,S +756,1,2,"Hamalainen, Master Viljo",male,0.67,1,1,250649,14.5,,S 757,0,3,"Carlsson, Mr. August Sigfrid",male,28,0,0,350042,7.7958,,S 758,0,2,"Bailey, Mr. Percy Andrew",male,18,0,0,29108,11.5,,S 759,0,3,"Theobald, Mr. Thomas Leonard",male,34,0,0,363294,8.05,,S @@ -766,7 +766,7 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 765,0,3,"Eklund, Mr. Hans Linus",male,16,0,0,347074,7.775,,S 766,1,1,"Hogeboom, Mrs. John C (Anna Andrews)",female,51,1,0,13502,77.9583,D11,S 767,0,1,"Brewe, Dr. Arthur Jackson",male,,0,0,112379,39.6,,C -768,0,3,"Mangan, Miss. Mary",female,30.5,0,0,364850,7.75,,Q +768,0,3,"Mangan, Miss Mary",female,30.5,0,0,364850,7.75,,Q 769,0,3,"Moran, Mr. Daniel J",male,,1,0,371110,24.15,,Q 770,0,3,"Gronnestad, Mr. Daniel Danielsen",male,32,0,0,8471,8.3625,,S 771,0,3,"Lievens, Mr. Rene Aime",male,24,0,0,345781,9.5,,S @@ -776,22 +776,22 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 775,1,2,"Hocking, Mrs. Elizabeth (Eliza Needs)",female,54,1,3,29105,23,,S 776,0,3,"Myhrman, Mr. Pehr Fabian Oliver Malkolm",male,18,0,0,347078,7.75,,S 777,0,3,"Tobin, Mr. Roger",male,,0,0,383121,7.75,F38,Q -778,1,3,"Emanuel, Miss. Virginia Ethel",female,5,0,0,364516,12.475,,S +778,1,3,"Emanuel, Miss Virginia Ethel",female,5,0,0,364516,12.475,,S 779,0,3,"Kilgannon, Mr. Thomas J",male,,0,0,36865,7.7375,,Q 780,1,1,"Robert, Mrs. Edward Scott (Elisabeth Walton McMillan)",female,43,0,1,24160,211.3375,B3,S -781,1,3,"Ayoub, Miss. Banoura",female,13,0,0,2687,7.2292,,C +781,1,3,"Ayoub, Miss Banoura",female,13,0,0,2687,7.2292,,C 782,1,1,"Dick, Mrs. Albert Adrian (Vera Gillespie)",female,17,1,0,17474,57,B20,S 783,0,1,"Long, Mr. Milton Clyde",male,29,0,0,113501,30,D6,S 784,0,3,"Johnston, Mr. Andrew G",male,,1,2,W./C. 
6607,23.45,,S 785,0,3,"Ali, Mr. William",male,25,0,0,SOTON/O.Q. 3101312,7.05,,S 786,0,3,"Harmer, Mr. Abraham (David Lishin)",male,25,0,0,374887,7.25,,S -787,1,3,"Sjoblom, Miss. Anna Sofia",female,18,0,0,3101265,7.4958,,S -788,0,3,"Rice, Master. George Hugh",male,8,4,1,382652,29.125,,Q -789,1,3,"Dean, Master. Bertram Vere",male,1,1,2,C.A. 2315,20.575,,S +787,1,3,"Sjoblom, Miss Anna Sofia",female,18,0,0,3101265,7.4958,,S +788,0,3,"Rice, Master George Hugh",male,8,4,1,382652,29.125,,Q +789,1,3,"Dean, Master Bertram Vere",male,1,1,2,C.A. 2315,20.575,,S 790,0,1,"Guggenheim, Mr. Benjamin",male,46,0,0,PC 17593,79.2,B82 B84,C 791,0,3,"Keane, Mr. Andrew ""Andy""",male,,0,0,12460,7.75,,Q 792,0,2,"Gaskell, Mr. Alfred",male,16,0,0,239865,26,,S -793,0,3,"Sage, Miss. Stella Anna",female,,8,2,CA. 2343,69.55,,S +793,0,3,"Sage, Miss Stella Anna",female,,8,2,CA. 2343,69.55,,S 794,0,1,"Hoyt, Mr. William Fisher",male,,0,0,PC 17600,30.6958,,C 795,0,3,"Dantcheff, Mr. Ristiu",male,25,0,0,349203,7.8958,,S 796,0,2,"Otter, Mr. Richard",male,39,0,0,28213,13,,S @@ -801,47 +801,47 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 800,0,3,"Van Impe, Mrs. Jean Baptiste (Rosalie Paula Govaert)",female,30,1,1,345773,24.15,,S 801,0,2,"Ponesell, Mr. Martin",male,34,0,0,250647,13,,S 802,1,2,"Collyer, Mrs. Harvey (Charlotte Annie Tate)",female,31,1,1,C.A. 31921,26.25,,S -803,1,1,"Carter, Master. William Thornton II",male,11,1,2,113760,120,B96 B98,S -804,1,3,"Thomas, Master. Assad Alexander",male,0.42,0,1,2625,8.5167,,C +803,1,1,"Carter, Master William Thornton II",male,11,1,2,113760,120,B96 B98,S +804,1,3,"Thomas, Master Assad Alexander",male,0.42,0,1,2625,8.5167,,C 805,1,3,"Hedman, Mr. Oskar Arvid",male,27,0,0,347089,6.975,,S 806,0,3,"Johansson, Mr. Karl Johan",male,31,0,0,347063,7.775,,S 807,0,1,"Andrews, Mr. Thomas Jr",male,39,0,0,112050,0,A36,S -808,0,3,"Pettersson, Miss. Ellen Natalia",female,18,0,0,347087,7.775,,S +808,0,3,"Pettersson, Miss Ellen Natalia",female,18,0,0,347087,7.775,,S 809,0,2,"Meyer, Mr. August",male,39,0,0,248723,13,,S 810,1,1,"Chambers, Mrs. Norman Campbell (Bertha Griggs)",female,33,1,0,113806,53.1,E8,S 811,0,3,"Alexander, Mr. William",male,26,0,0,3474,7.8875,,S 812,0,3,"Lester, Mr. James",male,39,0,0,A/4 48871,24.15,,S 813,0,2,"Slemen, Mr. Richard James",male,35,0,0,28206,10.5,,S -814,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,,S +814,0,3,"Andersson, Miss Ebba Iris Alfrida",female,6,4,2,347082,31.275,,S 815,0,3,"Tomlin, Mr. Ernest Portage",male,30.5,0,0,364499,8.05,,S 816,0,1,"Fry, Mr. Richard",male,,0,0,112058,0,B102,S -817,0,3,"Heininen, Miss. Wendla Maria",female,23,0,0,STON/O2. 3101290,7.925,,S +817,0,3,"Heininen, Miss Wendla Maria",female,23,0,0,STON/O2. 3101290,7.925,,S 818,0,2,"Mallet, Mr. Albert",male,31,1,1,S.C./PARIS 2079,37.0042,,C 819,0,3,"Holm, Mr. John Fredrik Alexander",male,43,0,0,C 7075,6.45,,S -820,0,3,"Skoog, Master. Karl Thorsten",male,10,3,2,347088,27.9,,S +820,0,3,"Skoog, Master Karl Thorsten",male,10,3,2,347088,27.9,,S 821,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gregg)",female,52,1,1,12749,93.5,B69,S 822,1,3,"Lulic, Mr. Nikola",male,27,0,0,315098,8.6625,,S 823,0,1,"Reuchlin, Jonkheer. John George",male,38,0,0,19972,0,,S 824,1,3,"Moor, Mrs. (Beila)",female,27,0,1,392096,12.475,E121,S -825,0,3,"Panula, Master. Urho Abraham",male,2,4,1,3101295,39.6875,,S +825,0,3,"Panula, Master Urho Abraham",male,2,4,1,3101295,39.6875,,S 826,0,3,"Flynn, Mr. John",male,,0,0,368323,6.95,,Q 827,0,3,"Lam, Mr. 
Len",male,,0,0,1601,56.4958,,S -828,1,2,"Mallet, Master. Andre",male,1,0,2,S.C./PARIS 2079,37.0042,,C +828,1,2,"Mallet, Master Andre",male,1,0,2,S.C./PARIS 2079,37.0042,,C 829,1,3,"McCormack, Mr. Thomas Joseph",male,,0,0,367228,7.75,,Q 830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62,0,0,113572,80,B28, 831,1,3,"Yasbeck, Mrs. Antoni (Selini Alexander)",female,15,1,0,2659,14.4542,,C -832,1,2,"Richards, Master. George Sibley",male,0.83,1,1,29106,18.75,,S +832,1,2,"Richards, Master George Sibley",male,0.83,1,1,29106,18.75,,S 833,0,3,"Saad, Mr. Amin",male,,0,0,2671,7.2292,,C 834,0,3,"Augustsson, Mr. Albert",male,23,0,0,347468,7.8542,,S 835,0,3,"Allum, Mr. Owen George",male,18,0,0,2223,8.3,,S -836,1,1,"Compton, Miss. Sara Rebecca",female,39,1,1,PC 17756,83.1583,E49,C +836,1,1,"Compton, Miss Sara Rebecca",female,39,1,1,PC 17756,83.1583,E49,C 837,0,3,"Pasic, Mr. Jakob",male,21,0,0,315097,8.6625,,S 838,0,3,"Sirota, Mr. Maurice",male,,0,0,392092,8.05,,S 839,1,3,"Chip, Mr. Chang",male,32,0,0,1601,56.4958,,S 840,1,1,"Marechal, Mr. Pierre",male,,0,0,11774,29.7,C47,C 841,0,3,"Alhomaki, Mr. Ilmari Rudolf",male,20,0,0,SOTON/O2 3101287,7.925,,S 842,0,2,"Mudd, Mr. Thomas Charles",male,16,0,0,S.O./P.P. 3,10.5,,S -843,1,1,"Serepeca, Miss. Augusta",female,30,0,0,113798,31,,C +843,1,1,"Serepeca, Miss Augusta",female,30,0,0,113798,31,,C 844,0,3,"Lemberopolous, Mr. Peter L",male,34.5,0,0,2683,6.4375,,C 845,0,3,"Culumovic, Mr. Jeso",male,17,0,0,315090,8.6625,,S 846,0,3,"Abbing, Mr. Anthony",male,42,0,0,C.A. 5547,7.55,,S @@ -849,10 +849,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 848,0,3,"Markoff, Mr. Marin",male,35,0,0,349213,7.8958,,C 849,0,2,"Harper, Rev. John",male,28,0,1,248727,33,,S 850,1,1,"Goldenberg, Mrs. Samuel L (Edwiga Grabowska)",female,,1,0,17453,89.1042,C92,C -851,0,3,"Andersson, Master. Sigvard Harald Elias",male,4,4,2,347082,31.275,,S +851,0,3,"Andersson, Master Sigvard Harald Elias",male,4,4,2,347082,31.275,,S 852,0,3,"Svensson, Mr. Johan",male,74,0,0,347060,7.775,,S -853,0,3,"Boulos, Miss. Nourelain",female,9,1,1,2678,15.2458,,C -854,1,1,"Lines, Miss. Mary Conover",female,16,0,1,PC 17592,39.4,D28,S +853,0,3,"Boulos, Miss Nourelain",female,9,1,1,2678,15.2458,,C +854,1,1,"Lines, Miss Mary Conover",female,16,0,1,PC 17592,39.4,D28,S 855,0,2,"Carter, Mrs. Ernest Courtenay (Lilian Hughes)",female,44,1,0,244252,26,,S 856,1,3,"Aks, Mrs. Sam (Leah Rosen)",female,18,0,1,392091,9.35,,S 857,1,1,"Wick, Mrs. George Dennick (Mary Hitchcock)",female,45,1,1,36928,164.8667,,S @@ -862,31 +862,31 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 861,0,3,"Hansen, Mr. Claus Peter",male,41,2,0,350026,14.1083,,S 862,0,2,"Giles, Mr. Frederick Edward",male,21,1,0,28134,11.5,,S 863,1,1,"Swift, Mrs. Frederick Joel (Margaret Welles Barron)",female,48,0,0,17466,25.9292,D17,S -864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.55,,S +864,0,3,"Sage, Miss Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.55,,S 865,0,2,"Gill, Mr. John William",male,24,0,0,233866,13,,S 866,1,2,"Bystrom, Mrs. (Karolina)",female,42,0,0,236852,13,,S -867,1,2,"Duran y More, Miss. Asuncion",female,27,1,0,SC/PARIS 2149,13.8583,,C +867,1,2,"Duran y More, Miss Asuncion",female,27,1,0,SC/PARIS 2149,13.8583,,C 868,0,1,"Roebling, Mr. Washington Augustus II",male,31,0,0,PC 17590,50.4958,A24,S 869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5,,S -870,1,3,"Johnson, Master. 
Harold Theodor",male,4,1,1,347742,11.1333,,S +870,1,3,"Johnson, Master Harold Theodor",male,4,1,1,347742,11.1333,,S 871,0,3,"Balkic, Mr. Cerin",male,26,0,0,349248,7.8958,,S 872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47,1,1,11751,52.5542,D35,S 873,0,1,"Carlsson, Mr. Frans Olof",male,33,0,0,695,5,B51 B53 B55,S 874,0,3,"Vander Cruyssen, Mr. Victor",male,47,0,0,345765,9,,S 875,1,2,"Abelson, Mrs. Samuel (Hannah Wizosky)",female,28,1,0,P/PP 3381,24,,C -876,1,3,"Najib, Miss. Adele Kiamie ""Jane""",female,15,0,0,2667,7.225,,C +876,1,3,"Najib, Miss Adele Kiamie ""Jane""",female,15,0,0,2667,7.225,,C 877,0,3,"Gustafsson, Mr. Alfred Ossian",male,20,0,0,7534,9.8458,,S 878,0,3,"Petroff, Mr. Nedelio",male,19,0,0,349212,7.8958,,S 879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S 880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56,0,1,11767,83.1583,C50,C 881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25,0,1,230433,26,,S 882,0,3,"Markun, Mr. Johann",male,33,0,0,349257,7.8958,,S -883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22,0,0,7552,10.5167,,S +883,0,3,"Dahlberg, Miss Gerda Ulrika",female,22,0,0,7552,10.5167,,S 884,0,2,"Banfield, Mr. Frederick James",male,28,0,0,C.A./SOTON 34068,10.5,,S 885,0,3,"Sutehall, Mr. Henry Jr",male,25,0,0,SOTON/OQ 392076,7.05,,S 886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39,0,5,382652,29.125,,Q 887,0,2,"Montvila, Rev. Juozas",male,27,0,0,211536,13,,S -888,1,1,"Graham, Miss. Margaret Edith",female,19,0,0,112053,30,B42,S -889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S +888,1,1,"Graham, Miss Margaret Edith",female,19,0,0,112053,30,B42,S +889,0,3,"Johnston, Miss Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S 890,1,1,"Behr, Mr. Karl Howell",male,26,0,0,111369,30,C148,C 891,0,3,"Dooley, Mr. Patrick",male,32,0,0,370376,7.75,,Q diff --git a/doc/source/_static/schemas/01_table_spreadsheet.png b/doc/source/_static/schemas/01_table_spreadsheet.png index b3cf5a0245b9c..4e3497879de31 100644 Binary files a/doc/source/_static/schemas/01_table_spreadsheet.png and b/doc/source/_static/schemas/01_table_spreadsheet.png differ diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst index f6ff95aa72c6c..fbcf017d608ce 100644 --- a/doc/source/development/maintaining.rst +++ b/doc/source/development/maintaining.rst @@ -84,7 +84,7 @@ Here's a typical workflow for triaging a newly opened issue. example. See https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports for a good explanation. If the example is not reproducible, or if it's *clearly* not minimal, feel free to ask the reporter if they can provide - and example or simplify the provided one. Do acknowledge that writing + an example or simplify the provided one. Do acknowledge that writing minimal reproducible examples is hard work. If the reporter is struggling, you can try to write one yourself and we'll edit the original post to include it. @@ -93,6 +93,9 @@ Here's a typical workflow for triaging a newly opened issue. If a reproducible example is provided, but you see a simplification, edit the original post with your simpler reproducible example. + If this is a regression report, post the result of a ``git bisect`` run. + More info on this can be found in the :ref:`maintaining.regressions` section. + Ensure the issue exists on the main branch and that it has the "Needs Triage" tag until all steps have been completed. 
Add a comment to the issue once you have verified it exists on the main branch, so others know it has been confirmed. @@ -125,7 +128,10 @@ Here's a typical workflow for triaging a newly opened issue. If the issue is clearly defined and the fix seems relatively straightforward, label the issue as "Good first issue". - Once you have completed the above, make sure to remove the "needs triage" label. + If the issue is a regression report, add the "Regression" label and the next patch + release milestone. + + Once you have completed the above, make sure to remove the "Needs Triage" label. .. _maintaining.regressions: @@ -462,9 +468,9 @@ Post-Release the appropriate ones for the version you are releasing): - Log in to the server and use the correct user. - - `cd /var/www/html/pandas-docs/` - - `ln -sfn version/2.1 stable` (for a major or minor release) - - `ln -sfn version/2.0.3 version/2.0` (for a patch release) + - ``cd /var/www/html/pandas-docs/`` + - ``ln -sfn version/2.1 stable`` (for a major or minor release) + - ``ln -sfn version/2.0.3 version/2.0`` (for a patch release) 2. If releasing a major or minor release, open a PR in our source code to update ``web/pandas/versions.json``, to have the desired versions in the documentation diff --git a/doc/source/development/policies.rst b/doc/source/development/policies.rst index 8465820452353..a3665c5bb4d1f 100644 --- a/doc/source/development/policies.rst +++ b/doc/source/development/policies.rst @@ -46,10 +46,12 @@ deprecation removed in the next major release (2.0.0). These policies do not apply to features marked as **experimental** in the documentation. pandas may change the behavior of experimental features at any time. +.. _policies.python_support: + Python support ~~~~~~~~~~~~~~ -pandas mirrors the `NumPy guidelines for Python support `__. +pandas mirrors the `SPEC 0 guideline for Python support `__. Security policy ~~~~~~~~~~~~~~~ diff --git a/doc/source/getting_started/index.rst b/doc/source/getting_started/index.rst index d9cb1de14aded..9f29f7f4f4406 100644 --- a/doc/source/getting_started/index.rst +++ b/doc/source/getting_started/index.rst @@ -134,8 +134,8 @@ to explore, clean, and process your data. In pandas, a data table is called a :c
-pandas supports the integration with many file formats or data sources out of the box (csv, excel, sql, json, parquet,…). Importing data from each of these -data sources is provided by function with the prefix ``read_*``. Similarly, the ``to_*`` methods are used to store data. +pandas supports the integration with many file formats or data sources out of the box (csv, excel, sql, json, parquet,…). The ability to import data from each of these +data sources is provided by functions with the prefix, ``read_*``. Similarly, the ``to_*`` methods are used to store data. .. image:: ../_static/schemas/02_io_readwrite.svg :align: center @@ -181,7 +181,7 @@ data sources is provided by function with the prefix ``read_*``. Similarly, the
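[Annotation] As a minimal sketch of the ``read_*`` / ``to_*`` pairing this card describes (frame contents and file name invented for illustration):

.. ipython:: python

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
    df.to_csv("example.csv", index=False)  # a ``to_*`` writer method
    pd.read_csv("example.csv")             # the matching ``read_*`` reader function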
-Selecting or filtering specific rows and/or columns? Filtering the data on a condition? Methods for slicing, selecting, and extracting the +Selecting or filtering specific rows and/or columns? Filtering the data on a particular condition? Methods for slicing, selecting, and extracting the data you need are available in pandas. .. image:: ../_static/schemas/03_subset_columns_rows.svg @@ -228,7 +228,7 @@ data you need are available in pandas.
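[Annotation] A short sketch of the row/column selection this card describes (data invented for illustration):

.. ipython:: python

    import pandas as pd

    df = pd.DataFrame({"age": [22, 35, 58], "sex": ["male", "male", "female"]})
    df[df["age"] > 30]                    # filter rows on a condition
    df.loc[df["sex"] == "female", "age"]  # rows by condition, a single column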
-pandas provides plotting your data out of the box, using the power of Matplotlib. You can pick the plot type (scatter, bar, boxplot,...) +pandas provides plotting for your data right out of the box with the power of Matplotlib. Simply pick the plot type (scatter, bar, boxplot,...) corresponding to your data. .. image:: ../_static/schemas/04_plot_overview.svg @@ -275,7 +275,7 @@ corresponding to your data.
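[Annotation] For instance, assuming Matplotlib is installed, picking a plot type is a single method call (values invented for illustration):

.. ipython:: python

    import pandas as pd

    s = pd.Series([20.0, 27.5, 19.3, 23.1])
    s.plot()      # line plot by default; returns a Matplotlib Axes
    s.plot.box()  # or pick another plot type, e.g. a boxplot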
-There is no need to loop over all rows of your data table to do calculations. Data manipulations on a column work elementwise. +There's no need to loop over all rows of your data table to do calculations. Column data manipulations work elementwise in pandas. Adding a column to a :class:`DataFrame` based on existing data in other columns is straightforward. .. image:: ../_static/schemas/05_newcolumn_2.svg @@ -322,7 +322,7 @@ Adding a column to a :class:`DataFrame` based on existing data in other columns
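[Annotation] A minimal sketch of such an elementwise column operation (column names and conversion factor invented for illustration):

.. ipython:: python

    import pandas as pd

    air_quality = pd.DataFrame({"station_london": [23.0, 19.0, 21.5]})
    # derived column computed elementwise; no explicit loop over rows
    air_quality["london_mg_per_cubic"] = air_quality["station_london"] * 1.882
    air_quality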
-Basic statistics (mean, median, min, max, counts...) are easily calculable. These or custom aggregations can be applied on the entire +Basic statistics (mean, median, min, max, counts...) are easily calculable across data frames. These, or even custom aggregations, can be applied on the entire data set, a sliding window of the data, or grouped by categories. The latter is also known as the split-apply-combine approach. .. image:: ../_static/schemas/06_groupby.svg @@ -369,8 +369,8 @@ data set, a sliding window of the data, or grouped by categories. The latter is
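[Annotation] A sketch of the split-apply-combine approach mentioned here (data invented for illustration):

.. ipython:: python

    import pandas as pd

    df = pd.DataFrame({"sex": ["male", "female", "female"], "age": [22, 38, 26]})
    df["age"].mean()                 # statistic over the whole column
    df.groupby("sex")["age"].mean()  # split by category, apply, combine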
-Change the structure of your data table in multiple ways. You can :func:`~pandas.melt` your data table from wide to long/tidy form or :func:`~pandas.pivot` -from long to wide format. With aggregations built-in, a pivot table is created with a single command. +Change the structure of your data table in a variety of ways. You can use :func:`~pandas.melt` to reshape your data from a wide format to a long and tidy one. Use :func:`~pandas.pivot` + to go from long to wide format. With aggregations built-in, a pivot table can be created with a single command. .. image:: ../_static/schemas/07_melt.svg :align: center @@ -416,7 +416,7 @@ from long to wide format. With aggregations built-in, a pivot table is created w
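[Annotation] A minimal round trip between the two layouts (column names invented for illustration):

.. ipython:: python

    import pandas as pd

    wide = pd.DataFrame(
        {"date": ["2020-01-01", "2020-01-02"],
         "station_a": [1.5, 2.0],
         "station_b": [3.1, 2.7]}
    )
    tidy = wide.melt(id_vars="date", var_name="station", value_name="value")  # wide -> long
    tidy.pivot(index="date", columns="station", values="value")               # long -> wide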
-Multiple tables can be concatenated both column wise and row wise as database-like join/merge operations are provided to combine multiple tables of data. +Multiple tables can be concatenated column wise or row wise with pandas' database-like join and merge operations. .. image:: ../_static/schemas/08_concat_row.svg :align: center @@ -505,7 +505,7 @@ pandas has great support for time series and has an extensive set of tools for w
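[Annotation] A sketch of both combination styles (table contents invented for illustration):

.. ipython:: python

    import pandas as pd

    a = pd.DataFrame({"location": ["FR04014"], "value": [20.0]})
    b = pd.DataFrame({"location": ["BETR801"], "value": [26.5]})
    stacked = pd.concat([a, b])  # row-wise concatenation
    coords = pd.DataFrame({"location": ["FR04014", "BETR801"], "lat": [48.83, 51.20]})
    stacked.merge(coords, how="left", on="location")  # database-style join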
-Data sets do not only contain numerical data. pandas provides a wide range of functions to clean textual data and extract useful information from it. +Data sets often contain more than just numerical data. pandas provides a wide range of functions to clean textual data and extract useful information from it. .. raw:: html @@ -551,9 +551,9 @@ the pandas-equivalent operations compared to software you already know: :class-card: comparison-card :shadow: md - The `R programming language `__ provides the - ``data.frame`` data structure and multiple packages, such as - `tidyverse `__ use and extend ``data.frame`` + The `R programming language `__ provides a + ``data.frame`` data structure as well as packages like + `tidyverse `__ which use and extend ``data.frame`` for convenient data handling functionalities similar to pandas. +++ @@ -572,8 +572,8 @@ the pandas-equivalent operations compared to software you already know: :class-card: comparison-card :shadow: md - Already familiar to ``SELECT``, ``GROUP BY``, ``JOIN``, etc.? - Most of these SQL manipulations do have equivalents in pandas. + Already familiar with ``SELECT``, ``GROUP BY``, ``JOIN``, etc.? + Many SQL manipulations have equivalents in pandas. +++ @@ -631,10 +631,10 @@ the pandas-equivalent operations compared to software you already know: :class-card: comparison-card :shadow: md - The `SAS `__ statistical software suite - also provides the ``data set`` corresponding to the pandas ``DataFrame``. - Also SAS vectorized operations, filtering, string processing operations, - and more have similar functions in pandas. + `SAS `__, the statistical software suite, + uses the ``data set`` structure, which closely corresponds pandas' ``DataFrame``. + Also SAS vectorized operations such as filtering or string processing operations + have similar functions in pandas. +++ diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index cf5f15ceb8344..01a79fc8e36fd 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -21,7 +21,7 @@ Instructions for installing :ref:`from source `, Python version support ---------------------- -Officially Python 3.9, 3.10, 3.11 and 3.12. +See :ref:`Python support policy `. Installing pandas ----------------- diff --git a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst index caaff3557ae40..efcdb22778ef4 100644 --- a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst +++ b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst @@ -46,7 +46,7 @@ I want to store passenger data of the Titanic. For a number of passengers, I kno "Name": [ "Braund, Mr. Owen Harris", "Allen, Mr. William Henry", - "Bonnell, Miss. Elizabeth", + "Bonnell, Miss Elizabeth", ], "Age": [22, 35, 58], "Sex": ["male", "male", "female"], @@ -192,8 +192,8 @@ Check more options on ``describe`` in the user guide section about :ref:`aggrega .. note:: This is just a starting point. Similar to spreadsheet software, pandas represents data as a table with columns and rows. Apart - from the representation, also the data manipulations and calculations - you would do in spreadsheet software are supported by pandas. Continue + from the representation, the data manipulations and calculations + you would do in spreadsheet software are also supported by pandas. Continue reading the next tutorials to get started! .. 
raw:: html @@ -204,7 +204,7 @@ Check more options on ``describe`` in the user guide section about :ref:`aggrega - Import the package, aka ``import pandas as pd`` - A table of data is stored as a pandas ``DataFrame`` - Each column in a ``DataFrame`` is a ``Series`` -- You can do things by applying a method to a ``DataFrame`` or ``Series`` +- You can do things by applying a method on a ``DataFrame`` or ``Series`` .. raw:: html @@ -215,7 +215,7 @@ Check more options on ``describe`` in the user guide section about :ref:`aggrega
To user guide -A more extended explanation to ``DataFrame`` and ``Series`` is provided in the :ref:`introduction to data structures `. +A more extended explanation of ``DataFrame`` and ``Series`` is provided in the :ref:`introduction to data structures ` page. .. raw:: html diff --git a/doc/source/getting_started/intro_tutorials/02_read_write.rst b/doc/source/getting_started/intro_tutorials/02_read_write.rst index aa032b186aeb9..0549c17a1013c 100644 --- a/doc/source/getting_started/intro_tutorials/02_read_write.rst +++ b/doc/source/getting_started/intro_tutorials/02_read_write.rst @@ -172,11 +172,11 @@ The method :meth:`~DataFrame.info` provides technical information about a - The table has 12 columns. Most columns have a value for each of the rows (all 891 values are ``non-null``). Some columns do have missing values and less than 891 ``non-null`` values. -- The columns ``Name``, ``Sex``, ``Cabin`` and ``Embarked`` consists of +- The columns ``Name``, ``Sex``, ``Cabin`` and ``Embarked`` consist of textual data (strings, aka ``object``). The other columns are - numerical data with some of them whole numbers (aka ``integer``) and - others are real numbers (aka ``float``). -- The kind of data (characters, integers,…) in the different columns + numerical data, some of them are whole numbers (``integer``) and + others are real numbers (``float``). +- The kind of data (characters, integers, …) in the different columns are summarized by listing the ``dtypes``. - The approximate amount of RAM used to hold the DataFrame is provided as well. @@ -194,7 +194,7 @@ The method :meth:`~DataFrame.info` provides technical information about a - Getting data in to pandas from many different file formats or data sources is supported by ``read_*`` functions. - Exporting data out of pandas is provided by different - ``to_*``\ methods. + ``to_*`` methods. - The ``head``/``tail``/``info`` methods and the ``dtypes`` attribute are convenient for a first check. diff --git a/doc/source/getting_started/intro_tutorials/03_subset_data.rst b/doc/source/getting_started/intro_tutorials/03_subset_data.rst index 88d7d653c9e83..ce7aa629a89fc 100644 --- a/doc/source/getting_started/intro_tutorials/03_subset_data.rst +++ b/doc/source/getting_started/intro_tutorials/03_subset_data.rst @@ -300,7 +300,7 @@ want to select. -When using the column names, row labels or a condition expression, use +When using column names, row labels or a condition expression, use the ``loc`` operator in front of the selection brackets ``[]``. For both the part before and after the comma, you can use a single label, a list of labels, a slice of labels, a conditional expression or a colon. Using @@ -342,7 +342,7 @@ the name ``anonymous`` to the first 3 elements of the fourth column:
To user guide -See the user guide section on :ref:`different choices for indexing ` to get more insight in the usage of ``loc`` and ``iloc``. +See the user guide section on :ref:`different choices for indexing ` to get more insight into the usage of ``loc`` and ``iloc``. .. raw:: html @@ -357,10 +357,8 @@ See the user guide section on :ref:`different choices for indexing ` in combination with the :meth:`~DataFrame.plot` method. Hence, the :meth:`~DataFrame.plot` method works on both ``Series`` and ``DataFrame``. @@ -127,7 +127,7 @@ standard Python to get an overview of the available plot methods: ] .. note:: - In many development environments as well as IPython and + In many development environments such as IPython and Jupyter Notebook, use the TAB button to get an overview of the available methods, for example ``air_quality.plot.`` + TAB. @@ -238,7 +238,7 @@ This strategy is applied in the previous example: - The ``.plot.*`` methods are applicable on both Series and DataFrames. - By default, each of the columns is plotted as a different element - (line, boxplot,…). + (line, boxplot, …). - Any plot created by pandas is a Matplotlib object. .. raw:: html diff --git a/doc/source/getting_started/intro_tutorials/05_add_columns.rst b/doc/source/getting_started/intro_tutorials/05_add_columns.rst index 3e0f75b210dbb..481c094870e12 100644 --- a/doc/source/getting_started/intro_tutorials/05_add_columns.rst +++ b/doc/source/getting_started/intro_tutorials/05_add_columns.rst @@ -89,8 +89,8 @@ values in each row*. -Also other mathematical operators (``+``, ``-``, ``*``, ``/``,…) or -logical operators (``<``, ``>``, ``==``,…) work element-wise. The latter was already +Other mathematical operators (``+``, ``-``, ``*``, ``/``, …) and logical +operators (``<``, ``>``, ``==``, …) also work element-wise. The latter was already used in the :ref:`subset data tutorial <10min_tut_03_subset>` to filter rows of a table using a conditional expression. diff --git a/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst b/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst index 668925ce79252..1399ab66426f4 100644 --- a/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst +++ b/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst @@ -235,7 +235,7 @@ category in a column. -The function is a shortcut, as it is actually a groupby operation in combination with counting of the number of records +The function is a shortcut, it is actually a groupby operation in combination with counting the number of records within each group: .. ipython:: python diff --git a/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst b/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst index 9081f274cd941..05729809491b5 100644 --- a/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst +++ b/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst @@ -137,7 +137,7 @@ Hence, the resulting table has 3178 = 1110 + 2068 rows. Most operations like concatenation or summary statistics are by default across rows (axis 0), but can be applied across columns as well. -Sorting the table on the datetime information illustrates also the +Sorting the table on the datetime information also illustrates the combination of both tables, with the ``parameter`` column defining the origin of the table (either ``no2`` from table ``air_quality_no2`` or ``pm25`` from table ``air_quality_pm25``): @@ -286,7 +286,7 @@ between the two tables.
To user guide -pandas supports also inner, outer, and right joins. +pandas also supports inner, outer, and right joins. More information on join/merge of tables is provided in the user guide section on :ref:`database style merging of tables `. Or have a look at the :ref:`comparison with SQL` page. @@ -300,7 +300,7 @@ More information on join/merge of tables is provided in the user guide section o

REMEMBER

-- Multiple tables can be concatenated both column-wise and row-wise using +- Multiple tables can be concatenated column-wise or row-wise using the ``concat`` function. - For database-like merging/joining of tables, use the ``merge`` function. diff --git a/doc/source/getting_started/intro_tutorials/09_timeseries.rst b/doc/source/getting_started/intro_tutorials/09_timeseries.rst index b0530087e5b84..6ba3c17fac3c3 100644 --- a/doc/source/getting_started/intro_tutorials/09_timeseries.rst +++ b/doc/source/getting_started/intro_tutorials/09_timeseries.rst @@ -77,9 +77,9 @@ I want to work with the dates in the column ``datetime`` as datetime objects ins Initially, the values in ``datetime`` are character strings and do not provide any datetime operations (e.g. extract the year, day of the -week,…). By applying the ``to_datetime`` function, pandas interprets the +week, …). By applying the ``to_datetime`` function, pandas interprets the strings and convert these to datetime (i.e. ``datetime64[ns, UTC]``) -objects. In pandas we call these datetime objects similar to +objects. In pandas we call these datetime objects that are similar to ``datetime.datetime`` from the standard library as :class:`pandas.Timestamp`. .. raw:: html @@ -117,7 +117,7 @@ length of our time series: air_quality["datetime"].max() - air_quality["datetime"].min() The result is a :class:`pandas.Timedelta` object, similar to ``datetime.timedelta`` -from the standard Python library and defining a time duration. +from the standard Python library which defines a time duration. .. raw:: html @@ -257,7 +257,7 @@ the adapted time scale on plots. Let’s apply this on our data.
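[Annotation] A small sketch of the string-to-datetime conversion described above (timestamps invented for illustration):

.. ipython:: python

    import pandas as pd

    s = pd.Series(["2019-05-07 02:00", "2019-05-07 03:00"])
    dt = pd.to_datetime(s)  # strings become datetime64 values
    dt.dt.day_name()        # datetime operations are now available
    dt.max() - dt.min()     # differences yield a Timedelta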
-Create a plot of the :math:`NO_2` values in the different stations from the 20th of May till the end of 21st of May +Create a plot of the :math:`NO_2` values in the different stations from May 20th till the end of May 21st. .. ipython:: python :okwarning: @@ -295,7 +295,7 @@ Aggregate the current hourly time series values to the monthly maximum value in .. ipython:: python - monthly_max = no_2.resample("ME").max() + monthly_max = no_2.resample("MS").max() monthly_max A very powerful method on time series data with a datetime index, is the @@ -310,7 +310,7 @@ converting secondly data into 5-minutely data). The :meth:`~Series.resample` method is similar to a groupby operation: - it provides a time-based grouping, by using a string (e.g. ``M``, - ``5H``,…) that defines the target frequency + ``5H``, …) that defines the target frequency - it requires an aggregation function such as ``mean``, ``max``,… .. raw:: html diff --git a/doc/source/getting_started/intro_tutorials/10_text_data.rst b/doc/source/getting_started/intro_tutorials/10_text_data.rst index 5b1885791d8fb..8493a071863c4 100644 --- a/doc/source/getting_started/intro_tutorials/10_text_data.rst +++ b/doc/source/getting_started/intro_tutorials/10_text_data.rst @@ -134,8 +134,8 @@ only one countess on the Titanic, we get one row as a result. .. note:: More powerful extractions on strings are supported, as the :meth:`Series.str.contains` and :meth:`Series.str.extract` methods accept `regular - expressions `__, but out of - scope of this tutorial. + expressions `__, but are out of + the scope of this tutorial. .. raw:: html @@ -200,7 +200,7 @@ In the "Sex" column, replace values of "male" by "M" and values of "female" by " Whereas :meth:`~Series.replace` is not a string method, it provides a convenient way to use mappings or vocabularies to translate certain values. It requires -a ``dictionary`` to define the mapping ``{from : to}``. +a ``dictionary`` to define the mapping ``{from: to}``. .. raw:: html diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst index 3cdcb81c14961..887ffd5580a52 100644 --- a/doc/source/user_guide/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -101,7 +101,7 @@ truncated for brevity. Viewing data ------------ -See the :ref:`Essentially basics functionality section `. +See the :ref:`Essential basic functionality section `. Use :meth:`DataFrame.head` and :meth:`DataFrame.tail` to view the top and bottom rows of the frame respectively: diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 92799359a61d2..5cdc9779ef4e1 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -160,11 +160,10 @@ Here is a sample (using 100 column x 100,000 row ``DataFrames``): .. csv-table:: :header: "Operation", "0.11.0 (ms)", "Prior Version (ms)", "Ratio to Prior" :widths: 25, 25, 25, 25 - :delim: ; - ``df1 > df2``; 13.32; 125.35; 0.1063 - ``df1 * df2``; 21.71; 36.63; 0.5928 - ``df1 + df2``; 22.04; 36.50; 0.6039 + ``df1 > df2``, 13.32, 125.35, 0.1063 + ``df1 * df2``, 21.71, 36.63, 0.5928 + ``df1 + df2``, 22.04, 36.50, 0.6039 You are highly encouraged to install both libraries. See the section :ref:`Recommended Dependencies ` for more installation info. @@ -1607,7 +1606,7 @@ For instance: This method does not convert the row to a Series object; it merely returns the values inside a namedtuple. Therefore, :meth:`~DataFrame.itertuples` preserves the data type of the values -and is generally faster as :meth:`~DataFrame.iterrows`. 
+and is generally faster than :meth:`~DataFrame.iterrows`. .. note:: diff --git a/doc/source/user_guide/boolean.rst b/doc/source/user_guide/boolean.rst index 3c361d4de17e5..7de0430123fd2 100644 --- a/doc/source/user_guide/boolean.rst +++ b/doc/source/user_guide/boolean.rst @@ -37,6 +37,19 @@ If you would prefer to keep the ``NA`` values you can manually fill them with `` s[mask.fillna(True)] +If you create a column of ``NA`` values (for example to fill them later) +with ``df['new_col'] = pd.NA``, the ``dtype`` would be set to ``object`` in the +new column. The performance on this column will be worse than with +the appropriate type. It's better to use +``df['new_col'] = pd.Series(pd.NA, dtype="boolean")`` +(or another ``dtype`` that supports ``NA``). + +.. ipython:: python + + df = pd.DataFrame() + df['objects'] = pd.NA + df.dtypes + .. _boolean.kleene: Kleene logical operations diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst index 99c85ac66623d..26eb656357bf6 100644 --- a/doc/source/user_guide/gotchas.rst +++ b/doc/source/user_guide/gotchas.rst @@ -315,19 +315,8 @@ Why not make NumPy like R? Many people have suggested that NumPy should simply emulate the ``NA`` support present in the more domain-specific statistical programming language `R -`__. Part of the reason is the NumPy type hierarchy: - -.. csv-table:: - :header: "Typeclass","Dtypes" - :widths: 30,70 - :delim: | - - ``numpy.floating`` | ``float16, float32, float64, float128`` - ``numpy.integer`` | ``int8, int16, int32, int64`` - ``numpy.unsignedinteger`` | ``uint8, uint16, uint32, uint64`` - ``numpy.object_`` | ``object_`` - ``numpy.bool_`` | ``bool_`` - ``numpy.character`` | ``bytes_, str_`` +`__. Part of the reason is the +`NumPy type hierarchy `__. The R language, by contrast, only has a handful of built-in data types: ``integer``, ``numeric`` (floating-point), ``character``, and diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 8c222aff52fd7..267499edfae6f 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -506,29 +506,28 @@ listed below, those with a ``*`` do *not* have an efficient, GroupBy-specific, i .. 
csv-table:: :header: "Method", "Description" :widths: 20, 80 - :delim: ; - - :meth:`~.DataFrameGroupBy.any`;Compute whether any of the values in the groups are truthy - :meth:`~.DataFrameGroupBy.all`;Compute whether all of the values in the groups are truthy - :meth:`~.DataFrameGroupBy.count`;Compute the number of non-NA values in the groups - :meth:`~.DataFrameGroupBy.cov` * ;Compute the covariance of the groups - :meth:`~.DataFrameGroupBy.first`;Compute the first occurring value in each group - :meth:`~.DataFrameGroupBy.idxmax`;Compute the index of the maximum value in each group - :meth:`~.DataFrameGroupBy.idxmin`;Compute the index of the minimum value in each group - :meth:`~.DataFrameGroupBy.last`;Compute the last occurring value in each group - :meth:`~.DataFrameGroupBy.max`;Compute the maximum value in each group - :meth:`~.DataFrameGroupBy.mean`;Compute the mean of each group - :meth:`~.DataFrameGroupBy.median`;Compute the median of each group - :meth:`~.DataFrameGroupBy.min`;Compute the minimum value in each group - :meth:`~.DataFrameGroupBy.nunique`;Compute the number of unique values in each group - :meth:`~.DataFrameGroupBy.prod`;Compute the product of the values in each group - :meth:`~.DataFrameGroupBy.quantile`;Compute a given quantile of the values in each group - :meth:`~.DataFrameGroupBy.sem`;Compute the standard error of the mean of the values in each group - :meth:`~.DataFrameGroupBy.size`;Compute the number of values in each group - :meth:`~.DataFrameGroupBy.skew` *;Compute the skew of the values in each group - :meth:`~.DataFrameGroupBy.std`;Compute the standard deviation of the values in each group - :meth:`~.DataFrameGroupBy.sum`;Compute the sum of the values in each group - :meth:`~.DataFrameGroupBy.var`;Compute the variance of the values in each group + + :meth:`~.DataFrameGroupBy.any`,Compute whether any of the values in the groups are truthy + :meth:`~.DataFrameGroupBy.all`,Compute whether all of the values in the groups are truthy + :meth:`~.DataFrameGroupBy.count`,Compute the number of non-NA values in the groups + :meth:`~.DataFrameGroupBy.cov` * ,Compute the covariance of the groups + :meth:`~.DataFrameGroupBy.first`,Compute the first occurring value in each group + :meth:`~.DataFrameGroupBy.idxmax`,Compute the index of the maximum value in each group + :meth:`~.DataFrameGroupBy.idxmin`,Compute the index of the minimum value in each group + :meth:`~.DataFrameGroupBy.last`,Compute the last occurring value in each group + :meth:`~.DataFrameGroupBy.max`,Compute the maximum value in each group + :meth:`~.DataFrameGroupBy.mean`,Compute the mean of each group + :meth:`~.DataFrameGroupBy.median`,Compute the median of each group + :meth:`~.DataFrameGroupBy.min`,Compute the minimum value in each group + :meth:`~.DataFrameGroupBy.nunique`,Compute the number of unique values in each group + :meth:`~.DataFrameGroupBy.prod`,Compute the product of the values in each group + :meth:`~.DataFrameGroupBy.quantile`,Compute a given quantile of the values in each group + :meth:`~.DataFrameGroupBy.sem`,Compute the standard error of the mean of the values in each group + :meth:`~.DataFrameGroupBy.size`,Compute the number of values in each group + :meth:`~.DataFrameGroupBy.skew` * ,Compute the skew of the values in each group + :meth:`~.DataFrameGroupBy.std`,Compute the standard deviation of the values in each group + :meth:`~.DataFrameGroupBy.sum`,Compute the sum of the values in each group + :meth:`~.DataFrameGroupBy.var`,Compute the variance of the values in each group Some 
examples: @@ -832,19 +831,18 @@ The following methods on GroupBy act as transformations. .. csv-table:: :header: "Method", "Description" :widths: 20, 80 - :delim: ; - - :meth:`~.DataFrameGroupBy.bfill`;Back fill NA values within each group - :meth:`~.DataFrameGroupBy.cumcount`;Compute the cumulative count within each group - :meth:`~.DataFrameGroupBy.cummax`;Compute the cumulative max within each group - :meth:`~.DataFrameGroupBy.cummin`;Compute the cumulative min within each group - :meth:`~.DataFrameGroupBy.cumprod`;Compute the cumulative product within each group - :meth:`~.DataFrameGroupBy.cumsum`;Compute the cumulative sum within each group - :meth:`~.DataFrameGroupBy.diff`;Compute the difference between adjacent values within each group - :meth:`~.DataFrameGroupBy.ffill`;Forward fill NA values within each group - :meth:`~.DataFrameGroupBy.pct_change`;Compute the percent change between adjacent values within each group - :meth:`~.DataFrameGroupBy.rank`;Compute the rank of each value within each group - :meth:`~.DataFrameGroupBy.shift`;Shift values up or down within each group + + :meth:`~.DataFrameGroupBy.bfill`,Back fill NA values within each group + :meth:`~.DataFrameGroupBy.cumcount`,Compute the cumulative count within each group + :meth:`~.DataFrameGroupBy.cummax`,Compute the cumulative max within each group + :meth:`~.DataFrameGroupBy.cummin`,Compute the cumulative min within each group + :meth:`~.DataFrameGroupBy.cumprod`,Compute the cumulative product within each group + :meth:`~.DataFrameGroupBy.cumsum`,Compute the cumulative sum within each group + :meth:`~.DataFrameGroupBy.diff`,Compute the difference between adjacent values within each group + :meth:`~.DataFrameGroupBy.ffill`,Forward fill NA values within each group + :meth:`~.DataFrameGroupBy.pct_change`,Compute the percent change between adjacent values within each group + :meth:`~.DataFrameGroupBy.rank`,Compute the rank of each value within each group + :meth:`~.DataFrameGroupBy.shift`,Shift values up or down within each group In addition, passing any built-in aggregation method as a string to :meth:`~.DataFrameGroupBy.transform` (see the next section) will broadcast the result @@ -1092,11 +1090,10 @@ efficient, GroupBy-specific, implementation. .. csv-table:: :header: "Method", "Description" :widths: 20, 80 - :delim: ; - :meth:`~.DataFrameGroupBy.head`;Select the top row(s) of each group - :meth:`~.DataFrameGroupBy.nth`;Select the nth row(s) of each group - :meth:`~.DataFrameGroupBy.tail`;Select the bottom row(s) of each group + :meth:`~.DataFrameGroupBy.head`,Select the top row(s) of each group + :meth:`~.DataFrameGroupBy.nth`,Select the nth row(s) of each group + :meth:`~.DataFrameGroupBy.tail`,Select the bottom row(s) of each group Users can also use transformations along with Boolean indexing to construct complex filtrations within groups. For example, suppose we are given groups of products and diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 0da87e1d31fec..503f7cc7cbe73 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -94,13 +94,14 @@ well). Any of the axes accessors may be the null slice ``:``. Axes left out of the specification are assumed to be ``:``, e.g. ``p.loc['a']`` is equivalent to ``p.loc['a', :]``. -.. csv-table:: - :header: "Object Type", "Indexers" - :widths: 30, 50 - :delim: ; - Series; ``s.loc[indexer]`` - DataFrame; ``df.loc[row_indexer,column_indexer]`` +.. 
ipython:: python + + ser = pd.Series(range(5), index=list("abcde")) + ser.loc[["a", "c", "e"]] + + df = pd.DataFrame(np.arange(25).reshape(5, 5), index=list("abcde"), columns=list("abcde")) + df.loc[["a", "c", "e"], ["b", "d"]] .. _indexing.basics: @@ -116,10 +117,9 @@ indexing pandas objects with ``[]``: .. csv-table:: :header: "Object Type", "Selection", "Return Value Type" :widths: 30, 30, 60 - :delim: ; - Series; ``series[label]``; scalar value - DataFrame; ``frame[colname]``; ``Series`` corresponding to colname + Series, ``series[label]``, scalar value + DataFrame, ``frame[colname]``, ``Series`` corresponding to colname Here we construct a simple time series data set to use for illustrating the indexing functionality: @@ -403,9 +403,9 @@ are returned: s = pd.Series(list('abcde'), index=[0, 3, 2, 5, 4]) s.loc[3:5] -If at least one of the two is absent, but the index is sorted, and can be -compared against start and stop labels, then slicing will still work as -expected, by selecting labels which *rank* between the two: +If the index is sorted, and can be compared against start and stop labels, +then slicing will still work as expected, by selecting labels which *rank* +between the two: .. ipython:: python @@ -1711,6 +1711,6 @@ Why does assignment fail when using chained indexing? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :ref:`Copy-on-Write ` is the new default with pandas 3.0. -This means than chained indexing will never work. +This means that chained indexing will never work. See :ref:`this section ` for more context. diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index 1a727cd78af09..76a2f22b7987d 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -84,6 +84,19 @@ with the dtype. In the future, we may provide an option for :class:`Series` to infer a nullable-integer dtype. +If you create a column of ``NA`` values (for example to fill them later) +with ``df['new_col'] = pd.NA``, the ``dtype`` would be set to ``object`` in the +new column. The performance on this column will be worse than with +the appropriate type. It's better to use +``df['new_col'] = pd.Series(pd.NA, dtype="Int64")`` +(or another ``dtype`` that supports ``NA``). + +.. ipython:: python + + df = pd.DataFrame() + df['objects'] = pd.NA + df.dtypes + Operations ---------- diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 49609a80d7e15..c523f3a641d91 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -16,26 +16,25 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like .. 
csv-table:: :header: "Format Type", "Data Description", "Reader", "Writer" :widths: 30, 100, 60, 60 - :delim: ; - - text;`CSV `__;:ref:`read_csv`;:ref:`to_csv` - text;Fixed-Width Text File;:ref:`read_fwf` - text;`JSON `__;:ref:`read_json`;:ref:`to_json` - text;`HTML `__;:ref:`read_html`;:ref:`to_html` - text;`LaTeX `__;;:ref:`Styler.to_latex` - text;`XML `__;:ref:`read_xml`;:ref:`to_xml` - text; Local clipboard;:ref:`read_clipboard`;:ref:`to_clipboard` - binary;`MS Excel `__;:ref:`read_excel`;:ref:`to_excel` - binary;`OpenDocument `__;:ref:`read_excel`; - binary;`HDF5 Format `__;:ref:`read_hdf`;:ref:`to_hdf` - binary;`Feather Format `__;:ref:`read_feather`;:ref:`to_feather` - binary;`Parquet Format `__;:ref:`read_parquet`;:ref:`to_parquet` - binary;`ORC Format `__;:ref:`read_orc`;:ref:`to_orc` - binary;`Stata `__;:ref:`read_stata`;:ref:`to_stata` - binary;`SAS `__;:ref:`read_sas`; - binary;`SPSS `__;:ref:`read_spss`; - binary;`Python Pickle Format `__;:ref:`read_pickle`;:ref:`to_pickle` - SQL;`SQL `__;:ref:`read_sql`;:ref:`to_sql` + + text,`CSV `__, :ref:`read_csv`, :ref:`to_csv` + text,Fixed-Width Text File, :ref:`read_fwf` , NA + text,`JSON `__, :ref:`read_json`, :ref:`to_json` + text,`HTML `__, :ref:`read_html`, :ref:`to_html` + text,`LaTeX `__, :ref:`Styler.to_latex` , NA + text,`XML `__, :ref:`read_xml`, :ref:`to_xml` + text, Local clipboard, :ref:`read_clipboard`, :ref:`to_clipboard` + binary,`MS Excel `__ , :ref:`read_excel`, :ref:`to_excel` + binary,`OpenDocument `__, :ref:`read_excel`, NA + binary,`HDF5 Format `__, :ref:`read_hdf`, :ref:`to_hdf` + binary,`Feather Format `__, :ref:`read_feather`, :ref:`to_feather` + binary,`Parquet Format `__, :ref:`read_parquet`, :ref:`to_parquet` + binary,`ORC Format `__, :ref:`read_orc`, :ref:`to_orc` + binary,`Stata `__, :ref:`read_stata`, :ref:`to_stata` + binary,`SAS `__, :ref:`read_sas` , NA + binary,`SPSS `__, :ref:`read_spss` , NA + binary,`Python Pickle Format `__, :ref:`read_pickle`, :ref:`to_pickle` + SQL,`SQL `__, :ref:`read_sql`,:ref:`to_sql` :ref:`Here ` is an informal performance comparison for some of these IO methods. @@ -74,14 +73,6 @@ sep : str, defaults to ``','`` for :func:`read_csv`, ``\t`` for :func:`read_tabl delimiters are prone to ignoring quoted data. Regex example: ``'\\r\\t'``. delimiter : str, default ``None`` Alternative argument name for sep. -delim_whitespace : boolean, default False - Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) - will be used as the delimiter. Equivalent to setting ``sep='\s+'``. - If this option is set to ``True``, nothing should be passed in for the - ``delimiter`` parameter. - - .. deprecated: 2.2.0 - Use ``sep="\\s+" instead. Column and index locations and names ++++++++++++++++++++++++++++++++++++ @@ -271,34 +262,9 @@ parse_dates : boolean or list of ints or names or list of lists or dict, default * If ``True`` -> try parsing the index. * If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 each as a separate date column. - * If ``[[1, 3]]`` -> combine columns 1 and 3 and parse as a single date - column. - * If ``{'foo': [1, 3]}`` -> parse columns 1, 3 as date and call result 'foo'. .. note:: A fast-path exists for iso8601-formatted dates. -infer_datetime_format : boolean, default ``False`` - If ``True`` and parse_dates is enabled for a column, attempt to infer the - datetime format to speed up the processing. - - .. deprecated:: 2.0.0 - A strict version of this argument is now the default, passing it has no effect. 
-keep_date_col : boolean, default ``False`` - If ``True`` and parse_dates specifies combining multiple columns then keep the - original columns. -date_parser : function, default ``None`` - Function to use for converting a sequence of string columns to an array of - datetime instances. The default uses ``dateutil.parser.parser`` to do the - conversion. pandas will try to call date_parser in three different ways, - advancing to the next if an exception occurs: 1) Pass one or more arrays (as - defined by parse_dates) as arguments; 2) concatenate (row-wise) the string - values from the columns defined by parse_dates into a single array and pass - that; and 3) call date_parser once for each row using one or more strings - (corresponding to the columns defined by parse_dates) as arguments. - - .. deprecated:: 2.0.0 - Use ``date_format`` instead, or read in as ``object`` and then apply - :func:`to_datetime` as-needed. date_format : str or dict of column -> format, default ``None`` If used in conjunction with ``parse_dates``, will parse dates according to this format. For anything more complex, @@ -830,71 +796,8 @@ The simplest case is to just pass in ``parse_dates=True``: It is often the case that we may want to store date and time data separately, or store various date fields separately. the ``parse_dates`` keyword can be -used to specify a combination of columns to parse the dates and/or times from. - -You can specify a list of column lists to ``parse_dates``, the resulting date -columns will be prepended to the output (so as to not affect the existing column -order) and the new column names will be the concatenation of the component -column names: - -.. ipython:: python - :okwarning: - - data = ( - "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900" - ) - - with open("tmp.csv", "w") as fh: - fh.write(data) - - df = pd.read_csv("tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]]) - df - -By default the parser removes the component date columns, but you can choose -to retain them via the ``keep_date_col`` keyword: - -.. ipython:: python - :okwarning: - - df = pd.read_csv( - "tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]], keep_date_col=True - ) - df - -Note that if you wish to combine multiple columns into a single date column, a -nested list must be used. In other words, ``parse_dates=[1, 2]`` indicates that -the second and third columns should each be parsed as separate date columns -while ``parse_dates=[[1, 2]]`` means the two columns should be parsed into a -single column. - -You can also use a dict to specify custom name columns: - -.. ipython:: python - :okwarning: - - date_spec = {"nominal": [1, 2], "actual": [1, 3]} - df = pd.read_csv("tmp.csv", header=None, parse_dates=date_spec) - df - -It is important to remember that if multiple text columns are to be parsed into -a single date column, then a new column is prepended to the data. The ``index_col`` -specification is based off of this new set of columns rather than the original -data columns: - - -.. ipython:: python - :okwarning: +used to specify columns to parse the dates and/or times. - date_spec = {"nominal": [1, 2], "actual": [1, 3]} - df = pd.read_csv( - "tmp.csv", header=None, parse_dates=date_spec, index_col=0 - ) # index is the nominal column - df .. 
note:: If a column or index contains an unparsable date, the entire column or @@ -908,10 +811,6 @@ data columns: for your data to store datetimes in this format, load times will be significantly faster, ~20x has been observed. -.. deprecated:: 2.2.0 - Combining date columns inside read_csv is deprecated. Use ``pd.to_datetime`` - on the relevant result columns instead. - Date parsing functions ++++++++++++++++++++++ @@ -927,12 +826,6 @@ Performance-wise, you should try these methods of parsing dates in order: then use ``to_datetime``. -.. ipython:: python - :suppress: - - os.remove("tmp.csv") - - .. _io.csv.mixed_timezones: Parsing a CSV with mixed timezones @@ -1618,7 +1511,6 @@ Currently, options unsupported by the C and pyarrow engines include: * ``sep`` other than a single character (e.g. regex separators) * ``skipfooter`` -* ``sep=None`` with ``delim_whitespace=False`` Specifying any of the above options will produce a ``ParserWarning`` unless the python engine is selected explicitly using ``engine='python'``. @@ -1633,14 +1525,12 @@ Options that are unsupported by the pyarrow engine which are not covered by the * ``memory_map`` * ``dialect`` * ``on_bad_lines`` -* ``delim_whitespace`` * ``quoting`` * ``lineterminator`` * ``converters`` * ``decimal`` * ``iterator`` * ``dayfirst`` -* ``infer_datetime_format`` * ``verbose`` * ``skipinitialspace`` * ``low_memory`` @@ -1837,14 +1727,13 @@ with optional parameters: .. csv-table:: :widths: 20, 150 - :delim: ; - ``split``; dict like {index -> [index], columns -> [columns], data -> [values]} - ``records``; list like [{column -> value}, ... , {column -> value}] - ``index``; dict like {index -> {column -> value}} - ``columns``; dict like {column -> {index -> value}} - ``values``; just the values array - ``table``; adhering to the JSON `Table Schema`_ + ``split``, dict like {index -> [index]; columns -> [columns]; data -> [values]} + ``records``, list like [{column -> value}; ... ] + ``index``, dict like {index -> {column -> value}} + ``columns``, dict like {column -> {index -> value}} + ``values``, just the values array + ``table``, adhering to the JSON `Table Schema`_ * ``date_format`` : string, type of date conversion, 'epoch' for timestamp, 'iso' for ISO8601. * ``double_precision`` : The number of decimal places to use when encoding floating point values, default 10. @@ -2025,14 +1914,13 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series`` .. csv-table:: :widths: 20, 150 - :delim: ; - ``split``; dict like {index -> [index], columns -> [columns], data -> [values]} - ``records``; list like [{column -> value}, ... , {column -> value}] - ``index``; dict like {index -> {column -> value}} - ``columns``; dict like {column -> {index -> value}} - ``values``; just the values array - ``table``; adhering to the JSON `Table Schema`_ + ``split``, dict like {index -> [index]; columns -> [columns]; data -> [values]} + ``records``, list like [{column -> value} ...] + ``index``, dict like {index -> {column -> value}} + ``columns``, dict like {column -> {index -> value}} + ``values``, just the values array + ``table``, adhering to the JSON `Table Schema`_ * ``dtype`` : if True, infer dtypes, if a dict of column to dtype, then use those, if ``False``, then don't infer dtypes at all, default is True, apply only to the data. @@ -3115,7 +3003,7 @@ However, if XPath does not reference node names such as default, ``/*``, then .. 
note:: Since ``xpath`` identifies the parent of content to be parsed, only immediate - desendants which include child nodes or current attributes are parsed. + descendants which include child nodes or current attributes are parsed. Therefore, ``read_xml`` will not parse the text of grandchildren or other descendants and will not parse attributes of any descendant. To retrieve lower level content, adjust xpath to lower level. For example, @@ -3647,7 +3535,7 @@ For example, to read in a ``MultiIndex`` index without names: df = pd.read_excel("path_to_file.xlsx", index_col=[0, 1]) df -If the index has level names, they will parsed as well, using the same +If the index has level names, they will be parsed as well, using the same parameters. .. ipython:: python @@ -5959,10 +5847,10 @@ You can check if a table exists using :func:`~pandas.io.sql.has_table` Schema support '''''''''''''' -Reading from and writing to different schema's is supported through the ``schema`` +Reading from and writing to different schemas is supported through the ``schema`` keyword in the :func:`~pandas.read_sql_table` and :func:`~pandas.DataFrame.to_sql` functions. Note however that this depends on the database flavor (sqlite does not -have schema's). For example: +have schemas). For example: .. code-block:: python diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index 1edf3908936db..cfd2f40aa93a3 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -484,7 +484,7 @@ either the left or right tables, the values in the joined table will be p.plot([left, right], result, labels=["left", "right"], vertical=False); plt.close("all"); -You can :class:`Series` and a :class:`DataFrame` with a :class:`MultiIndex` if the names of +You can merge :class:`Series` and a :class:`DataFrame` with a :class:`MultiIndex` if the names of the :class:`MultiIndex` correspond to the columns from the :class:`DataFrame`. Transform the :class:`Series` to a :class:`DataFrame` using :meth:`Series.reset_index` before merging @@ -763,7 +763,7 @@ Joining a single Index to a MultiIndex ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ You can join a :class:`DataFrame` with a :class:`Index` to a :class:`DataFrame` with a :class:`MultiIndex` on a level. -The ``name`` of the :class:`Index` with match the level name of the :class:`MultiIndex`. +The ``name`` of the :class:`Index` will match the level name of the :class:`MultiIndex`. .. ipython:: python @@ -974,7 +974,7 @@ with optional filling of missing data with ``fill_method``. :func:`merge_asof` --------------------- -:func:`merge_asof` is similar to an ordered left-join except that mactches are on the +:func:`merge_asof` is similar to an ordered left-join except that matches are on the nearest key rather than equal keys. For each row in the ``left`` :class:`DataFrame`, the last row in the ``right`` :class:`DataFrame` are selected where the ``on`` key is less than the left's key. Both :class:`DataFrame` must be sorted by the key. @@ -1073,7 +1073,7 @@ compare two :class:`DataFrame` or :class:`Series`, respectively, and summarize t df.compare(df2) By default, if two corresponding values are equal, they will be shown as ``NaN``. -Furthermore, if all values in an entire row / column, the row / column will be +Furthermore, if all values in an entire row / column are equal, that row / column will be omitted from the result. The remaining differences will be aligned on columns. Stack the differences on rows. 
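The :func:`merge_asof` wording fixed just above describes a nearest-key join that is easy to misread, so a concrete example may help. This is a minimal sketch; the frames, tickers, and timestamps are illustrative and not taken from the patch:

.. code-block:: python

    import pandas as pd

    # Both frames must be sorted by the "on" key.
    trades = pd.DataFrame(
        {
            "time": pd.to_datetime(
                ["2024-01-02 09:30:00.023", "2024-01-02 09:30:00.038"]
            ),
            "ticker": ["MSFT", "MSFT"],
            "price": [51.95, 51.99],
        }
    )
    quotes = pd.DataFrame(
        {
            "time": pd.to_datetime(
                ["2024-01-02 09:30:00.010", "2024-01-02 09:30:00.030"]
            ),
            "ticker": ["MSFT", "MSFT"],
            "bid": [51.93, 51.94],
        }
    )

    # For each trade, select the last quote whose "time" is less than or
    # equal to the trade's "time" (the default allow_exact_matches=True):
    # a nearest-key match, not an equal-key join.
    print(pd.merge_asof(trades, quotes, on="time", by="ticker"))

Here the first trade picks up the 09:30:00.010 quote and the second picks up the 09:30:00.030 quote.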
diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 5149bd30dbbef..4e0245312b827 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -32,7 +32,7 @@ use :class:`api.typing.NaTType`. :class:`NA` for :class:`StringDtype`, :class:`Int64Dtype` (and other bit widths), :class:`Float64Dtype` (and other bit widths), :class:`BooleanDtype` and :class:`ArrowDtype`. These types will maintain the original data type of the data. -For typing applications, use :class:`api.types.NAType`. +For typing applications, use :class:`api.typing.NAType`. .. ipython:: python @@ -319,7 +319,7 @@ Missing values propagate through arithmetic operations between pandas objects. The descriptive statistics and computational methods discussed in the :ref:`data structure overview ` (and listed :ref:`here -` and :ref:`here `) are all +` and :ref:`here `) all account for missing data. When summing data, NA values or empty data will be treated as zero. @@ -337,10 +337,8 @@ When taking the product, NA values or empty data will be treated as 1. pd.Series([], dtype="float64").prod() Cumulative methods like :meth:`~DataFrame.cumsum` and :meth:`~DataFrame.cumprod` -ignore NA values by default preserve them in the result. This behavior can be changed -with ``skipna`` - -* Cumulative methods like :meth:`~DataFrame.cumsum` and :meth:`~DataFrame.cumprod` ignore NA values by default, but preserve them in the resulting arrays. To override this behaviour and include NA values, use ``skipna=False``. +ignore NA values by default, but preserve them in the resulting array. To override +this behaviour and include NA values in the calculation, use ``skipna=False``. .. ipython:: python @@ -355,7 +353,7 @@ with ``skipna`` Dropping missing data ~~~~~~~~~~~~~~~~~~~~~ -:meth:`~DataFrame.dropna` dropa rows or columns with missing data. +:meth:`~DataFrame.dropna` drops rows or columns with missing data. .. ipython:: python diff --git a/doc/source/user_guide/options.rst b/doc/source/user_guide/options.rst index ce805f98ca528..7757d95c2bccd 100644 --- a/doc/source/user_guide/options.rst +++ b/doc/source/user_guide/options.rst @@ -8,7 +8,7 @@ Options and settings Overview -------- -pandas has an options API configure and customize global behavior related to +pandas has an options API to configure and customize global behavior related to :class:`DataFrame` display, data behavior and more. Options have a full "dotted-style", case-insensitive name (e.g. ``display.max_rows``). diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index 43da43a983429..04ba3e5be8ff7 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -211,7 +211,7 @@ "source": [ "## Styler Object and HTML \n", "\n", - "The [Styler][styler] was originally constructed to support the wide array of HTML formatting options. Its HTML output creates an HTML `` and leverages CSS styling language to manipulate many parameters including colors, fonts, borders, background, etc. See [here][w3schools] for more information on styling HTML tables. This allows a lot of flexibility out of the box, and even enables web developers to integrate DataFrames into their exiting user interface designs.\n", + "The [Styler][styler] was originally constructed to support the wide array of HTML formatting options. Its HTML output creates an HTML `
<table>` and leverages CSS styling language to manipulate many parameters including colors, fonts, borders, background, etc. See [here][w3schools] for more information on styling HTML tables. This allows a lot of flexibility out of the box, and even enables web developers to integrate DataFrames into their existing user interface designs.\n", "\n", "Below we demonstrate the default output, which looks very similar to the standard DataFrame HTML representation. But the HTML here has already attached some CSS classes to each cell, even if we haven't yet created any styles. We can view these by calling the [.to_html()][tohtml] method, which returns the raw HTML as a string, which is useful for further processing or adding to a file - read on in [More about CSS and HTML](#More-About-CSS-and-HTML). This section will also provide a walkthrough for how to convert this default output to represent a DataFrame output that is more communicative. For example, how we can build `s`:\n", "\n", diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index cf27fc8385223..ad2690ae395be 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -726,57 +726,56 @@ Method summary .. csv-table:: :header: "Method", "Description" :widths: 20, 80 - :delim: ; - - :meth:`~Series.str.cat`;Concatenate strings - :meth:`~Series.str.split`;Split strings on delimiter - :meth:`~Series.str.rsplit`;Split strings on delimiter working from the end of the string - :meth:`~Series.str.get`;Index into each element (retrieve i-th element) - :meth:`~Series.str.join`;Join strings in each element of the Series with passed separator - :meth:`~Series.str.get_dummies`;Split strings on the delimiter returning DataFrame of dummy variables - :meth:`~Series.str.contains`;Return boolean array if each string contains pattern/regex - :meth:`~Series.str.replace`;Replace occurrences of pattern/regex/string with some other string or the return value of a callable given the occurrence - :meth:`~Series.str.removeprefix`;Remove prefix from string, i.e. only remove if string starts with prefix. - :meth:`~Series.str.removesuffix`;Remove suffix from string, i.e. only remove if string ends with suffix. 
- :meth:`~Series.str.repeat`;Duplicate values (``s.str.repeat(3)`` equivalent to ``x * 3``) - :meth:`~Series.str.pad`;"Add whitespace to left, right, or both sides of strings" - :meth:`~Series.str.center`;Equivalent to ``str.center`` - :meth:`~Series.str.ljust`;Equivalent to ``str.ljust`` - :meth:`~Series.str.rjust`;Equivalent to ``str.rjust`` - :meth:`~Series.str.zfill`;Equivalent to ``str.zfill`` - :meth:`~Series.str.wrap`;Split long strings into lines with length less than a given width - :meth:`~Series.str.slice`;Slice each string in the Series - :meth:`~Series.str.slice_replace`;Replace slice in each string with passed value - :meth:`~Series.str.count`;Count occurrences of pattern - :meth:`~Series.str.startswith`;Equivalent to ``str.startswith(pat)`` for each element - :meth:`~Series.str.endswith`;Equivalent to ``str.endswith(pat)`` for each element - :meth:`~Series.str.findall`;Compute list of all occurrences of pattern/regex for each string - :meth:`~Series.str.match`;"Call ``re.match`` on each element, returning matched groups as list" - :meth:`~Series.str.extract`;"Call ``re.search`` on each element, returning DataFrame with one row for each element and one column for each regex capture group" - :meth:`~Series.str.extractall`;"Call ``re.findall`` on each element, returning DataFrame with one row for each match and one column for each regex capture group" - :meth:`~Series.str.len`;Compute string lengths - :meth:`~Series.str.strip`;Equivalent to ``str.strip`` - :meth:`~Series.str.rstrip`;Equivalent to ``str.rstrip`` - :meth:`~Series.str.lstrip`;Equivalent to ``str.lstrip`` - :meth:`~Series.str.partition`;Equivalent to ``str.partition`` - :meth:`~Series.str.rpartition`;Equivalent to ``str.rpartition`` - :meth:`~Series.str.lower`;Equivalent to ``str.lower`` - :meth:`~Series.str.casefold`;Equivalent to ``str.casefold`` - :meth:`~Series.str.upper`;Equivalent to ``str.upper`` - :meth:`~Series.str.find`;Equivalent to ``str.find`` - :meth:`~Series.str.rfind`;Equivalent to ``str.rfind`` - :meth:`~Series.str.index`;Equivalent to ``str.index`` - :meth:`~Series.str.rindex`;Equivalent to ``str.rindex`` - :meth:`~Series.str.capitalize`;Equivalent to ``str.capitalize`` - :meth:`~Series.str.swapcase`;Equivalent to ``str.swapcase`` - :meth:`~Series.str.normalize`;Return Unicode normal form. 
Equivalent to ``unicodedata.normalize`` - :meth:`~Series.str.translate`;Equivalent to ``str.translate`` - :meth:`~Series.str.isalnum`;Equivalent to ``str.isalnum`` - :meth:`~Series.str.isalpha`;Equivalent to ``str.isalpha`` - :meth:`~Series.str.isdigit`;Equivalent to ``str.isdigit`` - :meth:`~Series.str.isspace`;Equivalent to ``str.isspace`` - :meth:`~Series.str.islower`;Equivalent to ``str.islower`` - :meth:`~Series.str.isupper`;Equivalent to ``str.isupper`` - :meth:`~Series.str.istitle`;Equivalent to ``str.istitle`` - :meth:`~Series.str.isnumeric`;Equivalent to ``str.isnumeric`` - :meth:`~Series.str.isdecimal`;Equivalent to ``str.isdecimal`` + + :meth:`~Series.str.cat`,Concatenate strings + :meth:`~Series.str.split`,Split strings on delimiter + :meth:`~Series.str.rsplit`,Split strings on delimiter working from the end of the string + :meth:`~Series.str.get`,Index into each element (retrieve i-th element) + :meth:`~Series.str.join`,Join strings in each element of the Series with passed separator + :meth:`~Series.str.get_dummies`,Split strings on the delimiter returning DataFrame of dummy variables + :meth:`~Series.str.contains`,Return boolean array if each string contains pattern/regex + :meth:`~Series.str.replace`,Replace occurrences of pattern/regex/string with some other string or the return value of a callable given the occurrence + :meth:`~Series.str.removeprefix`,Remove prefix from string i.e. only remove if string starts with prefix. + :meth:`~Series.str.removesuffix`,Remove suffix from string i.e. only remove if string ends with suffix. + :meth:`~Series.str.repeat`,Duplicate values (``s.str.repeat(3)`` equivalent to ``x * 3``) + :meth:`~Series.str.pad`,Add whitespace to the sides of strings + :meth:`~Series.str.center`,Equivalent to ``str.center`` + :meth:`~Series.str.ljust`,Equivalent to ``str.ljust`` + :meth:`~Series.str.rjust`,Equivalent to ``str.rjust`` + :meth:`~Series.str.zfill`,Equivalent to ``str.zfill`` + :meth:`~Series.str.wrap`,Split long strings into lines with length less than a given width + :meth:`~Series.str.slice`,Slice each string in the Series + :meth:`~Series.str.slice_replace`,Replace slice in each string with passed value + :meth:`~Series.str.count`,Count occurrences of pattern + :meth:`~Series.str.startswith`,Equivalent to ``str.startswith(pat)`` for each element + :meth:`~Series.str.endswith`,Equivalent to ``str.endswith(pat)`` for each element + :meth:`~Series.str.findall`,Compute list of all occurrences of pattern/regex for each string + :meth:`~Series.str.match`,Call ``re.match`` on each element returning matched groups as list + :meth:`~Series.str.extract`,Call ``re.search`` on each element returning DataFrame with one row for each element and one column for each regex capture group + :meth:`~Series.str.extractall`,Call ``re.findall`` on each element returning DataFrame with one row for each match and one column for each regex capture group + :meth:`~Series.str.len`,Compute string lengths + :meth:`~Series.str.strip`,Equivalent to ``str.strip`` + :meth:`~Series.str.rstrip`,Equivalent to ``str.rstrip`` + :meth:`~Series.str.lstrip`,Equivalent to ``str.lstrip`` + :meth:`~Series.str.partition`,Equivalent to ``str.partition`` + :meth:`~Series.str.rpartition`,Equivalent to ``str.rpartition`` + :meth:`~Series.str.lower`,Equivalent to ``str.lower`` + :meth:`~Series.str.casefold`,Equivalent to ``str.casefold`` + :meth:`~Series.str.upper`,Equivalent to ``str.upper`` + :meth:`~Series.str.find`,Equivalent to ``str.find`` + :meth:`~Series.str.rfind`,Equivalent to 
``str.rfind`` + :meth:`~Series.str.index`,Equivalent to ``str.index`` + :meth:`~Series.str.rindex`,Equivalent to ``str.rindex`` + :meth:`~Series.str.capitalize`,Equivalent to ``str.capitalize`` + :meth:`~Series.str.swapcase`,Equivalent to ``str.swapcase`` + :meth:`~Series.str.normalize`,Return Unicode normal form. Equivalent to ``unicodedata.normalize`` + :meth:`~Series.str.translate`,Equivalent to ``str.translate`` + :meth:`~Series.str.isalnum`,Equivalent to ``str.isalnum`` + :meth:`~Series.str.isalpha`,Equivalent to ``str.isalpha`` + :meth:`~Series.str.isdigit`,Equivalent to ``str.isdigit`` + :meth:`~Series.str.isspace`,Equivalent to ``str.isspace`` + :meth:`~Series.str.islower`,Equivalent to ``str.islower`` + :meth:`~Series.str.isupper`,Equivalent to ``str.isupper`` + :meth:`~Series.str.istitle`,Equivalent to ``str.istitle`` + :meth:`~Series.str.isnumeric`,Equivalent to ``str.isnumeric`` + :meth:`~Series.str.isdecimal`,Equivalent to ``str.isdecimal`` diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 37413722de96f..0fa36f1e30104 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -326,7 +326,7 @@ which can be specified. These are computed from the starting point specified by .. note:: The ``unit`` parameter does not use the same strings as the ``format`` parameter - that was discussed :ref:`above`). The + that was discussed :ref:`above`. The available units are listed on the documentation for :func:`pandas.to_datetime`. Constructing a :class:`Timestamp` or :class:`DatetimeIndex` with an epoch timestamp @@ -1273,6 +1273,10 @@ frequencies. We will refer to these aliases as *offset aliases*. are deprecated in favour of the aliases ``h``, ``bh``, ``cbh``, ``min``, ``s``, ``ms``, ``us``, and ``ns``. + Aliases ``Y``, ``M``, and ``Q`` are deprecated in favour of the aliases + ``YE``, ``ME``, ``QE``. + + .. note:: When using the offset aliases above, it should be noted that functions @@ -1475,7 +1479,7 @@ or some other non-observed day. Defined observance rules are: "after_nearest_workday", "apply ``nearest_workday`` and then move to next workday after that day" "sunday_to_monday", "move Sunday to following Monday" "next_monday_or_tuesday", "move Saturday to Monday and Sunday/Monday to Tuesday" - "previous_friday", move Saturday and Sunday to previous Friday" + "previous_friday", "move Saturday and Sunday to previous Friday" "next_monday", "move Saturday and Sunday to following Monday" "weekend_to_monday", "same as ``next_monday``" @@ -1860,7 +1864,7 @@ to resample based on datetimelike column in the frame, it can passed to the ), ) df - df.resample("ME", on="date")[["a"]].sum() + df.resample("MS", on="date")[["a"]].sum() Similarly, if you instead want to resample by a datetimelike level of ``MultiIndex``, its name or location can be passed to the @@ -1868,7 +1872,7 @@ level of ``MultiIndex``, its name or location can be passed to the .. ipython:: python - df.resample("ME", level="d")[["a"]].sum() + df.resample("MS", level="d")[["a"]].sum() .. _timeseries.iterating-label: diff --git a/doc/source/whatsnew/v0.13.0.rst b/doc/source/whatsnew/v0.13.0.rst index a624e81d17db9..3c5488a47bdf2 100644 --- a/doc/source/whatsnew/v0.13.0.rst +++ b/doc/source/whatsnew/v0.13.0.rst @@ -345,7 +345,6 @@ Float64Index API change .. 
ipython:: python :okwarning: - s[2:4] s.loc[2:4] s.iloc[2:4] diff --git a/doc/source/whatsnew/v0.15.0.rst b/doc/source/whatsnew/v0.15.0.rst index 8dafed1efee97..70982e723016f 100644 --- a/doc/source/whatsnew/v0.15.0.rst +++ b/doc/source/whatsnew/v0.15.0.rst @@ -445,7 +445,7 @@ Rolling/expanding moments improvements 3 5 dtype: float64 -- :func:`rolling_window` now normalizes the weights properly in rolling mean mode (`mean=True`) so that +- :func:`rolling_window` now normalizes the weights properly in rolling mean mode (``mean=True``) so that the calculated weighted means (e.g. 'triang', 'gaussian') are distributed about the same means as those calculated without weighting (i.e. 'boxcar'). See :ref:`the note on normalization ` for further details. (:issue:`7618`) diff --git a/doc/source/whatsnew/v0.15.2.rst b/doc/source/whatsnew/v0.15.2.rst index acc5409b86d09..a9003710540d7 100644 --- a/doc/source/whatsnew/v0.15.2.rst +++ b/doc/source/whatsnew/v0.15.2.rst @@ -212,7 +212,7 @@ Other enhancements: 4 True True True True - Added support for ``utcfromtimestamp()``, ``fromtimestamp()``, and ``combine()`` on ``Timestamp`` class (:issue:`5351`). -- Added Google Analytics (`pandas.io.ga`) basic documentation (:issue:`8835`). See `here `__. +- Added Google Analytics (``pandas.io.ga``) basic documentation (:issue:`8835`). See `here `__. - ``Timedelta`` arithmetic returns ``NotImplemented`` in unknown cases, allowing extensions by custom classes (:issue:`8813`). - ``Timedelta`` now supports arithmetic with ``numpy.ndarray`` objects of the appropriate dtype (numpy 1.8 or newer only) (:issue:`8884`). - Added ``Timedelta.to_timedelta64()`` method to the public API (:issue:`8884`). diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index c63d047f03823..8ddc8e5d058ca 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1653,7 +1653,7 @@ Timedelta - Bug in :class:`Index` with numeric dtype when multiplying or dividing an array with dtype ``timedelta64`` (:issue:`22390`) - Bug in :class:`TimedeltaIndex` incorrectly allowing indexing with ``Timestamp`` object (:issue:`20464`) - Fixed bug where subtracting :class:`Timedelta` from an object-dtyped array would raise ``TypeError`` (:issue:`21980`) -- Fixed bug in adding a :class:`DataFrame` with all-`timedelta64[ns]` dtypes to a :class:`DataFrame` with all-integer dtypes returning incorrect results instead of raising ``TypeError`` (:issue:`22696`) +- Fixed bug in adding a :class:`DataFrame` with all-``timedelta64[ns]`` dtypes to a :class:`DataFrame` with all-integer dtypes returning incorrect results instead of raising ``TypeError`` (:issue:`22696`) - Bug in :class:`TimedeltaIndex` where adding a timezone-aware datetime scalar incorrectly returned a timezone-naive :class:`DatetimeIndex` (:issue:`23215`) - Bug in :class:`TimedeltaIndex` where adding ``np.timedelta64('NaT')`` incorrectly returned an all-``NaT`` :class:`DatetimeIndex` instead of an all-``NaT`` :class:`TimedeltaIndex` (:issue:`23215`) - Bug in :class:`Timedelta` and :func:`to_timedelta()` have inconsistencies in supported unit string (:issue:`21762`) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 5cf5623f73036..50be28a912cf6 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -1082,7 +1082,7 @@ Numeric - Bug in :meth:`~pandas.eval` when comparing floats with scalar operators, for example: ``x < -0.1`` (:issue:`25928`) - Fixed bug where casting all-boolean array to 
integer extension array failed (:issue:`25211`) - Bug in ``divmod`` with a :class:`Series` object containing zeros incorrectly raising ``AttributeError`` (:issue:`26987`) -- Inconsistency in :class:`Series` floor-division (`//`) and ``divmod`` filling positive//zero with ``NaN`` instead of ``Inf`` (:issue:`27321`) +- Inconsistency in :class:`Series` floor-division (``//``) and ``divmod`` filling positive//zero with ``NaN`` instead of ``Inf`` (:issue:`27321`) - Conversion diff --git a/doc/source/whatsnew/v1.1.4.rst b/doc/source/whatsnew/v1.1.4.rst index ef0c4d741ca58..c6fda04070240 100644 --- a/doc/source/whatsnew/v1.1.4.rst +++ b/doc/source/whatsnew/v1.1.4.rst @@ -31,7 +31,7 @@ Fixed regressions - Fixed regression in setitem with :meth:`DataFrame.iloc` which raised error when trying to set a value while filtering with a boolean list (:issue:`36741`) - Fixed regression in setitem with a Series getting aligned before setting the values (:issue:`37427`) - Fixed regression in :attr:`MultiIndex.is_monotonic_increasing` returning wrong results with ``NaN`` in at least one of the levels (:issue:`37220`) -- Fixed regression in inplace arithmetic operation (`+=`) on a Series not updating the parent DataFrame/Series (:issue:`36373`) +- Fixed regression in inplace arithmetic operation (``+=``) on a Series not updating the parent DataFrame/Series (:issue:`36373`) .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index bdde8de83d98d..626727a64fea7 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -42,8 +42,11 @@ Other enhancements - :meth:`DataFrame.corrwith` now accepts ``min_periods`` as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`) - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`) - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`) +- :meth:`DataFrame.pivot_table` and :func:`pivot_table` now allow the passing of keyword arguments to ``aggfunc`` through ``**kwargs`` (:issue:`57884`) - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) +- Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`) +- Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`) .. --------------------------------------------------------------------------- .. _whatsnew_300.notable_bug_fixes: @@ -122,6 +125,69 @@ notable_bug_fix2 Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. _whatsnew_300.api_breaking.datetime_resolution_inference: + +Datetime resolution inference +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Converting a sequence of strings, ``datetime`` objects, or ``np.datetime64`` objects to +a ``datetime64`` dtype now performs inference on the appropriate resolution (AKA unit) for the output dtype. This affects :class:`Series`, :class:`DataFrame`, :class:`Index`, :class:`DatetimeIndex`, and :func:`to_datetime`. + +Previously, these would always give nanosecond resolution: + +.. 
code-block:: ipython + + In [1]: dt = pd.Timestamp("2024-03-22 11:36").to_pydatetime() + In [2]: pd.to_datetime([dt]).dtype + Out[2]: dtype('<M8[ns]') diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py --- a/pandas/_config/__init__.py +++ b/pandas/_config/__init__.py -def using_copy_on_write() -> bool: - return True - - def using_pyarrow_string_dtype() -> bool: _mode_options = _global_config["future"] return _mode_options["infer_string"] diff --git a/pandas/_config/config.py index 8921e1b686303..95c549a8ff0e8 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -157,6 +157,12 @@ def get_option(pat: str) -> Any: ------ OptionError : if no such option exists + See Also + -------- + set_option : Set the value of the specified option or options. + reset_option : Reset one or more options to their default value. + describe_option : Print the description for one or more registered options. + Notes ----- For all available options, please view the :ref:`User Guide ` @@ -205,6 +211,14 @@ def set_option(*args) -> None: TypeError if keyword arguments are provided OptionError if no such option exists + See Also + -------- + get_option : Retrieve the value of the specified option. + reset_option : Reset one or more options to their default value. + describe_option : Print the description for one or more registered options. + option_context : Context manager to temporarily set options in a ``with`` + statement. + Notes ----- For all available options, please view the :ref:`User Guide ` @@ -265,6 +279,12 @@ def describe_option(pat: str = "", _print_desc: bool = True) -> str | None: str If the description(s) as a string if ``_print_desc=False``. + See Also + -------- + get_option : Retrieve the value of the specified option. + set_option : Set the value of the specified option or options. + reset_option : Reset one or more options to their default value. + Notes ----- For all available options, please view the @@ -309,6 +329,12 @@ def reset_option(pat: str) -> None: None No return value. + See Also + -------- + get_option : Retrieve the value of the specified option. + set_option : Set the value of the specified option or options. + describe_option : Print the description for one or more registered options. + Notes ----- For all available options, please view the @@ -400,6 +426,13 @@ def option_context(*args) -> Generator[None, None, None]: None No return value. + See Also + -------- + get_option : Retrieve the value of the specified option. + set_option : Set the value of the specified option. + reset_option : Reset one or more options to their default value. + describe_option : Print the description for one or more registered options. 
+ Notes ----- For all available options, please view the :ref:`User Guide ` diff --git a/pandas/_libs/dtypes.pxd b/pandas/_libs/dtypes.pxd index de4b70d387b5f..ccfb2d2ef4a23 100644 --- a/pandas/_libs/dtypes.pxd +++ b/pandas/_libs/dtypes.pxd @@ -34,8 +34,3 @@ ctypedef fused numeric_t: ctypedef fused numeric_object_t: numeric_t object - -ctypedef fused uint8_int64_object_t: - uint8_t - int64_t - object diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 15f8727c38f8d..d7e485f74e58b 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -398,8 +398,14 @@ def group_cumsum( for i in range(N): lab = labels[i] - if lab < 0: + if uses_mask and lab < 0: + # GH#58811 + result_mask[i, :] = True + out[i, :] = 0 + continue + elif lab < 0: continue + for j in range(K): val = values[i, j] diff --git a/pandas/_libs/include/pandas/vendored/klib/khash_python.h b/pandas/_libs/include/pandas/vendored/klib/khash_python.h index 811fdd139de2c..8d4c382241d39 100644 --- a/pandas/_libs/include/pandas/vendored/klib/khash_python.h +++ b/pandas/_libs/include/pandas/vendored/klib/khash_python.h @@ -156,7 +156,7 @@ KHASH_MAP_INIT_COMPLEX128(complex128, size_t) // NaN-floats should be in the same equivalency class, see GH 22119 static inline int floatobject_cmp(PyFloatObject *a, PyFloatObject *b) { - return (Py_IS_NAN(PyFloat_AS_DOUBLE(a)) && Py_IS_NAN(PyFloat_AS_DOUBLE(b))) || + return (isnan(PyFloat_AS_DOUBLE(a)) && isnan(PyFloat_AS_DOUBLE(b))) || (PyFloat_AS_DOUBLE(a) == PyFloat_AS_DOUBLE(b)); } @@ -164,12 +164,12 @@ static inline int floatobject_cmp(PyFloatObject *a, PyFloatObject *b) { // PyObject_RichCompareBool for complexobjects has a different behavior // needs to be replaced static inline int complexobject_cmp(PyComplexObject *a, PyComplexObject *b) { - return (Py_IS_NAN(a->cval.real) && Py_IS_NAN(b->cval.real) && - Py_IS_NAN(a->cval.imag) && Py_IS_NAN(b->cval.imag)) || - (Py_IS_NAN(a->cval.real) && Py_IS_NAN(b->cval.real) && + return (isnan(a->cval.real) && isnan(b->cval.real) && isnan(a->cval.imag) && + isnan(b->cval.imag)) || + (isnan(a->cval.real) && isnan(b->cval.real) && a->cval.imag == b->cval.imag) || - (a->cval.real == b->cval.real && Py_IS_NAN(a->cval.imag) && - Py_IS_NAN(b->cval.imag)) || + (a->cval.real == b->cval.real && isnan(a->cval.imag) && + isnan(b->cval.imag)) || (a->cval.real == b->cval.real && a->cval.imag == b->cval.imag); } @@ -223,7 +223,7 @@ static inline int pyobject_cmp(PyObject *a, PyObject *b) { static inline Py_hash_t _Pandas_HashDouble(double val) { // Since Python3.10, nan is no longer has hash 0 - if (Py_IS_NAN(val)) { + if (isnan(val)) { return 0; } #if PY_VERSION_HEX < 0x030A0000 diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index a37ab45dd57ed..564019d7c0d8c 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -167,6 +167,12 @@ cdef class IntervalMixin: """ Return the midpoint of the Interval. + See Also + -------- + Interval.left : Return the left bound for the interval. + Interval.right : Return the right bound for the interval. + Interval.length : Return the length of the interval. + Examples -------- >>> iv = pd.Interval(0, 5) @@ -285,7 +291,7 @@ cdef class Interval(IntervalMixin): """ Immutable object implementing an Interval, a bounded slice-like interval. - Parameters + Attributes ---------- left : orderable scalar Left bound for the interval. @@ -377,6 +383,12 @@ cdef class Interval(IntervalMixin): """ Left bound for the interval. 
+ See Also + -------- + Interval.right : Return the right bound for the interval. + Examples -------- >>> interval = pd.Interval(left=1, right=2, closed='left') @@ -390,6 +402,12 @@ """ Right bound for the interval. + See Also + -------- + Interval.left : Return the left bound for the interval. + Examples -------- >>> interval = pd.Interval(left=1, right=2, closed='left') @@ -405,6 +423,13 @@ Either ``left``, ``right``, ``both`` or ``neither``. + See Also + -------- + Interval.closed_left : Check if the interval is closed on the left side. + Interval.closed_right : Check if the interval is closed on the right side. + Interval.open_left : Check if the interval is open on the left side. + Interval.open_right : Check if the interval is open on the right side. + Examples -------- >>> interval = pd.Interval(left=1, right=2, closed='left') diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 4fd68a1593e49..b78ff19bcfd53 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -53,6 +53,7 @@ from numpy cimport ( PyArray_ITER_DATA, PyArray_ITER_NEXT, PyArray_IterNew, + PyArray_SETITEM, complex128_t, flatiter, float64_t, @@ -75,7 +76,6 @@ cdef extern from "pandas/parser/pd_parser.h": PandasParser_IMPORT from pandas._libs cimport util -from pandas._libs.dtypes cimport uint8_int64_object_t from pandas._libs.util cimport ( INT64_MAX, INT64_MIN, @@ -96,16 +96,12 @@ from pandas._libs.missing cimport ( is_null_datetime64, is_null_timedelta64, ) -from pandas._libs.tslibs.conversion cimport ( - _TSObject, - convert_to_tsobject, -) +from pandas._libs.tslibs.conversion cimport convert_to_tsobject from pandas._libs.tslibs.nattype cimport ( NPY_NAT, c_NaT as NaT, checknull_with_nat, ) -from pandas._libs.tslibs.np_datetime cimport NPY_FR_ns from pandas._libs.tslibs.offsets cimport is_offset_object from pandas._libs.tslibs.period cimport is_period_object from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64 @@ -184,6 +180,13 @@ def is_scalar(val: object) -> bool: bool Return True if given object is scalar. + See Also + -------- + api.types.is_list_like : Check if the input is list-like. + api.types.is_integer : Check if the input is an integer. + api.types.is_float : Check if the input is a float. + api.types.is_bool : Check if the input is a boolean. + Examples -------- >>> import datetime @@ -1442,6 +1445,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str: Parameters ---------- value : scalar, list, ndarray, or pandas type + The input data from which to infer the dtype. skipna : bool, default True Ignore NaN values when inferring the type. @@ -1476,6 +1480,14 @@ TypeError If ndarray-like but cannot infer the dtype + See Also + -------- + api.types.is_scalar : Check if the input is a scalar. + api.types.is_list_like : Check if the input is list-like. + api.types.is_integer : Check if the input is an integer. + api.types.is_float : Check if the input is a float. + api.types.is_bool : Check if the input is a boolean. 
+ Notes ----- - 'mixed' is the catchall for anything that is not otherwise @@ -2481,7 +2493,6 @@ def maybe_convert_objects(ndarray[object] objects, ndarray[uint8_t] mask Seen seen = Seen() object val - _TSObject tsobj float64_t fnan = NaN if dtype_if_all_nat is not None: @@ -2588,8 +2599,7 @@ def maybe_convert_objects(ndarray[object] objects, else: seen.datetime_ = True try: - tsobj = convert_to_tsobject(val, None, None, 0, 0) - tsobj.ensure_reso(NPY_FR_ns) + convert_to_tsobject(val, None, None, 0, 0) except OutOfBoundsDatetime: # e.g. test_out_of_s_bounds_datetime64 seen.object_ = True @@ -2845,14 +2855,16 @@ no_default = _NoDefault.no_default # Sentinel indicating the default value. NoDefault = Literal[_NoDefault.no_default] +@cython.boundscheck(False) +@cython.wraparound(False) def map_infer_mask( - ndarray[object] arr, - object f, - const uint8_t[:] mask, - *, - bint convert=True, - object na_value=no_default, - cnp.dtype dtype=np.dtype(object) + ndarray arr, + object f, + const uint8_t[:] mask, + *, + bint convert=True, + object na_value=no_default, + cnp.dtype dtype=np.dtype(object) ) -> "ArrayLike": """ Substitute for np.vectorize with pandas-friendly dtype inference. @@ -2875,53 +2887,39 @@ def map_infer_mask( ------- np.ndarray or an ExtensionArray """ - cdef Py_ssize_t n = len(arr) - result = np.empty(n, dtype=dtype) - - _map_infer_mask( - result, - arr, - f, - mask, - na_value, - ) - if convert: - return maybe_convert_objects(result) - else: - return result - - -@cython.boundscheck(False) -@cython.wraparound(False) -def _map_infer_mask( - ndarray[uint8_int64_object_t] out, - ndarray[object] arr, - object f, - const uint8_t[:] mask, - object na_value=no_default, -) -> None: - """ - Helper for map_infer_mask, split off to use fused types based on the result. 
- """ cdef: - Py_ssize_t i, n + Py_ssize_t i + Py_ssize_t n = len(arr) object val - n = len(arr) + ndarray result = np.empty(n, dtype=dtype) + + flatiter arr_it = PyArray_IterNew(arr) + flatiter result_it = PyArray_IterNew(result) + for i in range(n): if mask[i]: if na_value is no_default: - val = arr[i] + val = PyArray_GETITEM(arr, PyArray_ITER_DATA(arr_it)) else: val = na_value else: - val = f(arr[i]) + val = PyArray_GETITEM(arr, PyArray_ITER_DATA(arr_it)) + val = f(val) if cnp.PyArray_IsZeroDim(val): # unbox 0-dim arrays, GH#690 val = val.item() - out[i] = val + PyArray_SETITEM(result, PyArray_ITER_DATA(result_it), val) + + PyArray_ITER_NEXT(arr_it) + PyArray_ITER_NEXT(result_it) + + if convert: + return maybe_convert_objects(result) + else: + return result @cython.boundscheck(False) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index dca3ba0ce49b3..928d253bf3169 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -63,7 +63,10 @@ from pandas._libs.tslibs.conversion cimport ( get_datetime64_nanos, parse_pydatetime, ) -from pandas._libs.tslibs.dtypes cimport npy_unit_to_abbrev +from pandas._libs.tslibs.dtypes cimport ( + get_supported_reso, + npy_unit_to_abbrev, +) from pandas._libs.tslibs.nattype cimport ( NPY_NAT, c_nat_strings as nat_strings, @@ -260,7 +263,7 @@ cpdef array_to_datetime( bint dayfirst=False, bint yearfirst=False, bint utc=False, - NPY_DATETIMEUNIT creso=NPY_FR_ns, + NPY_DATETIMEUNIT creso=NPY_DATETIMEUNIT.NPY_FR_GENERIC, str unit_for_numerics=None, ): """ @@ -288,8 +291,8 @@ cpdef array_to_datetime( yearfirst parsing behavior when encountering datetime strings utc : bool, default False indicator whether the dates should be UTC - creso : NPY_DATETIMEUNIT, default NPY_FR_ns - Set to NPY_FR_GENERIC to infer a resolution. + creso : NPY_DATETIMEUNIT, default NPY_FR_GENERIC + If NPY_FR_GENERIC, conduct inference. unit_for_numerics : str, default "ns" Returns @@ -389,7 +392,7 @@ cpdef array_to_datetime( # GH#32264 np.str_ object val = str(val) - if parse_today_now(val, &iresult[i], utc, creso): + if parse_today_now(val, &iresult[i], utc, creso, infer_reso=infer_reso): # We can't _quite_ dispatch this to convert_str_to_tsobject # bc there isn't a nice way to pass "utc" item_reso = NPY_DATETIMEUNIT.NPY_FR_us @@ -533,7 +536,9 @@ def array_to_datetime_with_tz( if state.creso_ever_changed: # We encountered mismatched resolutions, need to re-parse with # the correct one. - return array_to_datetime_with_tz(values, tz=tz, creso=creso) + return array_to_datetime_with_tz( + values, tz=tz, dayfirst=dayfirst, yearfirst=yearfirst, creso=creso + ) elif creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC: # i.e. we never encountered anything non-NaT, default to "s". 
This # ensures that insert and concat-like operations with NaT diff --git a/pandas/_libs/tslibs/dtypes.pxd b/pandas/_libs/tslibs/dtypes.pxd index 33f6789f3b402..455bca35d160a 100644 --- a/pandas/_libs/tslibs/dtypes.pxd +++ b/pandas/_libs/tslibs/dtypes.pxd @@ -12,9 +12,10 @@ cdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso) cdef bint is_supported_unit(NPY_DATETIMEUNIT reso) cdef dict c_OFFSET_TO_PERIOD_FREQSTR -cdef dict c_OFFSET_DEPR_FREQSTR -cdef dict c_REVERSE_OFFSET_DEPR_FREQSTR +cdef dict c_PERIOD_TO_OFFSET_FREQSTR +cdef dict c_OFFSET_RENAMED_FREQSTR cdef dict c_DEPR_ABBREVS +cdef dict c_PERIOD_AND_OFFSET_DEPR_FREQSTR cdef dict attrname_to_abbrevs cdef dict npy_unit_to_attrname cdef dict attrname_to_npy_unit diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index 5bfbe211bfd14..479a5a328b1d8 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -176,6 +176,10 @@ OFFSET_TO_PERIOD_FREQSTR: dict = { "EOM": "M", "BME": "M", "SME": "M", + "BMS": "M", + "CBME": "M", + "CBMS": "M", + "SMS": "M", "BQS": "Q", "QS": "Q", "BQE": "Q", @@ -228,7 +232,6 @@ OFFSET_TO_PERIOD_FREQSTR: dict = { "YE-NOV": "Y-NOV", "W": "W", "ME": "M", - "Y": "Y", "BYE": "Y", "BYE-DEC": "Y-DEC", "BYE-JAN": "Y-JAN", @@ -245,7 +248,7 @@ OFFSET_TO_PERIOD_FREQSTR: dict = { "YS": "Y", "BYS": "Y", } -cdef dict c_OFFSET_DEPR_FREQSTR = { +cdef dict c_OFFSET_RENAMED_FREQSTR = { "M": "ME", "Q": "QE", "Q-DEC": "QE-DEC", @@ -303,10 +306,37 @@ cdef dict c_OFFSET_DEPR_FREQSTR = { "BQ-OCT": "BQE-OCT", "BQ-NOV": "BQE-NOV", } -cdef dict c_OFFSET_TO_PERIOD_FREQSTR = OFFSET_TO_PERIOD_FREQSTR -cdef dict c_REVERSE_OFFSET_DEPR_FREQSTR = { - v: k for k, v in c_OFFSET_DEPR_FREQSTR.items() +PERIOD_TO_OFFSET_FREQSTR = { + "M": "ME", + "Q": "QE", + "Q-DEC": "QE-DEC", + "Q-JAN": "QE-JAN", + "Q-FEB": "QE-FEB", + "Q-MAR": "QE-MAR", + "Q-APR": "QE-APR", + "Q-MAY": "QE-MAY", + "Q-JUN": "QE-JUN", + "Q-JUL": "QE-JUL", + "Q-AUG": "QE-AUG", + "Q-SEP": "QE-SEP", + "Q-OCT": "QE-OCT", + "Q-NOV": "QE-NOV", + "Y": "YE", + "Y-DEC": "YE-DEC", + "Y-JAN": "YE-JAN", + "Y-FEB": "YE-FEB", + "Y-MAR": "YE-MAR", + "Y-APR": "YE-APR", + "Y-MAY": "YE-MAY", + "Y-JUN": "YE-JUN", + "Y-JUL": "YE-JUL", + "Y-AUG": "YE-AUG", + "Y-SEP": "YE-SEP", + "Y-OCT": "YE-OCT", + "Y-NOV": "YE-NOV", } +cdef dict c_OFFSET_TO_PERIOD_FREQSTR = OFFSET_TO_PERIOD_FREQSTR +cdef dict c_PERIOD_TO_OFFSET_FREQSTR = PERIOD_TO_OFFSET_FREQSTR # Map deprecated resolution abbreviations to correct resolution abbreviations cdef dict c_DEPR_ABBREVS = { @@ -316,6 +346,11 @@ cdef dict c_DEPR_ABBREVS = { "S": "s", } +cdef dict c_PERIOD_AND_OFFSET_DEPR_FREQSTR = { + "w": "W", + "MIN": "min", +} + class FreqGroup(Enum): # Mirrors c_FreqGroup in the .pxd file diff --git a/pandas/_libs/tslibs/fields.pyi b/pandas/_libs/tslibs/fields.pyi index c6cfd44e9f6ab..bc55e34f3d208 100644 --- a/pandas/_libs/tslibs/fields.pyi +++ b/pandas/_libs/tslibs/fields.pyi @@ -16,7 +16,7 @@ def get_date_name_field( def get_start_end_field( dtindex: npt.NDArray[np.int64], field: str, - freqstr: str | None = ..., + freq_name: str | None = ..., month_kw: int = ..., reso: int = ..., # NPY_DATETIMEUNIT ) -> npt.NDArray[np.bool_]: ... 
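The stub rename above (``freqstr`` → ``freq_name``) pairs with a logic change in the ``fields.pyx`` hunk below, which tightens the begin-anchored frequency check to ``freq_name.lstrip("B")[0:2]``. A plain-Python paraphrase of that month bookkeeping, offered as a sketch only (the helper name and asserts are ours, not part of the patch):

.. code-block:: python

    def start_end_months(freq_name: str, month_kw: int = 12) -> tuple[int, int]:
        # Begin-anchored quarter/year offsets (QS, YS, and the business
        # variants BQS/BYS once the leading "B" is stripped) store the
        # *starting* month in month_kw; all other anchored offsets store
        # the *ending* month.
        if freq_name.lstrip("B")[0:2] in ("QS", "YS"):
            end_month = 12 if month_kw == 1 else month_kw - 1
            start_month = month_kw
        else:
            end_month = month_kw
            start_month = (end_month % 12) + 1
        return start_month, end_month

    # A year anchored to start in January ends in December:
    assert start_end_months("YS", month_kw=1) == (1, 12)
    # A year anchored to end in December also starts in January:
    assert start_end_months("YE", month_kw=12) == (1, 12)

Relative to the old ``freqstr[0:2] in ["MS", "QS", "YS"]`` check, ``MS`` no longer takes the begin-anchored branch and a leading ``B`` no longer needs the separate ``[1:3]`` slice.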
diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index ff4fb4d635d17..e523ac2e7b5c6 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -210,7 +210,7 @@ cdef bint _is_on_month(int month, int compare_month, int modby) noexcept nogil: def get_start_end_field( const int64_t[:] dtindex, str field, - str freqstr=None, + str freq_name=None, int month_kw=12, NPY_DATETIMEUNIT reso=NPY_FR_ns, ): @@ -223,7 +223,7 @@ def get_start_end_field( ---------- dtindex : ndarray[int64] field : str - frestr : str or None, default None + freq_name : str or None, default None month_kw : int, default 12 reso : NPY_DATETIMEUNIT, default NPY_FR_ns @@ -243,20 +243,20 @@ def get_start_end_field( out = np.zeros(count, dtype="int8") - if freqstr: - if freqstr == "C": + if freq_name: + if freq_name == "C": raise ValueError(f"Custom business days is not supported by {field}") - is_business = freqstr[0] == "B" + is_business = freq_name[0] == "B" # YearBegin(), BYearBegin() use month = starting month of year. # QuarterBegin(), BQuarterBegin() use startingMonth = starting # month of year. Other offsets use month, startingMonth as ending # month of year. - if (freqstr[0:2] in ["MS", "QS", "YS"]) or ( - freqstr[1:3] in ["MS", "QS", "YS"]): + if freq_name.lstrip("B")[0:2] in ["QS", "YS"]: end_month = 12 if month_kw == 1 else month_kw - 1 start_month = month_kw + else: end_month = month_kw start_month = (end_month % 12) + 1 diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 7a08e4ad4b260..c483814a3ef74 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -419,6 +419,12 @@ class NaTType(_NaT): Monday == 0 ... Sunday == 6. + See Also + -------- + Timestamp.dayofweek : Return the day of the week with Monday=0, Sunday=6. + Timestamp.isoweekday : Return the day of the week with Monday=1, Sunday=7. + datetime.date.weekday : Equivalent method in datetime module. + Examples -------- >>> ts = pd.Timestamp('2023-01-01') @@ -498,6 +504,11 @@ class NaTType(_NaT): ------- str + See Also + -------- + Timestamp.day_of_week : Return day of the week. + Timestamp.day_of_year : Return day of the year. + Examples -------- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') @@ -523,6 +534,12 @@ class NaTType(_NaT): """ Return a named tuple containing ISO year, week number, and weekday. + See Also + -------- + DatetimeIndex.isocalendar : Return a 3-tuple containing ISO year, + week number, and weekday for the given DatetimeIndex object. + datetime.date.isocalendar : The equivalent method for `datetime.date` objects. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00') @@ -537,6 +554,14 @@ class NaTType(_NaT): """ Return the daylight saving time (DST) adjustment. + This method returns the DST adjustment as a `datetime.timedelta` object + if the Timestamp is timezone-aware and DST is applicable. + + See Also + -------- + Timestamp.tz_localize : Localize the Timestamp to a timezone. + Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone. + Examples -------- >>> ts = pd.Timestamp('2000-06-01 00:00:00', tz='Europe/Brussels') @@ -687,6 +712,12 @@ class NaTType(_NaT): See strftime documentation for more information on the format string: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior. + See Also + -------- + Timestamp.isoformat : Return the time formatted according to ISO 8601. + pd.to_datetime : Convert argument to datetime. 
+ Period.strftime : Format a single Period. + Examples -------- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') @@ -764,6 +795,18 @@ Return a new Timestamp representing UTC day and time. + See Also + -------- + Timestamp : Constructs an arbitrary datetime. + Timestamp.now : Return the current local date and time, which + can be timezone-aware. + Timestamp.today : Return the current local date and time with + timezone information set to None. + to_datetime : Convert argument to datetime. + date_range : Return a fixed frequency DatetimeIndex. + Timestamp.utctimetuple : Return UTC time tuple, compatible with + time.localtime(). + Examples -------- >>> pd.Timestamp.utcnow() # doctest: +SKIP @@ -791,6 +834,11 @@ """ Convert timezone-aware Timestamp to another time zone. + This method is used to convert a timezone-aware Timestamp object to a + different time zone. The original UTC time remains the same; only the + time zone information is changed. If the Timestamp is timezone-naive, a + TypeError is raised. + Parameters ---------- tz : str, pytz.timezone, dateutil.tz.tzfile or None @@ -806,6 +854,13 @@ TypeError If Timestamp is tz-naive. + See Also + -------- + Timestamp.tz_localize : Localize the Timestamp to a timezone. + DatetimeIndex.tz_convert : Convert a DatetimeIndex to another time zone. + DatetimeIndex.tz_localize : Localize a DatetimeIndex to a specific time zone. + datetime.datetime.astimezone : Convert a datetime object to another time zone. + Examples -------- Create a timestamp object with UTC timezone: @@ -859,7 +914,30 @@ """ Convert a Timestamp object to a native Python datetime object. - If warn=True, issue a warning if nanoseconds is nonzero. + This method is useful when you need to use a pandas Timestamp + object in contexts where native Python datetime objects are expected + or required. The conversion discards the nanoseconds component, and a + warning can be issued in such cases if desired. + + Parameters + ---------- + warn : bool, default True + If True, issues a warning when the timestamp includes nonzero + nanoseconds, as these will be discarded during the conversion. + + Returns + ------- + datetime.datetime or NaT + Returns a datetime.datetime object representing the timestamp, + with year, month, day, hour, minute, second, and microsecond components. + If the timestamp is NaT (Not a Time), returns NaT. + + See Also + -------- + datetime.datetime : The standard Python datetime class that this method + returns. + Timestamp.timestamp : Convert a Timestamp object to POSIX timestamp. + Timestamp.to_datetime64 : Convert a Timestamp object to numpy.datetime64. Examples -------- @@ -924,6 +1002,12 @@ """ Round the Timestamp to the specified resolution. + This method rounds the given Timestamp to a specified frequency + level. It is particularly useful in data analysis to normalize timestamps + to regular frequency intervals. For instance, rounding to the nearest + minute, hour, or day can help in time series comparisons or resampling + operations. + Parameters ---------- freq : str @@ -958,6 +1042,14 @@ timedelta}, default 'raise' ------ ValueError if the freq cannot be converted + See Also + -------- + Timestamp.floor : Round the Timestamp downward to the nearest multiple + of the specified frequency. 
+ Timestamp.ceil : Round the Timestamp upward to the nearest multiple of + the specified frequency. + Notes ----- If the Timestamp has a timezone, rounding will take place relative to the @@ -1136,6 +1228,12 @@ timedelta}, default 'raise' ------ ValueError if the freq cannot be converted. + See Also + -------- + Timestamp.floor : Round down a Timestamp to the specified resolution. + Timestamp.round : Round a Timestamp to the specified resolution. + Series.dt.ceil : Ceil the datetime values in a Series. + Notes ----- If the Timestamp has a timezone, ceiling will take place relative to the @@ -1196,6 +1294,11 @@ timedelta}, default 'raise' """ Convert timezone-aware Timestamp to another time zone. + This method is used to convert a timezone-aware Timestamp object to a + different time zone. The original UTC time remains the same; only the + time zone information is changed. If the Timestamp is timezone-naive, a + TypeError is raised. + Parameters ---------- tz : str, pytz.timezone, dateutil.tz.tzfile or None @@ -1211,6 +1314,13 @@ timedelta}, default 'raise' TypeError If Timestamp is tz-naive. + See Also + -------- + Timestamp.tz_localize : Localize the Timestamp to a timezone. + DatetimeIndex.tz_convert : Convert a DatetimeIndex to another time zone. + DatetimeIndex.tz_localize : Localize a DatetimeIndex to a specific time zone. + datetime.datetime.astimezone : Convert a datetime object to another time zone. + Examples -------- Create a timestamp object with UTC timezone: @@ -1378,6 +1488,14 @@ default 'raise' ------- Timestamp + See Also + -------- + Timestamp.asm8 : Return numpy datetime64 format in nanoseconds. + Timestamp.to_pydatetime : Convert Timestamp object to a native + Python datetime object. + to_timedelta : Convert argument into timedelta object, + which can represent differences in times. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 00:00:00.01') diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 107608ec9f606..a24941e4f0a5a 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -57,8 +57,10 @@ from pandas._libs.tslibs.ccalendar cimport ( from pandas._libs.tslibs.conversion cimport localize_pydatetime from pandas._libs.tslibs.dtypes cimport ( c_DEPR_ABBREVS, - c_OFFSET_DEPR_FREQSTR, - c_REVERSE_OFFSET_DEPR_FREQSTR, + c_OFFSET_RENAMED_FREQSTR, + c_OFFSET_TO_PERIOD_FREQSTR, + c_PERIOD_AND_OFFSET_DEPR_FREQSTR, + c_PERIOD_TO_OFFSET_FREQSTR, periods_per_day, ) from pandas._libs.tslibs.nattype cimport ( @@ -429,6 +431,12 @@ cdef class BaseOffset: """ Return a dict of extra parameters for the offset. + See Also + -------- + tseries.offsets.DateOffset : The base class for all pandas date offsets. + tseries.offsets.WeekOfMonth : Represents the week of the month. + tseries.offsets.LastWeekOfMonth : Represents the last week of the month. + Examples -------- >>> pd.DateOffset(5).kwds @@ -500,6 +508,13 @@ cdef class BaseOffset: """ Return a copy of the frequency. + See Also + -------- + tseries.offsets.Week.copy : Return a copy of Week offset. + tseries.offsets.DateOffset.copy : Return a copy of date offset. + tseries.offsets.MonthEnd.copy : Return a copy of MonthEnd offset. + tseries.offsets.YearBegin.copy : Return a copy of YearBegin offset. + Examples -------- >>> freq = pd.DateOffset(1) @@ -551,6 +566,14 @@ cdef class BaseOffset: """ Return a string representing the base frequency. + See Also + -------- + tseries.offsets.Week : Represents a weekly offset. + DateOffset : Base class for all other offset classes. 
+ tseries.offsets.Day : Represents a single day offset. + tseries.offsets.MonthEnd : Represents a monthly offset that + snaps to the end of the month. + Examples -------- >>> pd.offsets.Hour().name @@ -1083,7 +1106,7 @@ cdef class Day(Tick): """ Offset ``n`` days. - Parameters + Attributes ---------- n : int, default 1 The number of days represented. @@ -2517,7 +2540,7 @@ cdef class BYearBegin(YearOffset): """ DateOffset increments between the first business day of the year. - Parameters + Attributes ---------- n : int, default 1 The number of years represented. @@ -2558,7 +2581,7 @@ cdef class YearEnd(YearOffset): YearEnd goes to the next date which is the end of the year. - Parameters + Attributes ---------- n : int, default 1 The number of years represented. @@ -2612,7 +2635,7 @@ cdef class YearBegin(YearOffset): YearBegin goes to the next date which is the start of the year. - Parameters + Attributes ---------- n : int, default 1 The number of years represented. @@ -2735,7 +2758,7 @@ cdef class BQuarterEnd(QuarterOffset): startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ... startingMonth = 3 corresponds to dates like 3/30/2007, 6/29/2007, ... - Parameters + Attributes ---------- n : int, default 1 The number of quarters represented. @@ -2817,7 +2840,7 @@ cdef class QuarterEnd(QuarterOffset): startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ... startingMonth = 3 corresponds to dates like 3/31/2007, 6/30/2007, ... - Parameters + Attributes ---------- n : int, default 1 The number of quarters represented. @@ -2918,7 +2941,7 @@ cdef class MonthEnd(MonthOffset): MonthEnd goes to the next date which is an end of the month. - Parameters + Attributes ---------- n : int, default 1 The number of months represented. @@ -2993,7 +3016,7 @@ cdef class BusinessMonthEnd(MonthOffset): BusinessMonthEnd goes to the next date which is the last business day of the month. - Parameters + Attributes ---------- n : int, default 1 The number of months represented. @@ -3031,7 +3054,7 @@ cdef class BusinessMonthBegin(MonthOffset): BusinessMonthBegin goes to the next date which is the first business day of the month. - Parameters + Attributes ---------- n : int, default 1 The number of months represented. @@ -3201,7 +3224,7 @@ cdef class SemiMonthEnd(SemiMonthOffset): """ Two DateOffset's per month repeating on the last day of the month & day_of_month. - Parameters + Attributes ---------- n : int, default 1 The number of months represented. @@ -4690,6 +4713,34 @@ INVALID_FREQ_ERR_MSG = "Invalid frequency: {0}" _offset_map = {} +def _validate_to_offset_alias(alias: str, is_period: bool) -> None: + if not is_period: + if alias.upper() in c_OFFSET_RENAMED_FREQSTR: + raise ValueError( + f"\'{alias}\' is no longer supported for offsets. Please " + f"use \'{c_OFFSET_RENAMED_FREQSTR.get(alias.upper())}\' " + f"instead." 
+ ) + if (alias.upper() != alias and + alias.lower() not in {"s", "ms", "us", "ns"} and + alias.upper().split("-")[0].endswith(("S", "E"))): + raise ValueError(INVALID_FREQ_ERR_MSG.format(alias)) + if (is_period and + alias.upper() in c_OFFSET_TO_PERIOD_FREQSTR and + alias != "ms" and + alias.upper().split("-")[0].endswith(("S", "E"))): + if (alias.upper().startswith("B") or + alias.upper().startswith("S") or + alias.upper().startswith("C")): + raise ValueError(INVALID_FREQ_ERR_MSG.format(alias)) + else: + alias_msg = "".join(alias.upper().split("E", 1)) + raise ValueError( + f"for Period, please use \'{alias_msg}\' " + f"instead of \'{alias}\'" + ) + + # TODO: better name? def _get_offset(name: str) -> BaseOffset: """ @@ -4829,54 +4880,26 @@ cpdef to_offset(freq, bint is_period=False): tups = zip(split[0::4], split[1::4], split[2::4]) for n, (sep, stride, name) in enumerate(tups): - if not is_period and name.upper() in c_OFFSET_DEPR_FREQSTR: - warnings.warn( - f"\'{name}\' is deprecated and will be removed " - f"in a future version, please use " - f"\'{c_OFFSET_DEPR_FREQSTR.get(name.upper())}\' instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - name = c_OFFSET_DEPR_FREQSTR[name.upper()] - if (not is_period and - name != name.upper() and - name.lower() not in {"s", "ms", "us", "ns"} and - name.upper().split("-")[0].endswith(("S", "E"))): - warnings.warn( - f"\'{name}\' is deprecated and will be removed " - f"in a future version, please use " - f"\'{name.upper()}\' instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - name = name.upper() - if is_period and name.upper() in c_REVERSE_OFFSET_DEPR_FREQSTR: - if name.upper().startswith("Y"): - raise ValueError( - f"for Period, please use \'Y{name.upper()[2:]}\' " - f"instead of \'{name}\'" - ) - if (name.upper().startswith("B") or - name.upper().startswith("S") or - name.upper().startswith("C")): - raise ValueError(INVALID_FREQ_ERR_MSG.format(name)) - else: - raise ValueError( - f"for Period, please use " - f"\'{c_REVERSE_OFFSET_DEPR_FREQSTR.get(name.upper())}\' " - f"instead of \'{name}\'" - ) - elif is_period and name.upper() in c_OFFSET_DEPR_FREQSTR: - if name.upper() != name: + _validate_to_offset_alias(name, is_period) + if is_period: + if name.upper() in c_PERIOD_TO_OFFSET_FREQSTR: + if name.upper() != name: + raise ValueError( + f"\'{name}\' is no longer supported, " + f"please use \'{name.upper()}\' instead.", + ) + name = c_PERIOD_TO_OFFSET_FREQSTR.get(name.upper()) + + if name in c_PERIOD_AND_OFFSET_DEPR_FREQSTR: warnings.warn( - f"\'{name}\' is deprecated and will be removed in " - f"a future version, please use \'{name.upper()}\' " - f"instead.", + f"\'{name}\' is deprecated and will be removed " + f"in a future version, please use " + f"\'{c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name)}\' " + f"instead.", FutureWarning, stacklevel=find_stack_level(), - ) - name = c_OFFSET_DEPR_FREQSTR.get(name.upper()) - + ) + name = c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name) if sep != "" and not sep.isspace(): raise ValueError("separator must be spaces") prefix = _lite_rule_alias.get(name) or name diff --git a/pandas/_libs/tslibs/parsing.pyi b/pandas/_libs/tslibs/parsing.pyi index 40394f915d4b0..845bd9a5a5635 100644 --- a/pandas/_libs/tslibs/parsing.pyi +++ b/pandas/_libs/tslibs/parsing.pyi @@ -27,7 +27,4 @@ def guess_datetime_format( dt_str: str, dayfirst: bool | None = ..., ) -> str | None: ... -def concat_date_cols( - date_cols: tuple, -) -> npt.NDArray[np.object_]: ... def get_rule_month(source: str) -> str: ... 
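A minimal sanity check (not part of the patch) of the stricter alias handling that _validate_to_offset_alias introduces above: renamed offset aliases now raise ValueError outright instead of emitting a FutureWarning, while the period path maps aliases through c_PERIOD_TO_OFFSET_FREQSTR rather than rejecting them. The "M" -> "ME" mapping assumed below is inferred from the "no longer supported" error message in the new code, not stated elsewhere in the patch.

    # Hedged usage sketch; assumes c_OFFSET_RENAMED_FREQSTR maps "M" -> "ME",
    # as the error message in _validate_to_offset_alias implies.
    from pandas.tseries.frequencies import to_offset

    print(to_offset("2ME"))  # <2 * MonthEnds> -- the supported spelling
    try:
        to_offset("2M")  # retired offset alias: now a hard error, not a warning
    except ValueError as err:
        print(err)  # 'M' is no longer supported for offsets. Please use 'ME' instead.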
diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 85ef3fd93ff09..35d2433a707a0 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -7,7 +7,6 @@ import warnings from pandas.util._exceptions import find_stack_level -cimport cython from cpython.datetime cimport ( datetime, datetime_new, @@ -18,7 +17,6 @@ from cpython.datetime cimport ( from datetime import timezone -from cpython.object cimport PyObject_Str from cpython.unicode cimport PyUnicode_AsUTF8AndSize from cython cimport Py_ssize_t from libc.string cimport strchr @@ -28,15 +26,7 @@ import_datetime() import numpy as np cimport numpy as cnp -from numpy cimport ( - PyArray_GETITEM, - PyArray_ITER_DATA, - PyArray_ITER_NEXT, - PyArray_IterNew, - flatiter, - float64_t, - int64_t, -) +from numpy cimport int64_t cnp.import_array() @@ -75,8 +65,6 @@ import_pandas_datetime() from pandas._libs.tslibs.strptime import array_strptime -from pandas._libs.tslibs.util cimport is_array - cdef extern from "pandas/portable.h": int getdigit_ascii(char c, int default) nogil @@ -871,6 +859,10 @@ def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None: """ Guess the datetime format of a given datetime string. + This function attempts to deduce the format of a given datetime string. It is + useful for situations where the datetime format is unknown and needs to be + determined for proper parsing. The function is not guaranteed to return a format. + Parameters ---------- dt_str : str @@ -888,6 +880,12 @@ def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None: datetime format string (for `strftime` or `strptime`), or None if it can't be guessed. + See Also + -------- + to_datetime : Convert argument to datetime. + Timestamp : Pandas replacement for python datetime.datetime object. + DatetimeIndex : Immutable ndarray-like of datetime64 data. + Examples -------- >>> from pandas.tseries.api import guess_datetime_format @@ -1097,115 +1095,6 @@ cdef void _maybe_warn_about_dayfirst(format: str, bint dayfirst) noexcept: ) -@cython.wraparound(False) -@cython.boundscheck(False) -cdef object convert_to_unicode(object item, bint keep_trivial_numbers): - """ - Convert `item` to str. - - Parameters - ---------- - item : object - keep_trivial_numbers : bool - if True, then conversion (to string from integer/float zero) - is not performed - - Returns - ------- - str or int or float - """ - cdef: - float64_t float_item - - if keep_trivial_numbers: - if isinstance(item, int): - if item == 0: - return item - elif isinstance(item, float): - float_item = item - if float_item == 0.0 or float_item != float_item: - return item - - if not isinstance(item, str): - item = PyObject_Str(item) - - return item - - -@cython.wraparound(False) -@cython.boundscheck(False) -def concat_date_cols(tuple date_cols) -> np.ndarray: - """ - Concatenates elements from numpy arrays in `date_cols` into strings. 
- - Parameters - ---------- - date_cols : tuple[ndarray] - - Returns - ------- - arr_of_rows : ndarray[object] - - Examples - -------- - >>> dates=np.array(['3/31/2019', '4/31/2019'], dtype=object) - >>> times=np.array(['11:20', '10:45'], dtype=object) - >>> result = concat_date_cols((dates, times)) - >>> result - array(['3/31/2019 11:20', '4/31/2019 10:45'], dtype=object) - """ - cdef: - Py_ssize_t rows_count = 0, col_count = len(date_cols) - Py_ssize_t col_idx, row_idx - list list_to_join - cnp.ndarray[object] iters - object[::1] iters_view - flatiter it - cnp.ndarray[object] result - object[::1] result_view - - if col_count == 0: - return np.zeros(0, dtype=object) - - if not all(is_array(array) for array in date_cols): - raise ValueError("not all elements from date_cols are numpy arrays") - - rows_count = min(len(array) for array in date_cols) - result = np.zeros(rows_count, dtype=object) - result_view = result - - if col_count == 1: - array = date_cols[0] - it = PyArray_IterNew(array) - for row_idx in range(rows_count): - item = PyArray_GETITEM(array, PyArray_ITER_DATA(it)) - result_view[row_idx] = convert_to_unicode(item, True) - PyArray_ITER_NEXT(it) - else: - # create fixed size list - more efficient memory allocation - list_to_join = [None] * col_count - iters = np.zeros(col_count, dtype=object) - - # create memoryview of iters ndarray, that will contain some - # flatiter's for each array in `date_cols` - more efficient indexing - iters_view = iters - for col_idx, array in enumerate(date_cols): - iters_view[col_idx] = PyArray_IterNew(array) - - # array elements that are on the same line are converted to one string - for row_idx in range(rows_count): - for col_idx, array in enumerate(date_cols): - # this cast is needed, because we did not find a way - # to efficiently store `flatiter` type objects in ndarray - it = iters_view[col_idx] - item = PyArray_GETITEM(array, PyArray_ITER_DATA(it)) - list_to_join[col_idx] = convert_to_unicode(item, False) - PyArray_ITER_NEXT(it) - result_view[row_idx] = " ".join(list_to_join) - - return result - - cpdef str get_rule_month(str source): """ Return starting month of given freq, default is December. diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 838b5b9f4595f..023a0f52e320f 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2329,6 +2329,12 @@ cdef class _Period(PeriodMixin): """ Return the quarter this Period falls on. + See Also + -------- + Timestamp.quarter : Return the quarter of the Timestamp. + Period.year : Return the year of the period. + Period.month : Return the month of the period. + Examples -------- >>> period = pd.Period('2022-04', 'M') @@ -2443,6 +2449,12 @@ cdef class _Period(PeriodMixin): """ Return True if the period's year is in a leap year. + See Also + -------- + Timestamp.is_leap_year : Check if the year in a Timestamp is a leap year. + DatetimeIndex.is_leap_year : Boolean indicator if the date belongs to a + leap year. + Examples -------- >>> period = pd.Period('2022-01', 'M') @@ -2693,6 +2705,12 @@ class Period(_Period): second : int, default 0 Second value of the period. + See Also + -------- + Timestamp : Pandas replacement for python datetime.datetime object. + date_range : Return a fixed frequency DatetimeIndex. + timedelta_range : Generates a fixed frequency range of timedeltas. 
+ Examples -------- >>> period = pd.Period('2012-1-1', freq='D') diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index d6c3285d89c59..43279051e2a30 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -354,7 +354,7 @@ def array_strptime( bint exact=True, errors="raise", bint utc=False, - NPY_DATETIMEUNIT creso=NPY_FR_ns, + NPY_DATETIMEUNIT creso=NPY_DATETIMEUNIT.NPY_FR_GENERIC, ): """ Calculates the datetime structs represented by the passed array of strings @@ -365,7 +365,7 @@ def array_strptime( fmt : string-like regex exact : matches must be exact if True, search if False errors : string specifying error handling, {'raise', 'coerce'} - creso : NPY_DATETIMEUNIT, default NPY_FR_ns + creso : NPY_DATETIMEUNIT, default NPY_FR_GENERIC Set to NPY_FR_GENERIC to infer a resolution. """ @@ -712,7 +712,7 @@ cdef tzinfo _parse_with_format( elif len(s) <= 6: item_reso[0] = NPY_DATETIMEUNIT.NPY_FR_us else: - item_reso[0] = NPY_DATETIMEUNIT.NPY_FR_ns + item_reso[0] = NPY_FR_ns # Pad to always return nanoseconds s += "0" * (9 - len(s)) us = int(s) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 9078fd4116899..dc4aed9920734 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1731,6 +1731,12 @@ cdef class _Timedelta(timedelta): ------- Timedelta + See Also + -------- + Timedelta : Represents a duration, the difference between two dates or times. + to_timedelta : Convert argument to timedelta. + Timedelta.asm8 : Return a numpy timedelta64 array scalar view. + Examples -------- >>> td = pd.Timedelta('1001ms') @@ -1785,6 +1791,7 @@ class Timedelta(_Timedelta): Parameters ---------- value : Timedelta, timedelta, np.timedelta64, str, or int + Input value. unit : str, default 'ns' Denote the unit of the input, if input is an integer. @@ -1810,6 +1817,15 @@ class Timedelta(_Timedelta): Values for construction in compat with datetime.timedelta. Numpy ints and floats will be coerced to python ints and floats. + See Also + -------- + Timestamp : Represents a single timestamp in time. + TimedeltaIndex : Immutable Index of timedelta64 data. + DateOffset : Standard kind of date increment used for a date range. + to_timedelta : Convert argument to timedelta. + datetime.timedelta : Represents a duration in the datetime module. + numpy.timedelta64 : Represents a duration compatible with NumPy. + Notes ----- The constructor may take in either both values of value and unit or diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 0352173aa3e2d..9130c80c66e81 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -579,15 +579,15 @@ cdef class _Timestamp(ABCTimestamp): if freq: kwds = freq.kwds month_kw = kwds.get("startingMonth", kwds.get("month", 12)) - freqstr = freq.freqstr + freq_name = freq.name else: month_kw = 12 - freqstr = None + freq_name = None val = self._maybe_convert_value_to_local() out = get_start_end_field(np.array([val], dtype=np.int64), - field, freqstr, month_kw, self._creso) + field, freq_name, month_kw, self._creso) return out[0] @property @@ -771,6 +771,11 @@ cdef class _Timestamp(ABCTimestamp): ------- str + See Also + -------- + Timestamp.day_of_week : Return day of the week. + Timestamp.day_of_year : Return day of the year. 
+ Examples -------- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') @@ -815,9 +820,20 @@ cdef class _Timestamp(ABCTimestamp): """ Return True if year is a leap year. + A leap year is a year that has 366 days (instead of 365), including 29th of + February as an intercalary day. Leap years are years that are multiples of + four, with the exception of years divisible by 100 but not by 400. + Returns ------- bool + True if year is a leap year, else False. + + See Also + -------- + Period.is_leap_year : Return True if the period's year is in a leap year. + DatetimeIndex.is_leap_year : Boolean indicator if the date belongs to a + leap year. Examples -------- @@ -836,6 +852,12 @@ cdef class _Timestamp(ABCTimestamp): ------- int + See Also + -------- + Timestamp.isoweekday : Return the ISO day of the week represented by the date. + Timestamp.weekday : Return the day of the week represented by the date. + Timestamp.day_of_year : Return day of the year. + Examples -------- >>> ts = pd.Timestamp(2020, 3, 14) @@ -853,6 +875,10 @@ cdef class _Timestamp(ABCTimestamp): ------- int + See Also + -------- + Timestamp.day_of_week : Return day of the week. + Examples -------- >>> ts = pd.Timestamp(2020, 3, 14) @@ -887,6 +913,11 @@ cdef class _Timestamp(ABCTimestamp): ------- int + See Also + -------- + Timestamp.weekday : Return the day of the week. + Timestamp.quarter : Return the quarter of the year. + Examples -------- >>> ts = pd.Timestamp(2020, 3, 14) @@ -904,6 +935,11 @@ cdef class _Timestamp(ABCTimestamp): ------- int + See Also + -------- + Timestamp.month_name : Return the month name of the Timestamp with + specified locale. + Examples -------- >>> ts = pd.Timestamp(2020, 3, 14) @@ -988,6 +1024,12 @@ cdef class _Timestamp(ABCTimestamp): ------- str + See Also + -------- + Timestamp.strftime : Return a formatted string. + Timestamp.isocalendar : Return a tuple containing ISO year, week number and + weekday. + Examples -------- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') @@ -1095,6 +1137,14 @@ cdef class _Timestamp(ABCTimestamp): ------- Timestamp + See Also + -------- + Timestamp.asm8 : Return numpy datetime64 format in nanoseconds. + Timestamp.to_pydatetime : Convert Timestamp object to a native + Python datetime object. + to_timedelta : Convert argument into timedelta object, + which can represent differences in times. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 00:00:00.01') @@ -1122,6 +1172,12 @@ cdef class _Timestamp(ABCTimestamp): """ Return numpy datetime64 format in nanoseconds. + See Also + -------- + numpy.datetime64 : Numpy datatype for dates and times with high precision. + Timestamp.to_numpy : Convert the Timestamp to a NumPy datetime64. + to_datetime : Convert argument to datetime. + Examples -------- >>> ts = pd.Timestamp(2020, 3, 14, 15) @@ -1151,7 +1207,30 @@ cdef class _Timestamp(ABCTimestamp): """ Convert a Timestamp object to a native Python datetime object. - If warn=True, issue a warning if nanoseconds is nonzero. + This method is useful when you need to use a pandas Timestamp + object in contexts where native Python datetime objects are expected + or required. The conversion discards the nanoseconds component, and a + warning can be issued in such cases if desired. + + Parameters + ---------- + warn : bool, default True + If True, issues a warning when the timestamp includes nonzero + nanoseconds, as these will be discarded during the conversion. 
+ + Returns + ------- + datetime.datetime or NaT + Returns a datetime.datetime object representing the timestamp, + with year, month, day, hour, minute, second, and microsecond components. + If the timestamp is NaT (Not a Time), returns NaT. + + See Also + -------- + datetime.datetime : The standard Python datetime class that this method + returns. + Timestamp.timestamp : Convert a Timestamp object to POSIX timestamp. + Timestamp.to_datetime64 : Convert a Timestamp object to numpy.datetime64. Examples -------- @@ -1277,15 +1356,29 @@ class Timestamp(_Timestamp): ---------- ts_input : datetime-like, str, int, float Value to be converted to Timestamp. - year, month, day : int - hour, minute, second, microsecond : int, optional, default 0 + year : int + Value of year. + month : int + Value of month. + day : int + Value of day. + hour : int, optional, default 0 + Value of hour. + minute : int, optional, default 0 + Value of minute. + second : int, optional, default 0 + Value of second. + microsecond : int, optional, default 0 + Value of microsecond. tzinfo : datetime.tzinfo, optional, default None + Timezone info. nanosecond : int, optional, default 0 + Value of nanosecond. tz : str, pytz.timezone, dateutil.tz.tzfile or None Time zone for time which Timestamp will have. unit : str Unit used for conversion if ts_input is of type int or float. The - valid values are 'D', 'h', 'm', 's', 'ms', 'us', and 'ns'. For + valid values are 'W', 'D', 'h', 'm', 's', 'ms', 'us', and 'ns'. For example, 's' means seconds and 'ms' means milliseconds. For float inputs, the result will be stored in nanoseconds, and @@ -1296,6 +1389,11 @@ class Timestamp(_Timestamp): datetime-like corresponds to the first (0) or the second time (1) the wall clock hits the ambiguous time. + See Also + -------- + Timedelta : Represents a duration, the difference between two dates or times. + datetime.datetime : Python datetime.datetime object. + Notes ----- There are essentially three calling conventions for the constructor. The @@ -1319,6 +1417,11 @@ class Timestamp(_Timestamp): >>> pd.Timestamp(1513393355.5, unit='s') Timestamp('2017-12-16 03:02:35.500000') + This converts an int representing a Unix-epoch in units of weeks + + >>> pd.Timestamp(1535, unit='W') + Timestamp('1999-06-03 00:00:00') + This converts an int representing a Unix-epoch in units of seconds and for a particular timezone @@ -1413,6 +1516,18 @@ class Timestamp(_Timestamp): Return a new Timestamp representing UTC day and time. + See Also + -------- + Timestamp : Constructs an arbitrary datetime. + Timestamp.now : Return the current local date and time, which + can be timezone-aware. + Timestamp.today : Return the current local date and time with + timezone information set to None. + to_datetime : Convert argument to datetime. + date_range : Return a fixed frequency DatetimeIndex. + Timestamp.utctimetuple : Return UTC time tuple, compatible with + time.localtime(). + Examples -------- >>> pd.Timestamp.utcnow() # doctest: +SKIP @@ -1484,6 +1599,12 @@ class Timestamp(_Timestamp): See strftime documentation for more information on the format string: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior. + See Also + -------- + Timestamp.isoformat : Return the time formatted according to ISO 8601. + pd.to_datetime : Convert argument to datetime. + Period.strftime : Format a single Period. 
+ Examples -------- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') @@ -1553,6 +1674,14 @@ class Timestamp(_Timestamp): """ Return the daylight saving time (DST) adjustment. + This method returns the DST adjustment as a `datetime.timedelta` object + if the Timestamp is timezone-aware and DST is applicable. + + See Also + -------- + Timestamp.tz_localize : Localize the Timestamp to a timezone. + Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone. + Examples -------- >>> ts = pd.Timestamp('2000-06-01 00:00:00', tz='Europe/Brussels') @@ -1567,6 +1696,12 @@ class Timestamp(_Timestamp): """ Return a named tuple containing ISO year, week number, and weekday. + See Also + -------- + DatetimeIndex.isocalendar : Return a 3-tuple containing ISO year, + week number, and weekday for the given DatetimeIndex object. + datetime.date.isocalendar : The equivalent method for `datetime.date` objects. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00') @@ -1933,6 +2068,12 @@ class Timestamp(_Timestamp): """ Round the Timestamp to the specified resolution. + This method rounds the given Timestamp to the nearest multiple of the + specified frequency. It is particularly useful in data analysis to normalize + timestamps to regular frequency intervals. For instance, rounding to the + nearest minute, hour, or day can help in time series comparisons or + resampling operations. + Parameters ---------- freq : str @@ -1967,6 +2108,14 @@ timedelta}, default 'raise' ------ ValueError if the freq cannot be converted + See Also + -------- + Series.dt.round : Round the datetime values in a Series. + Timestamp.floor : Round the Timestamp downward to the nearest multiple + of the specified frequency. + Timestamp.ceil : Round the Timestamp upward to the nearest multiple of + the specified frequency. + Notes ----- If the Timestamp has a timezone, rounding will take place relative to the @@ -2147,6 +2296,12 @@ timedelta}, default 'raise' ------ ValueError if the freq cannot be converted. + See Also + -------- + Timestamp.floor : Round down a Timestamp to the specified resolution. + Timestamp.round : Round a Timestamp to the specified resolution. + Series.dt.ceil : Ceil the datetime values in a Series. + Notes ----- If the Timestamp has a timezone, ceiling will take place relative to the @@ -2332,6 +2487,11 @@ default 'raise' """ Convert timezone-aware Timestamp to another time zone. + This method is used to convert a timezone-aware Timestamp object to a + different time zone. The original UTC time remains the same; only the + time zone information is changed. If the Timestamp is timezone-naive, a + TypeError is raised. + Parameters ---------- tz : str, pytz.timezone, dateutil.tz.tzfile or None @@ -2347,6 +2507,13 @@ default 'raise' TypeError If Timestamp is tz-naive. + See Also + -------- + Timestamp.tz_localize : Localize the Timestamp to a timezone. + DatetimeIndex.tz_convert : Convert a DatetimeIndex to another time zone. + DatetimeIndex.tz_localize : Localize a DatetimeIndex to a specific time zone. + datetime.datetime.astimezone : Convert a datetime object to another time zone. + Examples -------- Create a timestamp object with UTC timezone: @@ -2606,6 +2773,12 @@ default 'raise' Monday == 0 ... Sunday == 6. + See Also + -------- + Timestamp.dayofweek : Return the day of the week with Monday=0, Sunday=6. + Timestamp.isoweekday : Return the day of the week with Monday=1, Sunday=7. + datetime.date.weekday : Equivalent method in datetime module. 
+ Examples -------- >>> ts = pd.Timestamp('2023-01-01') diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 12395b42bba19..85d03ea17bf42 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -10,7 +10,6 @@ ContextManager, cast, ) -import warnings import numpy as np @@ -58,7 +57,6 @@ assert_indexing_slices_equivalent, assert_interval_array_equal, assert_is_sorted, - assert_is_valid_plot_return_object, assert_metadata_equivalent, assert_numpy_array_equal, assert_period_array_equal, @@ -290,17 +288,11 @@ def box_expected(expected, box_cls, transpose: bool = True): else: expected = pd.array(expected, copy=False) elif box_cls is Index: - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning) - expected = Index(expected) + expected = Index(expected) elif box_cls is Series: - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning) - expected = Series(expected) + expected = Series(expected) elif box_cls is DataFrame: - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning) - expected = Series(expected).to_frame() + expected = Series(expected).to_frame() if transpose: # for vector operations, we need a DataFrame to be a single-row, # not a single-column, in order to operate against non-DataFrame @@ -538,8 +530,8 @@ def shares_memory(left, right) -> bool: left._mask, right._mask ) - if isinstance(left, DataFrame) and len(left._mgr.arrays) == 1: - arr = left._mgr.arrays[0] + if isinstance(left, DataFrame) and len(left._mgr.blocks) == 1: + arr = left._mgr.blocks[0].values return shares_memory(arr, right) raise NotImplementedError(type(left), type(right)) @@ -565,7 +557,6 @@ def shares_memory(left, right) -> bool: "assert_indexing_slices_equivalent", "assert_interval_array_equal", "assert_is_sorted", - "assert_is_valid_plot_return_object", "assert_metadata_equivalent", "assert_numpy_array_equal", "assert_period_array_equal", diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 543d7944e4c5d..1127a4512643c 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -196,7 +196,9 @@ def assert_index_equal( Parameters ---------- left : Index + The first index to compare. right : Index + The second index to compare. exact : bool or {'equiv'}, default 'equiv' Whether to check the Index class, dtype and inferred_type are identical. If 'equiv', then RangeIndex can be substituted for @@ -219,6 +221,11 @@ def assert_index_equal( Specify object name being compared, internally used to show appropriate assertion message. + See Also + -------- + testing.assert_series_equal : Check that two Series are equal. + testing.assert_frame_equal : Check that two DataFrames are equal. 
+ Examples -------- >>> from pandas import testing as tm @@ -422,28 +429,6 @@ def assert_attr_equal(attr: str, left, right, obj: str = "Attributes") -> None: return None -def assert_is_valid_plot_return_object(objs) -> None: - from matplotlib.artist import Artist - from matplotlib.axes import Axes - - if isinstance(objs, (Series, np.ndarray)): - if isinstance(objs, Series): - objs = objs._values - for el in objs.ravel(): - msg = ( - "one of 'objs' is not a matplotlib Axes instance, " - f"type encountered {type(el).__name__!r}" - ) - assert isinstance(el, (Axes, dict)), msg - else: - msg = ( - "objs is neither an ndarray of Artist instances nor a single " - "ArtistArtist instance, tuple, or dict, 'objs' is a " - f"{type(objs).__name__!r}" - ) - assert isinstance(objs, (Artist, tuple, dict)), msg - - def assert_is_sorted(seq) -> None: """Assert that the sequence is sorted.""" if isinstance(seq, (Index, Series)): @@ -850,7 +835,9 @@ def assert_series_equal( Parameters ---------- left : Series + First Series to compare. right : Series + Second Series to compare. check_dtype : bool, default True Whether to check the Series dtype is identical. check_index_type : bool or {'equiv'}, default 'equiv' @@ -901,6 +888,11 @@ def assert_series_equal( .. versionadded:: 1.5.0 + See Also + -------- + testing.assert_index_equal : Check that two Indexes are equal. + testing.assert_frame_equal : Check that two DataFrames are equal. + Examples -------- >>> from pandas import testing as tm diff --git a/pandas/_testing/contexts.py b/pandas/_testing/contexts.py index 7ebed8857f0af..91b5d2a981bef 100644 --- a/pandas/_testing/contexts.py +++ b/pandas/_testing/contexts.py @@ -11,8 +11,6 @@ ) import uuid -from pandas._config import using_copy_on_write - from pandas.compat import PYPY from pandas.errors import ChainedAssignmentError @@ -158,34 +156,25 @@ def with_csv_dialect(name: str, **kwargs) -> Generator[None, None, None]: csv.unregister_dialect(name) -def raises_chained_assignment_error(warn=True, extra_warnings=(), extra_match=()): +def raises_chained_assignment_error(extra_warnings=(), extra_match=()): from pandas._testing import assert_produces_warning - if not warn: - from contextlib import nullcontext - - return nullcontext() - - if PYPY and not extra_warnings: - from contextlib import nullcontext + if PYPY: + if not extra_warnings: + from contextlib import nullcontext - return nullcontext() - elif PYPY and extra_warnings: - return assert_produces_warning( - extra_warnings, - match=extra_match, - ) - else: - if using_copy_on_write(): - warning = ChainedAssignmentError - match = ( - "A value is trying to be set on a copy of a DataFrame or Series " - "through chained assignment" - ) + return nullcontext() else: - warning = FutureWarning # type: ignore[assignment] - # TODO update match - match = "ChainedAssignmentError" + return assert_produces_warning( + extra_warnings, + match=extra_match, + ) + else: + warning = ChainedAssignmentError + match = ( + "A value is trying to be set on a copy of a DataFrame or Series " + "through chained assignment" + ) if extra_warnings: warning = (warning, *extra_warnings) # type: ignore[assignment] return assert_produces_warning( diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index caa00b205a29c..4583e7edebbdc 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -22,6 +22,7 @@ PY311, PY312, PYPY, + WASM, ) import pandas.compat.compressors from pandas.compat.numpy import is_numpy_dev @@ -122,6 +123,18 @@ def is_platform_power() -> bool: 
return platform.machine() in ("ppc64", "ppc64le") +def is_platform_riscv64() -> bool: + """ + Checking if the running platform use riscv64 architecture. + + Returns + ------- + bool + True if the running platform uses riscv64 architecture. + """ + return platform.machine() == "riscv64" + + def is_ci_environment() -> bool: """ Checking if running in a continuous integration environment by checking @@ -195,4 +208,5 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: "PY311", "PY312", "PYPY", + "WASM", ] diff --git a/pandas/compat/_constants.py b/pandas/compat/_constants.py index 7bc3fbaaefebf..2625389e5254a 100644 --- a/pandas/compat/_constants.py +++ b/pandas/compat/_constants.py @@ -17,6 +17,7 @@ PY311 = sys.version_info >= (3, 11) PY312 = sys.version_info >= (3, 12) PYPY = platform.python_implementation() == "PyPy" +WASM = (sys.platform == "emscripten") or (platform.machine() in ["wasm32", "wasm64"]) ISMUSL = "musl" in (sysconfig.get_config_var("HOST_GNU_TYPE") or "") REF_COUNT = 2 if PY311 else 3 @@ -27,4 +28,5 @@ "PY311", "PY312", "PYPY", + "WASM", ] diff --git a/pandas/conftest.py b/pandas/conftest.py index 21100178262c8..163c3890a7f6d 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -150,6 +150,7 @@ def pytest_collection_modifyitems(items, config) -> None: ("is_categorical_dtype", "is_categorical_dtype is deprecated"), ("is_sparse", "is_sparse is deprecated"), ("DataFrameGroupBy.fillna", "DataFrameGroupBy.fillna is deprecated"), + ("DataFrameGroupBy.corrwith", "DataFrameGroupBy.corrwith is deprecated"), ("NDFrame.replace", "Series.replace without 'value'"), ("NDFrame.clip", "Downcasting behavior in Series and DataFrame methods"), ("Series.idxmin", "The behavior of Series.idxmin"), @@ -158,6 +159,7 @@ def pytest_collection_modifyitems(items, config) -> None: ("SeriesGroupBy.idxmin", "The behavior of Series.idxmin"), ("SeriesGroupBy.idxmax", "The behavior of Series.idxmax"), ("to_pytimedelta", "The behavior of TimedeltaProperties.to_pytimedelta"), + ("NDFrame.reindex_like", "keyword argument 'method' is deprecated"), # Docstring divides by zero to show behavior difference ("missing.mask_zero_div_zero", "divide by zero encountered"), ( @@ -672,47 +674,47 @@ def _create_mi_with_dt64tz_level(): indices_dict = { - "string": Index([f"pandas_{i}" for i in range(100)]), - "datetime": date_range("2020-01-01", periods=100), - "datetime-tz": date_range("2020-01-01", periods=100, tz="US/Pacific"), - "period": period_range("2020-01-01", periods=100, freq="D"), - "timedelta": timedelta_range(start="1 day", periods=100, freq="D"), - "range": RangeIndex(100), - "int8": Index(np.arange(100), dtype="int8"), - "int16": Index(np.arange(100), dtype="int16"), - "int32": Index(np.arange(100), dtype="int32"), - "int64": Index(np.arange(100), dtype="int64"), - "uint8": Index(np.arange(100), dtype="uint8"), - "uint16": Index(np.arange(100), dtype="uint16"), - "uint32": Index(np.arange(100), dtype="uint32"), - "uint64": Index(np.arange(100), dtype="uint64"), - "float32": Index(np.arange(100), dtype="float32"), - "float64": Index(np.arange(100), dtype="float64"), + "string": Index([f"pandas_{i}" for i in range(10)]), + "datetime": date_range("2020-01-01", periods=10), + "datetime-tz": date_range("2020-01-01", periods=10, tz="US/Pacific"), + "period": period_range("2020-01-01", periods=10, freq="D"), + "timedelta": timedelta_range(start="1 day", periods=10, freq="D"), + "range": RangeIndex(10), + "int8": Index(np.arange(10), dtype="int8"), + "int16": Index(np.arange(10), dtype="int16"), + 
"int32": Index(np.arange(10), dtype="int32"), + "int64": Index(np.arange(10), dtype="int64"), + "uint8": Index(np.arange(10), dtype="uint8"), + "uint16": Index(np.arange(10), dtype="uint16"), + "uint32": Index(np.arange(10), dtype="uint32"), + "uint64": Index(np.arange(10), dtype="uint64"), + "float32": Index(np.arange(10), dtype="float32"), + "float64": Index(np.arange(10), dtype="float64"), "bool-object": Index([True, False] * 5, dtype=object), "bool-dtype": Index([True, False] * 5, dtype=bool), "complex64": Index( - np.arange(100, dtype="complex64") + 1.0j * np.arange(100, dtype="complex64") + np.arange(10, dtype="complex64") + 1.0j * np.arange(10, dtype="complex64") ), "complex128": Index( - np.arange(100, dtype="complex128") + 1.0j * np.arange(100, dtype="complex128") + np.arange(10, dtype="complex128") + 1.0j * np.arange(10, dtype="complex128") ), - "categorical": CategoricalIndex(list("abcd") * 25), - "interval": IntervalIndex.from_breaks(np.linspace(0, 100, num=101)), + "categorical": CategoricalIndex(list("abcd") * 2), + "interval": IntervalIndex.from_breaks(np.linspace(0, 100, num=11)), "empty": Index([]), "tuples": MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])), "mi-with-dt64tz-level": _create_mi_with_dt64tz_level(), "multi": _create_multiindex(), "repeats": Index([0, 0, 1, 1, 2, 2]), - "nullable_int": Index(np.arange(100), dtype="Int64"), - "nullable_uint": Index(np.arange(100), dtype="UInt16"), - "nullable_float": Index(np.arange(100), dtype="Float32"), - "nullable_bool": Index(np.arange(100).astype(bool), dtype="boolean"), + "nullable_int": Index(np.arange(10), dtype="Int64"), + "nullable_uint": Index(np.arange(10), dtype="UInt16"), + "nullable_float": Index(np.arange(10), dtype="Float32"), + "nullable_bool": Index(np.arange(10).astype(bool), dtype="boolean"), "string-python": Index( - pd.array([f"pandas_{i}" for i in range(100)], dtype="string[python]") + pd.array([f"pandas_{i}" for i in range(10)], dtype="string[python]") ), } if has_pyarrow: - idx = Index(pd.array([f"pandas_{i}" for i in range(100)], dtype="string[pyarrow]")) + idx = Index(pd.array([f"pandas_{i}" for i in range(10)], dtype="string[pyarrow]")) indices_dict["string-pyarrow"] = idx diff --git a/pandas/core/_numba/executor.py b/pandas/core/_numba/executor.py index 0a26acb7df60a..82fd4e34ac67b 100644 --- a/pandas/core/_numba/executor.py +++ b/pandas/core/_numba/executor.py @@ -14,6 +14,8 @@ from pandas.compat._optional import import_optional_dependency +from pandas.core.util.numba_ import jit_user_function + @functools.cache def generate_apply_looper(func, nopython=True, nogil=True, parallel=False): @@ -21,10 +23,10 @@ def generate_apply_looper(func, nopython=True, nogil=True, parallel=False): import numba else: numba = import_optional_dependency("numba") - nb_compat_func = numba.extending.register_jitable(func) + nb_compat_func = jit_user_function(func) @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) - def nb_looper(values, axis): + def nb_looper(values, axis, *args): # Operate on the first row/col in order to get # the output shape if axis == 0: @@ -33,7 +35,7 @@ def nb_looper(values, axis): else: first_elem = values[0] dim0 = values.shape[0] - res0 = nb_compat_func(first_elem) + res0 = nb_compat_func(first_elem, *args) # Use np.asarray to get shape for # https://github.com/numba/numba/issues/4202#issuecomment-1185981507 buf_shape = (dim0,) + np.atleast_1d(np.asarray(res0)).shape @@ -44,11 +46,11 @@ def nb_looper(values, axis): if axis == 1: buff[0] = res0 for i in numba.prange(1, 
values.shape[0]): - buff[i] = nb_compat_func(values[i]) + buff[i] = nb_compat_func(values[i], *args) else: buff[:, 0] = res0 for j in numba.prange(1, values.shape[1]): - buff[:, j] = nb_compat_func(values[:, j]) + buff[:, j] = nb_compat_func(values[:, j], *args) return buff return nb_looper diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index 39c471c3db440..3acbfc3eabbac 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -195,17 +195,11 @@ def add_delegate_accessors(cls): return add_delegate_accessors -# Ported with modifications from xarray; licence at LICENSES/XARRAY_LICENSE -# https://github.com/pydata/xarray/blob/master/xarray/core/extensions.py -# 1. We don't need to catch and re-raise AttributeErrors as RuntimeErrors -# 2. We use a UserWarning instead of a custom Warning - - -class CachedAccessor: +class Accessor: """ Custom property-like object. - A descriptor for caching accessors. + A descriptor for accessors. Parameters ---------- @@ -229,13 +223,12 @@ def __get__(self, obj, cls): if obj is None: # we're accessing the attribute of the class, i.e., Dataset.geo return self._accessor - accessor_obj = self._accessor(obj) - # Replace the property with the accessor object. Inspired by: - # https://www.pydanny.com/cached-property.html - # We need to use object.__setattr__ because we overwrite __setattr__ on - # NDFrame - object.__setattr__(obj, self._name, accessor_obj) - return accessor_obj + return self._accessor(obj) + + +# Alias kept for downstream libraries +# TODO: Deprecate as name is now misleading +CachedAccessor = Accessor @doc(klass="", examples="", others="") @@ -295,7 +288,7 @@ def decorator(accessor: TypeT) -> TypeT: UserWarning, stacklevel=find_stack_level(), ) - setattr(cls, name, CachedAccessor(name, accessor)) + setattr(cls, name, Accessor(name, accessor)) cls._accessors.add(name) return accessor diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 33beef23197bd..0d97f8a298fdb 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -319,6 +319,8 @@ def unique(values): Parameters ---------- values : 1d array-like + The input array-like object containing values from which to extract + unique values. Returns ------- @@ -346,14 +348,15 @@ def unique(values): array([2, 1]) >>> pd.unique(pd.Series([pd.Timestamp("20160101"), pd.Timestamp("20160101")])) - array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]') + array(['2016-01-01T00:00:00'], dtype='datetime64[s]') >>> pd.unique( ... pd.Series( ... [ ... pd.Timestamp("20160101", tz="US/Eastern"), ... pd.Timestamp("20160101", tz="US/Eastern"), - ... ] + ... ], + ... dtype="M8[ns, US/Eastern]", ... ) ... ) @@ -365,7 +368,8 @@ def unique(values): ... [ ... pd.Timestamp("20160101", tz="US/Eastern"), ... pd.Timestamp("20160101", tz="US/Eastern"), - ... ] + ... ], + ... dtype="M8[ns, US/Eastern]", ... ) ... 
) DatetimeIndex(['2016-01-01 00:00:00-05:00'], diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 832beeddcef3c..75ad17b59bf88 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -51,6 +51,10 @@ from pandas.core._numba.executor import generate_apply_looper import pandas.core.common as com from pandas.core.construction import ensure_wrapped_if_datetimelike +from pandas.core.util.numba_ import ( + get_jit_arguments, + prepare_function_arguments, +) if TYPE_CHECKING: from collections.abc import ( @@ -70,7 +74,6 @@ from pandas.core.resample import Resampler from pandas.core.window.rolling import BaseWindow - ResType = dict[int, Any] @@ -471,8 +474,30 @@ def compute_dict_like( keys += [key] * len(key_data) results += key_data - else: + elif is_groupby: # key used for column selection and output + + df = selected_obj + results, keys = [], [] + for key, how in func.items(): + cols = df[key] + + if cols.ndim == 1: + series_list = [obj._gotitem(key, ndim=1, subset=cols)] + else: + series_list = [] + for index in range(cols.shape[1]): + col = cols.iloc[:, index] + + series = obj._gotitem(key, ndim=1, subset=col) + series_list.append(series) + + for series in series_list: + result = getattr(series, op_name)(how, **kwargs) + results.append(result) + keys.append(key) + + else: results = [ getattr(obj._gotitem(key, ndim=1), op_name)(how, **kwargs) for key, how in func.items() @@ -496,11 +521,14 @@ def wrap_results_dict_like( is_ndframe = [isinstance(r, ABCNDFrame) for r in result_data] if all(is_ndframe): - results = dict(zip(result_index, result_data)) + results = [result for result in result_data if not result.empty] keys_to_use: Iterable[Hashable] - keys_to_use = [k for k in result_index if not results[k].empty] + keys_to_use = [k for k, v in zip(result_index, result_data) if not v.empty] # Have to check, if at least one DataFrame is not empty. 
- keys_to_use = keys_to_use if keys_to_use != [] else result_index + if keys_to_use == []: + keys_to_use = result_index + results = result_data + if selected_obj.ndim == 2: # keys are columns, so we can preserve names ktu = Index(keys_to_use) @@ -509,7 +537,7 @@ def wrap_results_dict_like( axis: AxisInt = 0 if isinstance(obj, ABCSeries) else 1 result = concat( - {k: results[k] for k in keys_to_use}, + results, axis=axis, keys=keys_to_use, ) @@ -628,7 +656,8 @@ def normalize_dictlike_arg( cols = Index(list(func.keys())).difference(obj.columns, sort=True) if len(cols) > 0: - raise KeyError(f"Column(s) {list(cols)} do not exist") + # GH 58474 + raise KeyError(f"Label(s) {list(cols)} do not exist") aggregator_types = (list, tuple, dict) @@ -663,7 +692,7 @@ def _apply_str(self, obj, func: str, *args, **kwargs): # people may aggregate on a non-callable attribute # but don't let them think they can pass args to it assert len(args) == 0 - assert len([kwarg for kwarg in kwargs if kwarg not in ["axis"]]) == 0 + assert not any(kwarg == "axis" for kwarg in kwargs) return f elif hasattr(np, func) and hasattr(obj, "__array__"): # in particular exclude Window @@ -971,17 +1000,20 @@ def wrapper(*args, **kwargs): return wrapper if engine == "numba": - engine_kwargs = {} if engine_kwargs is None else engine_kwargs - + args, kwargs = prepare_function_arguments( + self.func, # type: ignore[arg-type] + self.args, + self.kwargs, + ) # error: Argument 1 to "__call__" of "_lru_cache_wrapper" has # incompatible type "Callable[..., Any] | str | list[Callable # [..., Any] | str] | dict[Hashable,Callable[..., Any] | str | # list[Callable[..., Any] | str]]"; expected "Hashable" nb_looper = generate_apply_looper( self.func, # type: ignore[arg-type] - **engine_kwargs, + **get_jit_arguments(engine_kwargs, kwargs), ) - result = nb_looper(self.values, self.axis) + result = nb_looper(self.values, self.axis, *args) # If we made the result 2-D, squeeze it back to 1-D result = np.squeeze(result) else: @@ -1122,21 +1154,23 @@ def generate_numba_apply_func( # Currently the parallel argument doesn't get passed through here # (it's disabled) since the dicts in numba aren't thread-safe. 
@numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) - def numba_func(values, col_names, df_index): + def numba_func(values, col_names, df_index, *args): results = {} for j in range(values.shape[1]): # Create the series ser = Series( values[:, j], index=df_index, name=maybe_cast_str(col_names[j]) ) - results[j] = jitted_udf(ser) + results[j] = jitted_udf(ser, *args) return results return numba_func def apply_with_numba(self) -> dict[int, Any]: + func = cast(Callable, self.func) + args, kwargs = prepare_function_arguments(func, self.args, self.kwargs) nb_func = self.generate_numba_apply_func( - cast(Callable, self.func), **self.engine_kwargs + func, **get_jit_arguments(self.engine_kwargs, kwargs) ) from pandas.core._numba.extensions import set_numba_data @@ -1151,7 +1185,7 @@ def apply_with_numba(self) -> dict[int, Any]: # Convert from numba dict to regular dict # Our isinstance checks in the df constructor don't pass for numbas typed dict with set_numba_data(index) as index, set_numba_data(columns) as columns: - res = dict(nb_func(self.values, columns, index)) + res = dict(nb_func(self.values, columns, index, *args)) return res @property @@ -1259,7 +1293,7 @@ def generate_numba_apply_func( jitted_udf = numba.extending.register_jitable(func) @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) - def numba_func(values, col_names_index, index): + def numba_func(values, col_names_index, index, *args): results = {} # Currently the parallel argument doesn't get passed through here # (it's disabled) since the dicts in numba aren't thread-safe. @@ -1271,15 +1305,17 @@ def numba_func(values, col_names_index, index): index=col_names_index, name=maybe_cast_str(index[i]), ) - results[i] = jitted_udf(ser) + results[i] = jitted_udf(ser, *args) return results return numba_func def apply_with_numba(self) -> dict[int, Any]: + func = cast(Callable, self.func) + args, kwargs = prepare_function_arguments(func, self.args, self.kwargs) nb_func = self.generate_numba_apply_func( - cast(Callable, self.func), **self.engine_kwargs + func, **get_jit_arguments(self.engine_kwargs, kwargs) ) from pandas.core._numba.extensions import set_numba_data @@ -1290,7 +1326,7 @@ def apply_with_numba(self) -> dict[int, Any]: set_numba_data(self.obj.index) as index, set_numba_data(self.columns) as columns, ): - res = dict(nb_func(self.values, columns, index)) + res = dict(nb_func(self.values, columns, index, *args)) return res @@ -1824,11 +1860,13 @@ def relabel_result( com.get_callable_name(f) if not isinstance(f, str) else f for f in fun ] col_idx_order = Index(s.index).get_indexer(fun) - s = s.iloc[col_idx_order] - + valid_idx = col_idx_order != -1 + if valid_idx.any(): + s = s.iloc[col_idx_order[valid_idx]] # assign the new user-provided "named aggregation" as index names, and reindex # it based on the whole user-provided names. 
- s.index = reordered_indexes[idx : idx + len(fun)] + if not s.empty: + s.index = reordered_indexes[idx : idx + len(fun)] reordered_result_in_dict[col] = s.reindex(columns) idx = idx + len(fun) return reordered_result_in_dict diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py index 1fa610f35f56b..03c73489bd3d8 100644 --- a/pandas/core/arraylike.py +++ b/pandas/core/arraylike.py @@ -329,8 +329,8 @@ def array_ufunc(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) reconstruct_axes = dict(zip(self._AXIS_ORDERS, self.axes)) if self.ndim == 1: - names = [getattr(x, "name") for x in inputs if hasattr(x, "name")] - name = names[0] if len(set(names)) == 1 else None + names = {getattr(x, "name") for x in inputs if hasattr(x, "name")} + name = names.pop() if len(names) == 1 else None reconstruct_kwargs = {"name": name} else: reconstruct_kwargs = {} diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index cbd0221cc2082..7b941e7ea8338 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -330,6 +330,13 @@ def _pad_or_backfill( @doc(ExtensionArray.fillna) def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self: mask = self.isna() + if limit is not None and limit < len(self): + # mypy doesn't like that mask can be an EA which need not have `cumsum` + modify = mask.cumsum() > limit # type: ignore[union-attr] + if modify.any(): + # Only copy mask if necessary + mask = mask.copy() + mask[modify] = False # error: Argument 2 to "check_value_size" has incompatible type # "ExtensionArray"; expected "ndarray" value = missing.check_value_size( diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 0240433cdb683..8c39e0d87df4e 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -18,7 +18,6 @@ from pandas._libs import lib from pandas._libs.tslibs import ( - NaT, Timedelta, Timestamp, timezones, @@ -1052,8 +1051,7 @@ def _pad_or_backfill( copy: bool = True, ) -> Self: if not self._hasna: - # TODO(CoW): Not necessary anymore when CoW is the default - return self.copy() + return self if limit is None and limit_area is None: method = missing.clean_fill_method(method) @@ -1084,7 +1082,6 @@ def fillna( copy: bool = True, ) -> Self: if not self._hasna: - # TODO(CoW): Not necessary anymore when CoW is the default return self.copy() if limit is not None: @@ -1425,7 +1422,7 @@ def to_numpy( result[~mask] = data[~mask]._pa_array.to_numpy() return result - def map(self, mapper, na_action=None): + def map(self, mapper, na_action: Literal["ignore"] | None = None): if is_numeric_dtype(self.dtype): return map_array(self.to_numpy(), mapper, na_action=na_action) else: @@ -1880,7 +1877,8 @@ def __setitem__(self, key, value) -> None: raise ValueError("Length of indexer and values mismatch") if len(indices) == 0: return - argsort = np.argsort(indices) + # GH#58530 wrong item assignment by repeated key + _, argsort = np.unique(indices, return_index=True) indices = indices[argsort] value = value.take(argsort) mask = np.zeros(len(self), dtype=np.bool_) @@ -2613,17 +2611,19 @@ def _str_wrap(self, width: int, **kwargs) -> Self: @property def _dt_days(self) -> Self: return type(self)( - pa.array(self._to_timedeltaarray().days, from_pandas=True, type=pa.int32()) + pa.array( + self._to_timedeltaarray().components.days, + from_pandas=True, + type=pa.int32(), + ) ) @property def _dt_hours(self) -> Self: return type(self)( pa.array( - [ - td.components.hours if td is not NaT 
else None - for td in self._to_timedeltaarray() - ], + self._to_timedeltaarray().components.hours, + from_pandas=True, type=pa.int32(), ) ) @@ -2632,10 +2632,8 @@ def _dt_hours(self) -> Self: def _dt_minutes(self) -> Self: return type(self)( pa.array( - [ - td.components.minutes if td is not NaT else None - for td in self._to_timedeltaarray() - ], + self._to_timedeltaarray().components.minutes, + from_pandas=True, type=pa.int32(), ) ) @@ -2644,7 +2642,9 @@ def _dt_minutes(self) -> Self: def _dt_seconds(self) -> Self: return type(self)( pa.array( - self._to_timedeltaarray().seconds, from_pandas=True, type=pa.int32() + self._to_timedeltaarray().components.seconds, + from_pandas=True, + type=pa.int32(), ) ) @@ -2652,10 +2652,8 @@ def _dt_seconds(self) -> Self: def _dt_milliseconds(self) -> Self: return type(self)( pa.array( - [ - td.components.milliseconds if td is not NaT else None - for td in self._to_timedeltaarray() - ], + self._to_timedeltaarray().components.milliseconds, + from_pandas=True, type=pa.int32(), ) ) @@ -2664,7 +2662,7 @@ def _dt_milliseconds(self) -> Self: def _dt_microseconds(self) -> Self: return type(self)( pa.array( - self._to_timedeltaarray().microseconds, + self._to_timedeltaarray().components.microseconds, from_pandas=True, type=pa.int32(), ) @@ -2674,7 +2672,9 @@ def _dt_microseconds(self) -> Self: def _dt_nanoseconds(self) -> Self: return type(self)( pa.array( - self._to_timedeltaarray().nanoseconds, from_pandas=True, type=pa.int32() + self._to_timedeltaarray().components.nanoseconds, + from_pandas=True, + type=pa.int32(), ) ) @@ -2971,7 +2971,7 @@ def transpose_homogeneous_pyarrow( """ arrays = list(arrays) nrows, ncols = len(arrays[0]), len(arrays) - indices = np.arange(nrows * ncols).reshape(ncols, nrows).T.flatten() + indices = np.arange(nrows * ncols).reshape(ncols, nrows).T.reshape(-1) arr = pa.chunked_array([chunk for arr in arrays for chunk in arr._pa_array.chunks]) arr = arr.take(indices) return [ArrowExtensionArray(arr.slice(i * ncols, ncols)) for i in range(nrows)] diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 8a2856d0a7e64..f83fdcd46b371 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -158,6 +158,12 @@ class ExtensionArray: _values_for_argsort _values_for_factorize + See Also + -------- + api.extensions.ExtensionDtype : A custom data type, to be paired with an + ExtensionArray. + api.extensions.ExtensionArray.dtype : An instance of ExtensionDtype. + Notes ----- The interface includes the following abstract methods that must be @@ -289,6 +295,13 @@ def _from_sequence( ------- ExtensionArray + See Also + -------- + api.extensions.ExtensionArray._from_sequence_of_strings : Construct a new + ExtensionArray from a sequence of strings. + api.extensions.ExtensionArray._hash_pandas_object : Hook for + hash_pandas_object. + Examples -------- >>> pd.arrays.IntegerArray._from_sequence([4, 5]) @@ -352,6 +365,16 @@ def _from_sequence_of_strings( ------- ExtensionArray + See Also + -------- + api.extensions.ExtensionArray._from_sequence : Construct a new ExtensionArray + from a sequence of scalars. + api.extensions.ExtensionArray._from_factorized : Reconstruct an ExtensionArray + after factorization. + api.extensions.ExtensionArray._from_scalars : Strict analogue to _from_sequence, + allowing only sequences of scalars that should be specifically inferred to + the given dtype. 
+ Examples -------- >>> pd.arrays.IntegerArray._from_sequence_of_strings( @@ -597,6 +620,13 @@ def shape(self) -> Shape: """ Return a tuple of the array dimensions. + See Also + -------- + numpy.ndarray.shape : Similar attribute which returns the shape of an array. + DataFrame.shape : Return a tuple representing the dimensionality of the + DataFrame. + Series.shape : Return a tuple representing the dimensionality of the Series. + Examples -------- >>> arr = pd.array([1, 2, 3]) @@ -742,7 +772,8 @@ def isna(self) -> np.ndarray | ExtensionArraySupportsAnyAll: If returning an ExtensionArray, then * ``na_values._is_boolean`` should be True - * `na_values` should implement :func:`ExtensionArray._reduce` + * ``na_values`` should implement :func:`ExtensionArray._reduce` + * ``na_values`` should implement :func:`ExtensionArray._accumulate` * ``na_values.any`` and ``na_values.all`` should be implemented Examples @@ -1045,19 +1076,12 @@ def fillna( Alternatively, an array-like "value" can be given. It's expected that the array-like have the same length as 'self'. limit : int, default None - If method is specified, this is the maximum number of consecutive - NaN values to forward/backward fill. In other words, if there is - a gap with more than this number of consecutive NaNs, it will only - be partially filled. If method is not specified, this is the - maximum number of entries along the entire axis where NaNs will be - filled. + The maximum number of entries where NA values will be filled. copy : bool, default True Whether to make a copy of the data before filling. If False, then the original should be modified and no new memory should be allocated. For ExtensionArray subclasses that cannot do this, it is at the author's discretion whether to ignore "copy=False" or to raise. - The base class implementation ignores the keyword in pad/backfill - cases. Returns ------- @@ -1073,6 +1097,15 @@ def fillna( Length: 6, dtype: Int64 """ mask = self.isna() + if limit is not None and limit < len(self): + # isna can return an ExtensionArray, we're assuming that comparisons + # are implemented. + # mypy doesn't like that mask can be an EA which need not have `cumsum` + modify = mask.cumsum() > limit # type: ignore[union-attr] + if modify.any(): + # Only copy mask if necessary + mask = mask.copy() + mask[modify] = False # error: Argument 2 to "check_value_size" has incompatible type # "ExtensionArray"; expected "ndarray" value = missing.check_value_size( @@ -1159,6 +1192,13 @@ def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray: ExtensionArray Shifted. + See Also + -------- + api.extensions.ExtensionArray.transpose : Return a transposed view on + this array. + api.extensions.ExtensionArray.factorize : Encode the extension array as an + enumerated type. + Notes ----- If ``self`` is empty or ``periods`` is 0, a copy of ``self`` is @@ -1294,12 +1334,23 @@ def equals(self, other: object) -> bool: boolean Whether the arrays are equivalent. + See Also + -------- + numpy.array_equal : Equivalent method for numpy array. + Series.equals : Equivalent method for Series. + DataFrame.equals : Equivalent method for DataFrame. + Examples -------- >>> arr1 = pd.array([1, 2, np.nan]) >>> arr2 = pd.array([1, 2, np.nan]) >>> arr1.equals(arr2) True + + >>> arr1 = pd.array([1, 3, np.nan]) + >>> arr2 = pd.array([1, 2, np.nan]) + >>> arr1.equals(arr2) + False """ if type(self) != type(other): return False @@ -1588,9 +1639,19 @@ def copy(self) -> Self: """ Return a copy of the array. 
+ This method creates a copy of the `ExtensionArray` where modifying the + data in the copy will not affect the original array. This is useful when + you want to manipulate data without altering the original dataset. + Returns ------- ExtensionArray + A new `ExtensionArray` object that is a copy of the current instance. + + See Also + -------- + DataFrame.copy : Return a copy of the DataFrame. + Series.copy : Return a copy of the Series. Examples -------- @@ -1707,6 +1768,17 @@ def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]: when ``boxed=False`` and :func:`str` is used when ``boxed=True``. + See Also + -------- + api.extensions.ExtensionArray._concat_same_type : Concatenate multiple + array of this dtype. + api.extensions.ExtensionArray._explode : Transform each element of + list-like to a row. + api.extensions.ExtensionArray._from_factorized : Reconstruct an + ExtensionArray after factorization. + api.extensions.ExtensionArray._from_sequence : Construct a new + ExtensionArray from a sequence of scalars. + Examples -------- >>> class MyExtensionArray(pd.arrays.NumpyExtensionArray): @@ -1783,11 +1855,21 @@ def _concat_same_type(cls, to_concat: Sequence[Self]) -> Self: Parameters ---------- to_concat : sequence of this type + An array of the same dtype to concatenate. Returns ------- ExtensionArray + See Also + -------- + api.extensions.ExtensionArray._explode : Transform each element of + list-like to a row. + api.extensions.ExtensionArray._formatter : Formatting function for + scalar values. + api.extensions.ExtensionArray._from_factorized : Reconstruct an + ExtensionArray after factorization. + Examples -------- >>> arr1 = pd.array([1, 2, 3]) @@ -1838,11 +1920,20 @@ def _accumulate( Returns ------- array + An array performing the accumulation operation. Raises ------ NotImplementedError : subclass does not define accumulations + See Also + -------- + api.extensions.ExtensionArray._concat_same_type : Concatenate multiple + array of this dtype. + api.extensions.ExtensionArray.view : Return a view on the array. + api.extensions.ExtensionArray._explode : Transform each element of + list-like to a row. + Examples -------- >>> arr = pd.array([1, 2, 3]) @@ -1870,12 +1961,6 @@ def _reduce( keepdims : bool, default False If False, a scalar is returned. If True, the result has dimension with size one along the reduced axis. - - .. versionadded:: 2.1 - - This parameter is not required in the _reduce signature to keep backward - compatibility, but will become required in the future. If the parameter - is not found in the method signature, a FutureWarning will be emitted. **kwargs Additional keyword arguments passed to the reduction function. Currently, `ddof` is the only supported kwarg. @@ -1948,6 +2033,13 @@ def _hash_pandas_object( Returns ------- np.ndarray[uint64] + An array of hashed values. + + See Also + -------- + api.extensions.ExtensionArray._values_for_factorize : Return an array and + missing value suitable for factorization. + util.hash_array : Given a 1d array, return an array of hashed values. Examples -------- @@ -2224,7 +2316,7 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): return arraylike.default_array_ufunc(self, ufunc, method, *inputs, **kwargs) - def map(self, mapper, na_action=None): + def map(self, mapper, na_action: Literal["ignore"] | None = None): """ Map values using an input mapping or function. 
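The ``fillna`` hunks above (in ``_mixins.py`` and ``base.py``, mirrored later in ``masked.py``) share one idea: before filling, trim the NA mask so that at most ``limit`` positions are filled. A minimal sketch of that masking step, assuming a plain NumPy boolean mask (an extension-array mask behaves the same once it supports ``cumsum``):

    import numpy as np

    mask = np.array([True, False, True, True, False, True])  # True marks an NA slot
    limit = 2

    if limit is not None and limit < len(mask):
        # True once more than `limit` NA values have been seen, scanning left to right
        modify = mask.cumsum() > limit
        if modify.any():
            mask = mask.copy()  # copy only when something actually changes
            mask[modify] = False

    print(mask.tolist())  # [True, False, True, False, False, False]

Only the first ``limit`` NA positions remain eligible for filling; the running-count comparison keeps the check fully vectorized, and the copy is skipped whenever ``limit`` is never exceeded.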
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 11dea697d9b93..64e5eec43a5c1 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1310,7 +1310,7 @@ def add_categories(self, new_categories) -> Self: Parameters ---------- new_categories : category or list-like of category - The new categories to be included. + The new categories to be included. Returns ------- @@ -1483,7 +1483,7 @@ def remove_unused_categories(self) -> Self: def map( self, mapper, - na_action: Literal["ignore"] | None | lib.NoDefault = lib.no_default, + na_action: Literal["ignore"] | None = None, ): """ Map categories using an input mapping or function. @@ -1501,15 +1501,10 @@ def map( ---------- mapper : function, dict, or Series Mapping correspondence. - na_action : {None, 'ignore'}, default 'ignore' + na_action : {None, 'ignore'}, default None If 'ignore', propagate NaN values, without passing them to the mapping correspondence. - .. deprecated:: 2.1.0 - - The default value of 'ignore' has been deprecated and will be changed to - None in the future. - Returns ------- pandas.Categorical or pandas.Index @@ -1561,17 +1556,6 @@ def map( >>> cat.map({"a": "first", "b": "second"}, na_action=None) Index(['first', 'second', nan], dtype='object') """ - if na_action is lib.no_default: - warnings.warn( - "The default value of 'ignore' for the `na_action` parameter in " - "pandas.Categorical.map is deprecated and will be " - "changed to 'None' in a future version. Please set na_action to the " - "desired value to avoid seeing this warning", - FutureWarning, - stacklevel=find_stack_level(), - ) - na_action = "ignore" - assert callable(mapper) or is_dict_like(mapper) new_categories = self.categories.map(mapper) @@ -2863,6 +2847,12 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): Parameters ---------- data : Series or CategoricalIndex + The object to which the categorical accessor is attached. + + See Also + -------- + Series.dt : Accessor object for datetimelike properties of the Series values. + Series.sparse : Accessor for sparse matrix data types. Examples -------- @@ -2987,6 +2977,12 @@ def codes(self) -> Series: """ Return Series of codes as well as the index. + See Also + -------- + Series.cat.categories : Return the categories of this categorical. + Series.cat.as_ordered : Set the Categorical to be ordered. + Series.cat.as_unordered : Set the Categorical to be unordered. + Examples -------- >>> raw_cate = pd.Categorical(["a", "b", "c", "a"], categories=["a", "b"]) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index ab17ae43215d2..673001337767b 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -728,7 +728,7 @@ def _unbox(self, other) -> np.int64 | np.datetime64 | np.timedelta64 | np.ndarra # pandas assumes they're there. 
@ravel_compat - def map(self, mapper, na_action=None): + def map(self, mapper, na_action: Literal["ignore"] | None = None): from pandas import Index result = map_array(self, mapper, na_action=na_action) @@ -759,14 +759,6 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: values = ensure_wrapped_if_datetimelike(values) if not isinstance(values, type(self)): - inferable = [ - "timedelta", - "timedelta64", - "datetime", - "datetime64", - "date", - "period", - ] if values.dtype == object: values = lib.maybe_convert_objects( values, # type: ignore[arg-type] @@ -775,32 +767,11 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: ) if values.dtype != object: return self.isin(values) - - inferred = lib.infer_dtype(values, skipna=False) - if inferred not in inferable: - if inferred == "string": - pass - - elif "mixed" in inferred: - return isin(self.astype(object), values) - else: - return np.zeros(self.shape, dtype=bool) - - try: - values = type(self)._from_sequence(values) - except ValueError: - return isin(self.astype(object), values) - else: - warnings.warn( - # GH#53111 - f"The behavior of 'isin' with dtype={self.dtype} and " - "castable values (e.g. strings) is deprecated. In a " - "future version, these will not be considered matching " - "by isin. Explicitly cast to the appropriate dtype before " - "calling isin instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) + else: + # TODO: Deprecate this case + # https://github.com/pandas-dev/pandas/pull/58645/files#r1604055791 + return isin(self.astype(object), values) + return np.zeros(self.shape, dtype=bool) if self.dtype.kind in "mM": self = cast("DatetimeArray | TimedeltaArray", self) @@ -1878,11 +1849,11 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: >>> rng_tz.floor("2h", ambiguous=False) DatetimeIndex(['2021-10-31 02:00:00+01:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) >>> rng_tz.floor("2h", ambiguous=True) DatetimeIndex(['2021-10-31 02:00:00+02:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) """ _floor_example = """>>> rng.floor('h') @@ -1905,11 +1876,11 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: >>> rng_tz.floor("2h", ambiguous=False) DatetimeIndex(['2021-10-31 02:00:00+01:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) >>> rng_tz.floor("2h", ambiguous=True) DatetimeIndex(['2021-10-31 02:00:00+02:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) """ _ceil_example = """>>> rng.ceil('h') @@ -1932,11 +1903,11 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: >>> rng_tz.ceil("h", ambiguous=False) DatetimeIndex(['2021-10-31 02:00:00+01:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) >>> rng_tz.ceil("h", ambiguous=True) DatetimeIndex(['2021-10-31 02:00:00+02:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) """ diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 8747f795bebd8..e0a4587535cfd 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -143,10 +143,14 @@ def f(self): month_kw = 12 if freq: kwds = freq.kwds - month_kw = kwds.get("startingMonth", kwds.get("month", 12)) + month_kw = 
kwds.get("startingMonth", kwds.get("month", month_kw)) + if freq is not None: + freq_name = freq.name + else: + freq_name = None result = fields.get_start_end_field( - values, field, self.freqstr, month_kw, reso=self._creso + values, field, freq_name, month_kw, reso=self._creso ) else: result = fields.get_date_field(values, field, reso=self._creso) @@ -214,7 +218,7 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): # type: ignore[misc] ... ) ['2023-01-01 00:00:00', '2023-01-02 00:00:00'] - Length: 2, dtype: datetime64[ns] + Length: 2, dtype: datetime64[s] """ _typ = "datetimearray" @@ -609,7 +613,7 @@ def tz(self) -> tzinfo | None: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.tz datetime.timezone.utc @@ -1043,7 +1047,7 @@ def tz_localize( 4 2018-10-28 02:30:00+01:00 5 2018-10-28 03:00:00+01:00 6 2018-10-28 03:30:00+01:00 - dtype: datetime64[ns, CET] + dtype: datetime64[s, CET] In some cases, inferring the DST is impossible. In such cases, you can pass an ndarray to the ambiguous parameter to set the DST explicitly @@ -1055,14 +1059,14 @@ def tz_localize( 0 2018-10-28 01:20:00+02:00 1 2018-10-28 02:36:00+02:00 2 2018-10-28 03:46:00+01:00 - dtype: datetime64[ns, CET] + dtype: datetime64[s, CET] If the DST transition causes nonexistent times, you can shift these dates forward or backwards with a timedelta object or `'shift_forward'` or `'shift_backwards'`. >>> s = pd.to_datetime(pd.Series(['2015-03-29 02:30:00', - ... '2015-03-29 03:30:00'])) + ... '2015-03-29 03:30:00'], dtype="M8[ns]")) >>> s.dt.tz_localize('Europe/Warsaw', nonexistent='shift_forward') 0 2015-03-29 03:00:00+02:00 1 2015-03-29 03:30:00+02:00 @@ -1423,7 +1427,7 @@ def time(self) -> npt.NDArray[np.object_]: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.time 0 10:00:00 1 11:00:00 @@ -1466,7 +1470,7 @@ def timetz(self) -> npt.NDArray[np.object_]: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.timetz 0 10:00:00+00:00 1 11:00:00+00:00 @@ -1508,7 +1512,7 @@ def date(self) -> npt.NDArray[np.object_]: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.date 0 2020-01-01 1 2020-02-01 @@ -1857,7 +1861,7 @@ def isocalendar(self) -> DataFrame: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.dayofyear 0 1 1 32 @@ -1893,7 +1897,7 @@ def isocalendar(self) -> DataFrame: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-04-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.quarter 0 1 1 2 @@ -1913,6 +1917,15 @@ def isocalendar(self) -> DataFrame: """ The number of days in the month. + See Also + -------- + Series.dt.day : Return the day of the month. + Series.dt.is_month_end : Return a boolean indicating if the + date is the last day of the month. + Series.dt.is_month_start : Return a boolean indicating if the + date is the first day of the month. + Series.dt.month : Return the month as January=1 through December=12. 
+ Examples -------- >>> s = pd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) @@ -1920,7 +1933,7 @@ def isocalendar(self) -> DataFrame: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.daysinmonth 0 31 1 29 @@ -2106,6 +2119,32 @@ def isocalendar(self) -> DataFrame: >>> idx.is_year_start array([False, False, True]) + + This method, when applied to Series with datetime values under + the ``.dt`` accessor, will lose information about Business offsets. + + >>> dates = pd.Series(pd.date_range("2020-10-30", periods=4, freq="BYS")) + >>> dates + 0 2021-01-01 + 1 2022-01-03 + 2 2023-01-02 + 3 2024-01-01 + dtype: datetime64[ns] + + >>> dates.dt.is_year_start + 0 True + 1 False + 2 False + 3 True + dtype: bool + + >>> idx = pd.date_range("2020-10-30", periods=4, freq="BYS") + >>> idx + DatetimeIndex(['2021-01-01', '2022-01-03', '2023-01-02', '2024-01-01'], + dtype='datetime64[ns]', freq='BYS-JAN') + + >>> idx.is_year_start + array([ True, True, True, True]) """, ) is_year_end = _field_accessor( @@ -2359,9 +2398,9 @@ def _sequence_to_dt64( data, copy = maybe_convert_dtype(data, copy, tz=tz) data_dtype = getattr(data, "dtype", None) - if out_unit is None: - out_unit = "ns" - out_dtype = np.dtype(f"M8[{out_unit}]") + out_dtype = DT64NS_DTYPE + if out_unit is not None: + out_dtype = np.dtype(f"M8[{out_unit}]") if data_dtype == object or is_string_dtype(data_dtype): # TODO: We do not have tests specific to string-dtypes, @@ -2387,7 +2426,7 @@ def _sequence_to_dt64( dayfirst=dayfirst, yearfirst=yearfirst, allow_object=False, - out_unit=out_unit or "ns", + out_unit=out_unit, ) copy = False if tz and inferred_tz: @@ -2495,7 +2534,7 @@ def objects_to_datetime64( utc: bool = False, errors: DateTimeErrorChoices = "raise", allow_object: bool = False, - out_unit: str = "ns", + out_unit: str | None = None, ) -> tuple[np.ndarray, tzinfo | None]: """ Convert data to array of timestamps. @@ -2511,7 +2550,8 @@ def objects_to_datetime64( allow_object : bool Whether to return an object-dtype ndarray instead of raising if the data contains more than one timezone. - out_unit : str, default "ns" + out_unit : str or None, default None + None indicates we should do resolution inference. Returns ------- diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 86f58b48ea3be..2e1ea7236e5c4 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -905,12 +905,7 @@ def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self: value(s) passed should be either Interval objects or NA/NaN. limit : int, default None (Not implemented yet for IntervalArray) - If method is specified, this is the maximum number of consecutive - NaN values to forward/backward fill. In other words, if there is - a gap with more than this number of consecutive NaNs, it will only - be partially filled. If method is not specified, this is the - maximum number of entries along the entire axis where NaNs will be - filled. + The maximum number of entries where NA values will be filled. copy : bool, default True Whether to make a copy of the data before filling. If False, then the original should be modified and no new memory should be allocated. 
@@ -923,6 +918,8 @@ def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self: """ if copy is False: raise NotImplementedError + if limit is not None: + raise ValueError("limit must be None") value_left, value_right = self._validate_scalar(value) @@ -1392,6 +1389,12 @@ def closed(self) -> IntervalClosedType: Either ``left``, ``right``, ``both`` or ``neither``. + See Also + -------- + IntervalArray.closed : Returns inclusive side of the IntervalArray. + Interval.closed : Returns inclusive side of the Interval. + IntervalIndex.closed : Returns inclusive side of the IntervalIndex. + Examples -------- @@ -1433,12 +1436,26 @@ def closed(self) -> IntervalClosedType: """ ) - @Appender( - _interval_shared_docs["set_closed"] - % { - "klass": "IntervalArray", - "examples": textwrap.dedent( - """\ + def set_closed(self, closed: IntervalClosedType) -> Self: + """ + Return an identical IntervalArray closed on the specified side. + + Parameters + ---------- + closed : {'left', 'right', 'both', 'neither'} + Whether the intervals are closed on the left-side, right-side, both + or neither. + + Returns + ------- + IntervalArray + A new IntervalArray with the specified side closures. + + See Also + -------- + IntervalArray.closed : Returns inclusive side of the IntervalArray. + Interval.closed : Returns inclusive side of the Interval. + Examples -------- >>> index = pd.arrays.IntervalArray.from_breaks(range(4)) @@ -1446,15 +1463,11 @@ def closed(self) -> IntervalClosedType: <IntervalArray> [(0, 1], (1, 2], (2, 3]] Length: 3, dtype: interval[int64, right] - >>> index.set_closed('both') + >>> index.set_closed("both") [[0, 1], [1, 2], [2, 3]] Length: 3, dtype: interval[int64, both] """ - ), - } - ) - def set_closed(self, closed: IntervalClosedType) -> Self: if closed not in VALID_CLOSED: msg = f"invalid option for 'closed': {closed}" raise ValueError(msg) @@ -1506,10 +1519,54 @@ def set_closed(self, closed: IntervalClosedType) -> Self: """ @property - @Appender( - _interval_shared_docs["is_non_overlapping_monotonic"] % _shared_docs_kwargs - ) def is_non_overlapping_monotonic(self) -> bool: + """ + Return a boolean whether the IntervalArray/IntervalIndex\ + is non-overlapping and monotonic. + + Non-overlapping means no Intervals share points, and monotonic means + either monotonic increasing or monotonic decreasing. + + See Also + -------- + overlaps : Check if two IntervalIndex objects overlap. + + Examples + -------- + For arrays: + + >>> interv_arr = pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]) + >>> interv_arr + <IntervalArray> + [(0, 1], (1, 5]] + Length: 2, dtype: interval[int64, right] + >>> interv_arr.is_non_overlapping_monotonic + True + + >>> interv_arr = pd.arrays.IntervalArray( + ... [pd.Interval(0, 1), pd.Interval(-1, 0.1)] + ... ) + >>> interv_arr + <IntervalArray> + [(0.0, 1.0], (-1.0, 0.1]] + Length: 2, dtype: interval[float64, right] + >>> interv_arr.is_non_overlapping_monotonic + False + + For Interval Index: + + >>> interv_idx = pd.interval_range(start=0, end=2) + >>> interv_idx + IntervalIndex([(0, 1], (1, 2]], dtype='interval[int64, right]') + >>> interv_idx.is_non_overlapping_monotonic + True + + >>> interv_idx = pd.interval_range(start=0, end=2, closed="both") + >>> interv_idx + IntervalIndex([[0, 1], [1, 2]], dtype='interval[int64, both]') + >>> interv_idx.is_non_overlapping_monotonic + False + """ 
# we already require left <= right @@ -1621,39 +1678,51 @@ def __arrow_array__(self, type=None): """ ) - @Appender( - _interval_shared_docs["to_tuples"] - % { - "return_type": ( - "ndarray (if self is IntervalArray) or Index (if self is IntervalIndex)" - ), - "examples": textwrap.dedent( - """\ - - Examples - -------- - For :class:`pandas.IntervalArray`: - - >>> idx = pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 2)]) - >>> idx - <IntervalArray> - [(0, 1], (1, 2]] - Length: 2, dtype: interval[int64, right] - >>> idx.to_tuples() - array([(0, 1), (1, 2)], dtype=object) - - For :class:`pandas.IntervalIndex`: - - >>> idx = pd.interval_range(start=0, end=2) - >>> idx - IntervalIndex([(0, 1], (1, 2]], dtype='interval[int64, right]') - >>> idx.to_tuples() - Index([(0, 1), (1, 2)], dtype='object') - """ - ), - } - ) def to_tuples(self, na_tuple: bool = True) -> np.ndarray: + """ + Return an ndarray (if self is IntervalArray) or Index \ + (if self is IntervalIndex) of tuples of the form (left, right). + + Parameters + ---------- + na_tuple : bool, default True + If ``True``, return ``NA`` as a tuple ``(nan, nan)``. If ``False``, + just return ``NA`` as ``nan``. + + Returns + ------- + ndarray or Index + An ndarray of tuples representing the intervals + if `self` is an IntervalArray. + An Index of tuples representing the intervals + if `self` is an IntervalIndex. + + See Also + -------- + IntervalArray.to_list : Convert IntervalArray to a list of tuples. + IntervalArray.to_numpy : Convert IntervalArray to a numpy array. + IntervalArray.unique : Find unique intervals in an IntervalArray. + + Examples + -------- + For :class:`pandas.IntervalArray`: + + >>> idx = pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 2)]) + >>> idx + <IntervalArray> + [(0, 1], (1, 2]] + Length: 2, dtype: interval[int64, right] + >>> idx.to_tuples() + array([(0, 1), (1, 2)], dtype=object) + + For :class:`pandas.IntervalIndex`: + + >>> idx = pd.interval_range(start=0, end=2) + >>> idx + IntervalIndex([(0, 1], (1, 2]], dtype='interval[int64, right]') + >>> idx.to_tuples() + Index([(0, 1), (1, 2)], dtype='object') + """ tuples = com.asarray_tuplesafe(zip(self._left, self._right)) if not na_tuple: # GH 18756 @@ -1750,22 +1819,40 @@ def repeat( """ ) - @Appender( - _interval_shared_docs["contains"] - % { - "klass": "IntervalArray", - "examples": textwrap.dedent( - """\ + def contains(self, other): + """ + Check elementwise if the Intervals contain the value. + + Return a boolean mask whether the value is contained in the Intervals + of the IntervalArray. + + Parameters + ---------- + other : scalar + The value to check whether it is contained in the Intervals. + + Returns + ------- + boolean array + A boolean mask whether the value is contained in the Intervals. + + See Also + -------- + Interval.contains : Check whether Interval object contains value. + IntervalArray.overlaps : Check if an Interval overlaps the values in the + IntervalArray. 
+ + Examples + -------- >>> intervals = pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 3), (2, 4)]) >>> intervals <IntervalArray> [(0, 1], (1, 3], (2, 4]] Length: 3, dtype: interval[int64, right] + + >>> intervals.contains(0.5) + array([ True, False, False]) """ - ), - } - ) - def contains(self, other): if isinstance(other, Interval): raise NotImplementedError("contains not implemented for two intervals") diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index df794183f67d1..04cffcaaa5f04 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -232,6 +232,12 @@ def _pad_or_backfill( @doc(ExtensionArray.fillna) def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self: mask = self._mask + if limit is not None and limit < len(self): + modify = mask.cumsum() > limit + if modify.any(): + # Only copy mask if necessary + mask = mask.copy() + mask[modify] = False value = missing.check_value_size(value, mask, len(self)) @@ -1312,7 +1318,7 @@ def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): ) return self._wrap_reduction_result("max", result, skipna=skipna, axis=axis) - def map(self, mapper, na_action=None): + def map(self, mapper, na_action: Literal["ignore"] | None = None): return map_array(self.to_numpy(), mapper, na_action=na_action) @overload diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 522d86fb165f6..adf8f44377e62 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -717,6 +717,7 @@ def fillna( ---------- value : scalar limit : int, optional + Not supported for SparseArray, must be None. copy: bool, default True Ignored for SparseArray. @@ -736,6 +737,8 @@ def fillna( When ``self.fill_value`` is not NA, the result dtype will be ``self.dtype``. Again, this preserves the amount of memory used. """ + if limit is not None: + raise ValueError("limit must be None") new_values = np.where(isna(self.sp_values), value, self.sp_values) if self._null_fill_value: @@ -1250,7 +1253,7 @@ def astype(self, dtype: AstypeArg | None = None, copy: bool = True): return self._simple_new(sp_values, self.sp_index, dtype) - def map(self, mapper, na_action=None) -> Self: + def map(self, mapper, na_action: Literal["ignore"] | None = None) -> Self: """ Map categories using an input mapping or function. 
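The ``string_arrow.py`` hunks below swap every ``pa.string()`` construction for ``pa.large_string()``. The practical difference is offset width: ``string`` uses 32-bit offsets, which caps a single Arrow array at roughly 2 GiB of character data, while ``large_string`` uses 64-bit offsets. A small illustration of the two Arrow types (pyarrow assumed available; the values are arbitrary):

    import pyarrow as pa
    import pyarrow.compute as pc

    small = pa.array(["a", "bb"], type=pa.string())        # 32-bit offsets
    large = pa.array(["a", "bb"], type=pa.large_string())  # 64-bit offsets

    print(small.type, large.type)                  # string large_string
    print(pa.types.is_large_string(large.type))    # True
    print(pc.cast(small, pa.large_string()).type)  # large_string

This is also why the ``isin`` hunk below accepts both ``pa.string()`` and ``pa.large_string()`` scalars in ``value_set``: membership tests keep working for either offset width during the transition.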
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index ec2534ce174ac..f2fd9d5d6610f 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -196,13 +196,13 @@ def _from_sequence( na_values = scalars._mask result = scalars._data result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) - return cls(pa.array(result, mask=na_values, type=pa.string())) + return cls(pa.array(result, mask=na_values, type=pa.large_string())) elif isinstance(scalars, (pa.Array, pa.ChunkedArray)): - return cls(pc.cast(scalars, pa.string())) + return cls(pc.cast(scalars, pa.large_string())) # convert non-na-likes to str result = lib.ensure_string_array(scalars, copy=copy) - return cls(pa.array(result, type=pa.string(), from_pandas=True)) + return cls(pa.array(result, type=pa.large_string(), from_pandas=True)) @classmethod def _from_sequence_of_strings( @@ -245,7 +245,7 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: value_set = [ pa_scalar.as_py() for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values] - if pa_scalar.type in (pa.string(), pa.null()) + if pa_scalar.type in (pa.string(), pa.null(), pa.large_string()) ] # short-circuit to return all False array. @@ -332,7 +332,9 @@ def _str_map( result = lib.map_infer_mask( arr, f, mask.view("uint8"), convert=False, na_value=na_value ) - result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True) + result = pa.array( + result, mask=mask, type=pa.large_string(), from_pandas=True + ) return type(self)(result) else: # This is when the result type is object. We reach this when @@ -627,35 +629,34 @@ def _str_map( na_value = np.nan else: na_value = False - try: - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - dtype=np.dtype(cast(type, dtype)), - ) - return result - - except ValueError: - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - ) - if convert and result.dtype == object: - result = lib.maybe_convert_objects(result) - return result + + dtype = np.dtype(cast(type, dtype)) + if mask.any(): + # numpy int/bool dtypes cannot hold NaNs so we must convert to + # float64 for int (to match maybe_convert_objects) or + # object for bool (again to match maybe_convert_objects) + if is_integer_dtype(dtype): + dtype = np.dtype("float64") + else: + dtype = np.dtype(object) + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=dtype, + ) + return result elif is_string_dtype(dtype) and not is_object_dtype(dtype): # i.e. StringDtype result = lib.map_infer_mask( arr, f, mask.view("uint8"), convert=False, na_value=na_value ) - result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True) + result = pa.array( + result, mask=mask, type=pa.large_string(), from_pandas=True + ) return type(self)(result) else: # This is when the result type is object. We reach this when diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index ff43f97161136..865e81d7754ef 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -799,6 +799,12 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]: days_docstring = textwrap.dedent( """Number of days for each element. + See Also + -------- + Series.dt.seconds : Return number of seconds for each element. + Series.dt.microseconds : Return number of microseconds for each element. 
+ Series.dt.nanoseconds : Return number of nanoseconds for each element. + Examples -------- For Series: diff --git a/pandas/core/base.py b/pandas/core/base.py index 87e87538ca1d9..b784dc8b03292 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -342,6 +342,12 @@ def shape(self) -> Shape: """ Return a tuple of the shape of the underlying data. + See Also + -------- + Series.ndim : Number of dimensions of the underlying data. + Series.size : Return the number of elements in the underlying data. + Series.nbytes : Return the number of bytes in the underlying data. + Examples -------- >>> s = pd.Series([1, 2, 3]) @@ -697,6 +703,11 @@ def empty(self) -> bool: """ Indicator whether Index is empty. + An Index is considered empty if it has no elements. This property can be + useful for quickly checking the state of an Index, especially in data + processing and analysis workflows where handling of empty datasets might + be required. + Returns ------- bool @@ -708,10 +719,10 @@ def empty(self) -> bool: Examples -------- - >>> idx_empty = pd.Index([1, 2, 3]) - >>> idx_empty + >>> idx = pd.Index([1, 2, 3]) + >>> idx Index([1, 2, 3], dtype='int64') - >>> idx_empty.empty + >>> idx.empty False >>> idx_empty = pd.Index([]) @@ -722,10 +733,10 @@ def empty(self) -> bool: If we only have NaNs in our DataFrame, it is not considered empty! - >>> idx_empty = pd.Index([np.nan, np.nan]) - >>> idx_empty + >>> idx = pd.Index([np.nan, np.nan]) + >>> idx Index([nan, nan], dtype='float64') - >>> idx_empty.empty + >>> idx.empty False """ return not self.size @@ -870,6 +881,11 @@ def __iter__(self) -> Iterator: Returns ------- iterator + An iterator yielding scalar values from the Series. + + See Also + -------- + Series.items : Lazily iterate over (index, value) tuples. Examples -------- @@ -898,6 +914,11 @@ def hasnans(self) -> bool: ------- bool + See Also + -------- + Series.isna : Detect missing values. + Series.notna : Detect existing (non-missing) values. + Examples -------- >>> s = pd.Series([1, 2, 3, None]) @@ -1097,6 +1118,12 @@ def is_unique(self) -> bool: ------- bool + See Also + -------- + Series.unique : Return unique values of Series object. + Series.drop_duplicates : Return Series with duplicate values removed. + Series.duplicated : Indicate duplicate Series values. + Examples -------- >>> s = pd.Series([1, 2, 3]) @@ -1118,6 +1145,11 @@ def is_monotonic_increasing(self) -> bool: ------- bool + See Also + -------- + Series.is_monotonic_decreasing : Return boolean if values in the object are + monotonically decreasing. + Examples -------- >>> s = pd.Series([1, 2, 2]) @@ -1141,6 +1173,11 @@ def is_monotonic_decreasing(self) -> bool: ------- bool + See Also + -------- + Series.is_monotonic_increasing : Return boolean if values in the object are + monotonically increasing. + Examples -------- >>> s = pd.Series([3, 2, 2, 1]) @@ -1297,7 +1334,7 @@ def factorize( 0 2000-03-11 1 2000-03-12 2 2000-03-13 - dtype: datetime64[ns] + dtype: datetime64[s] >>> ser.searchsorted('3/14/2000') 3 diff --git a/pandas/core/common.py b/pandas/core/common.py index 77e986a26fbe9..96291991227d9 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -335,11 +335,12 @@ def is_empty_slice(obj) -> bool: ) -def is_true_slices(line) -> list[bool]: +def is_true_slices(line: abc.Iterable) -> abc.Generator[bool, None, None]: """ - Find non-trivial slices in "line": return a list of booleans with same length. + Find non-trivial slices in "line": yields a bool. 
""" - return [isinstance(k, slice) and not is_null_slice(k) for k in line] + for k in line: + yield isinstance(k, slice) and not is_null_slice(k) # TODO: used only once in indexing; belongs elsewhere? diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index c949cfd1bc657..fee08c6199eef 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -193,6 +193,8 @@ def eval( corresponding bitwise operators. :class:`~pandas.Series` and :class:`~pandas.DataFrame` objects are supported and behave as they would with plain ol' Python evaluation. + `eval` can run arbitrary code which can make you vulnerable to code + injection if you pass user input to this function. Parameters ---------- diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index b7a1cb173f659..d69765e91f467 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -19,6 +19,7 @@ from pandas.core.dtypes.common import ( is_list_like, + is_numeric_dtype, is_scalar, ) @@ -508,10 +509,6 @@ def _disallow_scalar_only_bool_ops(self) -> None: raise NotImplementedError("cannot evaluate scalar only bool ops") -def isnumeric(dtype) -> bool: - return issubclass(np.dtype(dtype).type, np.number) - - class Div(BinOp): """ Div operator to special case casting. @@ -525,7 +522,9 @@ class Div(BinOp): def __init__(self, lhs, rhs) -> None: super().__init__("/", lhs, rhs) - if not isnumeric(lhs.return_type) or not isnumeric(rhs.return_type): + if not is_numeric_dtype(lhs.return_type) or not is_numeric_dtype( + rhs.return_type + ): raise TypeError( f"unsupported operand type(s) for {self.op}: " f"'{lhs.return_type}' and '{rhs.return_type}'" diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 2718e9819cdf8..360e1d5ddd3ff 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -38,7 +38,6 @@ ensure_object, is_list_like, is_object_dtype, - is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import NumpyEADtype @@ -555,9 +554,7 @@ def sanitize_array( # Avoid ending up with a NumpyExtensionArray dtype = dtype.numpy_dtype - object_index = False - if isinstance(data, ABCIndex) and data.dtype == object and dtype is None: - object_index = True + infer_object = not isinstance(data, (ABCIndex, ABCSeries)) # extract ndarray or ExtensionArray, ensure we have no NumpyExtensionArray data = extract_array(data, extract_numpy=True, extract_range=True) @@ -610,15 +607,8 @@ def sanitize_array( if dtype is None: subarr = data - if data.dtype == object: + if data.dtype == object and infer_object: subarr = maybe_infer_to_datetimelike(data) - if ( - object_index - and using_pyarrow_string_dtype() - and is_string_dtype(subarr) - ): - # Avoid inference when string option is set - subarr = data elif data.dtype.kind == "U" and using_pyarrow_string_dtype(): from pandas.core.arrays.string_ import StringDtype diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 2f8e59cd6e89c..d8a42d83b6c54 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -486,6 +486,14 @@ def register_extension_dtype(cls: type_t[ExtensionDtypeT]) -> type_t[ExtensionDt callable A class decorator. + See Also + -------- + api.extensions.ExtensionDtype : The base class for creating custom pandas + data types. + Series : One-dimensional array with axis labels. + DataFrame : Two-dimensional, size-mutable, potentially heterogeneous + tabular data. 
+ Examples -------- >>> from pandas.api.extensions import register_extension_dtype, ExtensionDtype diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index a130983337f64..662b8c5791e51 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1193,7 +1193,7 @@ def maybe_infer_to_datetimelike( # numpy would have done it for us. convert_numeric=False, convert_non_numeric=True, - dtype_if_all_nat=np.dtype("M8[ns]"), + dtype_if_all_nat=np.dtype("M8[s]"), ) @@ -1697,7 +1697,7 @@ def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.n ) raise ValueError("Trying to coerce float values to integers") if arr.dtype == object: - raise ValueError("Trying to coerce float values to integers") + raise ValueError("Trying to coerce object values to integers") if casted.dtype < arr.dtype: # TODO: Can this path be hit anymore with numpy > 2 diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 4d8d3c2816f69..2ac75a0700759 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -362,6 +362,13 @@ def is_timedelta64_dtype(arr_or_dtype) -> bool: boolean Whether or not the array-like or dtype is of the timedelta64 dtype. + See Also + -------- + api.types.is_timedelta64_ns_dtype : Check whether the provided array or dtype is + of the timedelta64[ns] dtype. + api.types.is_period_dtype : Check whether an array-like or dtype is of the + Period dtype. + Examples -------- >>> from pandas.core.dtypes.common import is_timedelta64_dtype @@ -873,6 +880,15 @@ def is_datetime64_any_dtype(arr_or_dtype) -> bool: bool Whether or not the array or dtype is of the datetime64 dtype. + See Also + -------- + api.types.is_datetime64_dtype : Check whether an array-like or dtype is of the + datetime64 dtype. + api.types.is_datetime64_ns_dtype : Check whether the provided array or dtype is of + the datetime64[ns] dtype. + api.types.is_datetime64tz_dtype : Check whether an array-like or dtype is of a + DatetimeTZDtype dtype. + Examples -------- >>> from pandas.api.types import is_datetime64_any_dtype diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index e52cbff451700..5213be8b69016 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -205,7 +205,7 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): by providing an empty index. As follows, >>> pd.CategoricalDtype(pd.DatetimeIndex([])).categories.dtype - dtype('<M8[ns]') + dtype('<M8[s]') @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: return None # categorical is aware of Sparse -> extract sparse subdtypes - dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes] + subtypes = (x.subtype if isinstance(x, SparseDtype) else x for x in dtypes) # extract the categories' dtype non_cat_dtypes = [ - x.categories.dtype if isinstance(x, CategoricalDtype) else x for x in dtypes + x.categories.dtype if isinstance(x, CategoricalDtype) else x + for x in subtypes ] # TODO should categorical always give an answer? from pandas.core.dtypes.cast import find_common_type @@ -1195,6 +1196,9 @@ class IntervalDtype(PandasExtensionDtype): ---------- subtype : str, np.dtype The dtype of the Interval bounds. + closed : {'right', 'left', 'both', 'neither'}, default 'right' + Whether the interval is closed on the left-side, right-side, both or + neither. See the Notes for more detailed explanation. Attributes ---------- @@ -1204,6 +1208,10 @@ class IntervalDtype(PandasExtensionDtype): ------- None + See Also + -------- + PeriodDtype : An ExtensionDtype for Period data. 
+ Examples -------- >>> pd.IntervalDtype(subtype="int64", closed="both") @@ -1304,6 +1312,10 @@ def subtype(self): """ The dtype of the Interval bounds. + See Also + -------- + IntervalDtype : An ExtensionDtype for Interval data. + Examples -------- >>> dtype = pd.IntervalDtype(subtype="int64", closed="both") @@ -1654,7 +1666,10 @@ class SparseDtype(ExtensionDtype): """ Dtype for data stored in :class:`SparseArray`. - This dtype implements the pandas ExtensionDtype interface. + `SparseDtype` is used as the data type for :class:`SparseArray`, enabling + more efficient storage of data that contains a significant number of + repetitive values typically represented by a fill value. It supports any + scalar dtype as the underlying data type of the non-fill values. Parameters ---------- @@ -1684,6 +1699,11 @@ class SparseDtype(ExtensionDtype): ------- None + See Also + -------- + arrays.SparseArray : The array structure that uses SparseDtype + for data representation. + Examples -------- >>> ser = pd.Series([1, 0, 0], dtype=pd.SparseDtype(dtype=int, fill_value=0)) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index f127c736e745a..f0e21136f8a97 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -148,7 +148,7 @@ def isna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame: >>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None, "2017-07-08"]) >>> index DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) >>> pd.isna(index) array([False, False, True, False]) @@ -362,7 +362,7 @@ def notna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame: >>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None, "2017-07-08"]) >>> index DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) >>> pd.notna(index) array([ True, True, False, True]) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 88e4d695b8328..0aeda77233125 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -21,7 +21,6 @@ Sequence, ) import functools -from inspect import signature from io import StringIO import itertools import operator @@ -125,7 +124,7 @@ ops, roperator, ) -from pandas.core.accessor import CachedAccessor +from pandas.core.accessor import Accessor from pandas.core.apply import reconstruct_and_relabel_result from pandas.core.array_algos.take import take_2d_multi from pandas.core.arraylike import OpsMixin @@ -729,10 +728,6 @@ def __init__( NDFrame.__init__(self, data) return - is_pandas_object = isinstance(data, (Series, Index, ExtensionArray)) - data_dtype = getattr(data, "dtype", None) - original_dtype = dtype - # GH47215 if isinstance(index, set): raise ValueError("index cannot be a set") @@ -897,18 +892,6 @@ def __init__( NDFrame.__init__(self, mgr) - if original_dtype is None and is_pandas_object and data_dtype == np.object_: - if self.dtypes.iloc[0] != data_dtype: - warnings.warn( - "Dtype inference on a pandas object " - "(Series, Index, ExtensionArray) is deprecated. The DataFrame " - "constructor will keep the original dtype in the future. " - "Call `infer_objects` on the result to get the old " - "behavior.", - FutureWarning, - stacklevel=2, - ) - # ---------------------------------------------------------------------- def __dataframe__( @@ -1063,7 +1046,7 @@ def _is_homogeneous_type(self) -> bool: False """ # The "<" part of "<=" here is for empty DataFrame cases - return len({arr.dtype for arr in self._mgr.arrays}) <= 1 + return len({block.values.dtype for block in self._mgr.blocks}) <= 1 @property def _can_fast_transpose(self) -> bool: @@ -2749,11 +2732,55 @@ def to_markdown( **kwargs, ) -> str | None: ... - @doc( - Series.to_markdown, - klass=_shared_doc_kwargs["klass"], - storage_options=_shared_docs["storage_options"], - examples="""Examples + def to_markdown( + self, + buf: FilePath | WriteBuffer[str] | None = None, + *, + mode: str = "wt", + index: bool = True, + storage_options: StorageOptions | None = None, + **kwargs, + ) -> str | None: + """ + Print DataFrame in Markdown-friendly format. + + Parameters + ---------- + buf : str, Path or StringIO-like, optional, default None + Buffer to write to. If None, the output is returned as a string. + mode : str, optional + Mode in which file is opened, "wt" by default. + index : bool, optional, default True + Add index (row) labels. + + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc. For HTTP(S) URLs the key-value pairs + are forwarded to ``urllib.request.Request`` as header options. For other + URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are + forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more + details, and for more examples on storage options refer `here <https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files>`_. + + **kwargs + These parameters will be passed to `tabulate <https://pypi.org/project/tabulate>`_. + + Returns + ------- + str + DataFrame in Markdown-friendly format. + + See Also + -------- + DataFrame.to_html : Render DataFrame to HTML-formatted table. + DataFrame.to_latex : Render DataFrame to LaTeX-formatted table. + + Notes + ----- + Requires the `tabulate <https://pypi.org/project/tabulate>`_ package. + + Examples -------- >>> df = pd.DataFrame( ... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]} ... ) >>> print(df.to_markdown()) | | animal_1 | animal_2 | |---:|:-----------|:-----------| | 0 | elk | dog | | 1 | pig | quetzal | Output markdown with a tabulate option. >>> print(df.to_markdown(tablefmt="grid")) +----+------------+------------+ | | animal_1 | animal_2 | +====+============+============+ | 0 | elk | dog | +----+------------+------------+ | 1 | pig | quetzal | - +----+------------+------------+""", - ) - def to_markdown( self, buf: FilePath | WriteBuffer[str] | None = None, *, mode: str = "wt", index: bool = True, storage_options: StorageOptions | None = None, **kwargs, ) -> str | None: + +----+------------+------------+ + """ if "showindex" in kwargs: raise ValueError("Pass 'index' instead of 'showindex'") @@ -4438,6 +4456,9 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No """ Query the columns of a DataFrame with a boolean expression. + This method can run arbitrary code which can make you vulnerable to code + injection if you pass user input to this function. + Parameters ---------- expr : str @@ -4540,36 +4561,44 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No Examples -------- >>> df = pd.DataFrame( - ... {"A": range(1, 6), "B": range(10, 0, -2), "C C": range(10, 5, -1)} + ... {"A": range(1, 6), "B": range(10, 0, -2), "C&C": range(10, 5, -1)} ... 
) >>> df - A B C C + A B C&C 0 1 10 10 1 2 8 9 2 3 6 8 3 4 4 7 4 5 2 6 >>> df.query("A > B") - A B C C + A B C&C 4 5 2 6 The previous expression is equivalent to >>> df[df.A > df.B] - A B C C + A B C&C 4 5 2 6 For columns with spaces in their name, you can use backtick quoting. - >>> df.query("B == `C C`") - A B C C + >>> df.query("B == `C&C`") + A B C&C 0 1 10 10 The previous expression is equivalent to - >>> df[df.B == df["C C"]] - A B C C + >>> df[df.B == df["C&C"]] + A B C&C 0 1 10 10 + + Using local variable: + + >>> local_var = 2 + >>> df.query("A <= @local_var") + A B C&C + 0 1 10 10 + 1 2 8 9 """ inplace = validate_bool_kwarg(inplace, "inplace") if not isinstance(expr, str): @@ -4610,6 +4639,13 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: ---------- expr : str The expression string to evaluate. + + You can refer to variables + in the environment by prefixing them with an '@' character like + ``@a + b``. + + You can refer to column names that are not valid Python variable + names by surrounding them with backticks `````. inplace : bool, default False If the expression contains an assignment, whether to perform the operation inplace and mutate the existing DataFrame. Otherwise, @@ -4641,14 +4677,16 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: Examples -------- - >>> df = pd.DataFrame({"A": range(1, 6), "B": range(10, 0, -2)}) + >>> df = pd.DataFrame( + ... {"A": range(1, 6), "B": range(10, 0, -2), "C&C": range(10, 5, -1)} + ... ) >>> df - A B - 0 1 10 - 1 2 8 - 2 3 6 - 3 4 4 - 4 5 2 + A B C&C + 0 1 10 10 + 1 2 8 9 + 2 3 6 8 + 3 4 4 7 + 4 5 2 6 >>> df.eval("A + B") 0 11 1 10 @@ -4660,35 +4698,55 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: Assignment is allowed though by default the original DataFrame is not modified. - >>> df.eval("C = A + B") - A B C - 0 1 10 11 - 1 2 8 10 - 2 3 6 9 - 3 4 4 8 - 4 5 2 7 + >>> df.eval("D = A + B") + A B C&C D + 0 1 10 10 11 + 1 2 8 9 10 + 2 3 6 8 9 + 3 4 4 7 8 + 4 5 2 6 7 >>> df - A B - 0 1 10 - 1 2 8 - 2 3 6 - 3 4 4 - 4 5 2 + A B C&C + 0 1 10 10 + 1 2 8 9 + 2 3 6 8 + 3 4 4 7 + 4 5 2 6 Multiple columns can be assigned to using multi-line expressions: >>> df.eval( ... ''' - ... C = A + B - ... D = A - B + ... D = A + B + ... E = A - B ... ''' ... ) - A B C D - 0 1 10 11 -9 - 1 2 8 10 -6 - 2 3 6 9 -3 - 3 4 4 8 0 - 4 5 2 7 3 + A B C&C D E + 0 1 10 10 11 -9 + 1 2 8 9 10 -6 + 2 3 6 8 9 -3 + 3 4 4 7 8 0 + 4 5 2 6 7 3 + + For columns with spaces in their name, you can use backtick quoting. + + >>> df.eval("B * `C&C`") + 0 100 + 1 72 + 2 48 + 3 28 + 4 12 + + Local variables shall be explicitly referenced using ``@`` + character in front of the name: + + >>> local_var = 2 + >>> df.eval("@local_var * A") + 0 2 + 1 4 + 2 6 + 3 8 + 4 10 """ from pandas.core.computation.eval import eval as _eval @@ -5024,22 +5082,7 @@ def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]: if is_list_like(value): com.require_length_match(value, self.index) - arr = sanitize_array(value, self.index, copy=True, allow_2d=True) - if ( - isinstance(value, Index) - and value.dtype == "object" - and arr.dtype != value.dtype - ): # - # TODO: Remove kludge in sanitize_array for string mode when enforcing - # this deprecation - warnings.warn( - "Setting an Index with object dtype into a DataFrame will stop " - "inferring another dtype in a future version. 
Cast the Index " - "explicitly before setting it into the DataFrame.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return arr, None + return sanitize_array(value, self.index, copy=True, allow_2d=True), None @property def _series(self): @@ -5683,7 +5726,6 @@ def shift( periods = cast(int, periods) ncols = len(self.columns) - arrays = self._mgr.arrays if axis == 1 and periods != 0 and ncols > 0 and freq is None: if fill_value is lib.no_default: # We will infer fill_value to match the closest column @@ -5709,12 +5751,12 @@ def shift( result.columns = self.columns.copy() return result - elif len(arrays) > 1 or ( + elif len(self._mgr.blocks) > 1 or ( # If we only have one block and we know that we can't # keep the same dtype (i.e. the _can_hold_element check) # then we can go through the reindex_indexer path # (and avoid casting logic in the Block method). - not can_hold_element(arrays[0], fill_value) + not can_hold_element(self._mgr.blocks[0].values, fill_value) ): # GH#35488 we need to watch out for multi-block cases # We only get here with fill_value not-lib.no_default @@ -6980,19 +7022,19 @@ def sort_values( f" != length of by ({len(by)})" ) if len(by) > 1: - keys = [self._get_label_or_level_values(x, axis=axis) for x in by] + keys = (self._get_label_or_level_values(x, axis=axis) for x in by) # need to rewrap columns in Series to apply key function if key is not None: - # error: List comprehension has incompatible type List[Series]; - # expected List[ndarray] - keys = [ - Series(k, name=name) # type: ignore[misc] - for (k, name) in zip(keys, by) - ] + keys_data = [Series(k, name=name) for (k, name) in zip(keys, by)] + else: + # error: Argument 1 to "list" has incompatible type + # "Generator[ExtensionArray | ndarray[Any, Any], None, None]"; + # expected "Iterable[Series]" + keys_data = list(keys) # type: ignore[arg-type] indexer = lexsort_indexer( - keys, orders=ascending, na_position=na_position, key=key + keys_data, orders=ascending, na_position=na_position, key=key ) elif len(by): # len(by) == 1 @@ -7607,16 +7649,30 @@ def nsmallest( """ return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nsmallest() - @doc( - Series.swaplevel, - klass=_shared_doc_kwargs["klass"], - extra_params=dedent( - """axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to swap levels on. 0 or 'index' for row-wise, 1 or - 'columns' for column-wise.""" - ), - examples=dedent( - """\ + def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame: + """ + Swap levels i and j in a :class:`MultiIndex`. + + Default is to swap the two innermost levels of the index. + + Parameters + ---------- + i, j : int or str + Levels of the indices to be swapped. Can pass level name as string. + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to swap levels on. 0 or 'index' for row-wise, 1 or + 'columns' for column-wise. + + Returns + ------- + DataFrame + DataFrame with levels swapped in MultiIndex. + + See Also + -------- + DataFrame.reorder_levels: Reorder levels of MultiIndex. + DataFrame.sort_index: Sort MultiIndex. + Examples -------- >>> df = pd.DataFrame( @@ -7666,10 +7722,8 @@ def nsmallest( History Final exam January A Geography Final exam February B History Coursework March A - Geography Coursework April C""" - ), - ) - def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame: + Geography Coursework April C + """ result = self.copy(deep=False) axis = self._get_axis_number(axis) @@ -9221,6 +9275,11 @@ def pivot( .. 
versionadded:: 1.3.0 + **kwargs : dict + Optional keyword arguments to pass to ``aggfunc``. + + .. versionadded:: 3.0.0 + Returns ------- DataFrame @@ -9328,6 +9387,7 @@ def pivot_table( margins_name: Level = "All", observed: bool = True, sort: bool = True, + **kwargs, ) -> DataFrame: from pandas.core.reshape.pivot import pivot_table @@ -9343,6 +9403,7 @@ def pivot_table( margins_name=margins_name, observed=observed, sort=sort, + **kwargs, ) def stack( @@ -10322,7 +10383,7 @@ def apply( return op.apply().__finalize__(self, method="apply") def map( - self, func: PythonFuncType, na_action: str | None = None, **kwargs + self, func: PythonFuncType, na_action: Literal["ignore"] | None = None, **kwargs ) -> DataFrame: """ Apply a function to a Dataframe elementwise. @@ -11376,28 +11437,11 @@ def func(values: np.ndarray): # We only use this in the case that operates on self.values return op(values, axis=axis, skipna=skipna, **kwds) - dtype_has_keepdims: dict[ExtensionDtype, bool] = {} - def blk_func(values, axis: Axis = 1): if isinstance(values, ExtensionArray): if not is_1d_only_ea_dtype(values.dtype): return values._reduce(name, axis=1, skipna=skipna, **kwds) - has_keepdims = dtype_has_keepdims.get(values.dtype) - if has_keepdims is None: - sign = signature(values._reduce) - has_keepdims = "keepdims" in sign.parameters - dtype_has_keepdims[values.dtype] = has_keepdims - if has_keepdims: - return values._reduce(name, skipna=skipna, keepdims=True, **kwds) - else: - warnings.warn( - f"{type(values)}._reduce will require a `keepdims` parameter " - "in the future", - FutureWarning, - stacklevel=find_stack_level(), - ) - result = values._reduce(name, skipna=skipna, **kwds) - return np.array([result]) + return values._reduce(name, skipna=skipna, keepdims=True, **kwds) else: return op(values, axis=axis, skipna=skipna, **kwds) @@ -11415,7 +11459,7 @@ def _get_data() -> DataFrame: if numeric_only: df = _get_data() if axis is None: - dtype = find_common_type([arr.dtype for arr in df._mgr.arrays]) + dtype = find_common_type([block.values.dtype for block in df._mgr.blocks]) if isinstance(dtype, ExtensionDtype): df = df.astype(dtype) arr = concat_compat(list(df._iter_column_arrays())) @@ -11440,7 +11484,9 @@ def _get_data() -> DataFrame: # kurtosis excluded since groupby does not implement it if df.shape[1] and name != "kurt": - dtype = find_common_type([arr.dtype for arr in df._mgr.arrays]) + dtype = find_common_type( + [block.values.dtype for block in df._mgr.blocks] + ) if isinstance(dtype, ExtensionDtype): # GH 54341: fastpath for EA-backed axis=1 reductions # This flattens the frame into a single 1D array while keeping @@ -11514,8 +11560,8 @@ def _reduce_axis1(self, name: str, func, skipna: bool) -> Series: else: raise NotImplementedError(name) - for arr in self._mgr.arrays: - middle = func(arr, axis=0, skipna=skipna) + for blocks in self._mgr.blocks: + middle = func(blocks.values, axis=0, skipna=skipna) result = ufunc(result, middle) res_ser = self._constructor_sliced(result, index=self.index, copy=False) @@ -11709,7 +11755,6 @@ def max( return result @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="sum") - @doc(make_doc("sum", ndim=2)) def sum( self, axis: Axis | None = 0, @@ -11718,6 +11763,87 @@ def sum( min_count: int = 0, **kwargs, ) -> Series: + """ + Return the sum of the values over the requested axis. + + This is equivalent to the method ``numpy.sum``. + + Parameters + ---------- + axis : {index (0), columns (1)} + Axis for the function to be applied on. 
+ For `Series` this parameter is unused and defaults to 0. + + .. warning:: + + The behavior of DataFrame.sum with ``axis=None`` is deprecated, + in a future version this will reduce over both axes and return a scalar. + To retain the old behavior, pass axis=0 (or do not pass axis). + + .. versionadded:: 2.0.0 + + skipna : bool, default True + Exclude NA/null values when computing the result. + numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. + min_count : int, default 0 + The required number of valid values to perform the operation. If fewer than + ``min_count`` non-NA values are present the result will be NA. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + Series or scalar + Sum over requested axis. + + See Also + -------- + Series.sum : Return the sum over Series values. + DataFrame.mean : Return the mean of the values over the requested axis. + DataFrame.median : Return the median of the values over the requested axis. + DataFrame.mode : Get the mode(s) of each element along the requested axis. + DataFrame.std : Return the standard deviation of the values over the + requested axis. + + Examples + -------- + >>> idx = pd.MultiIndex.from_arrays( + ... [["warm", "warm", "cold", "cold"], ["dog", "falcon", "fish", "spider"]], + ... names=["blooded", "animal"], + ... ) + >>> s = pd.Series([4, 2, 0, 8], name="legs", index=idx) + >>> s + blooded animal + warm dog 4 + falcon 2 + cold fish 0 + spider 8 + Name: legs, dtype: int64 + + >>> s.sum() + 14 + + By default, the sum of an empty or all-NA Series is ``0``. + + >>> pd.Series([], dtype="float64").sum() # min_count=0 is the default + 0.0 + + This can be controlled with the ``min_count`` parameter. For example, if + you'd like the sum of an empty series to be NaN, pass ``min_count=1``. + + >>> pd.Series([], dtype="float64").sum(min_count=1) + nan + + Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and + empty series identically. + + >>> pd.Series([np.nan]).sum() + 0.0 + + >>> pd.Series([np.nan]).sum(min_count=1) + nan + """ result = super().sum( axis=axis, skipna=skipna, min_count=min_count, **kwargs, ) @@ -11945,7 +12071,6 @@ def sem( ) -> Series | Any: ... @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="sem") - @doc(make_doc("sem", ndim=2)) def sem( self, axis: Axis | None = 0, skipna: bool = True, ddof: int = 1, numeric_only: bool = False, **kwargs, ) -> Series | Any: + """ + Return unbiased standard error of the mean over requested axis. + + Normalized by N-1 by default. This can be changed using the ddof argument. + + Parameters + ---------- + axis : {index (0), columns (1)} + For `Series` this parameter is unused and defaults to 0. + + .. warning:: + + The behavior of DataFrame.sem with ``axis=None`` is deprecated, + in a future version this will reduce over both axes and return a scalar. + To retain the old behavior, pass axis=0 (or do not pass axis). + + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. + numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. + **kwargs : + Additional keyword arguments passed to the function. + + Returns + ------- + Series or DataFrame (if level specified) + Unbiased standard error of the mean over requested axis. 
+ + See Also + -------- + DataFrame.var : Return unbiased variance over requested axis. + DataFrame.std : Returns sample standard deviation over requested axis. + + Examples + -------- + >>> s = pd.Series([1, 2, 3]) + >>> s.sem().round(6) + 0.57735 + + With a DataFrame + + >>> df = pd.DataFrame({"a": [1, 2], "b": [2, 3]}, index=["tiger", "zebra"]) + >>> df + a b + tiger 1 2 + zebra 2 3 + >>> df.sem() + a 0.5 + b 0.5 + dtype: float64 + + Using axis=1 + + >>> df.sem(axis=1) + tiger 0.5 + zebra 0.5 + dtype: float64 + + In this case, `numeric_only` should be set to `True` + to avoid getting an error. + + >>> df = pd.DataFrame({"a": [1, 2], "b": ["T", "Z"]}, index=["tiger", "zebra"]) + >>> df.sem(numeric_only=True) + a 0.5 + dtype: float64 + """ result = super().sem( axis=axis, skipna=skipna, ddof=ddof, numeric_only=numeric_only, **kwargs ) @@ -11996,7 +12191,6 @@ def var( ) -> Series | Any: ... @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="var") - @doc(make_doc("var", ndim=2)) def var( self, axis: Axis | None = 0, skipna: bool = True, ddof: int = 1, numeric_only: bool = False, **kwargs, ) -> Series | Any: + """ + Return unbiased variance over requested axis. + + Normalized by N-1 by default. This can be changed using the ddof argument. + + Parameters + ---------- + axis : {index (0), columns (1)} + For `Series` this parameter is unused and defaults to 0. + + .. warning:: + + The behavior of DataFrame.var with ``axis=None`` is deprecated, + in a future version this will reduce over both axes and return a scalar. + To retain the old behavior, pass axis=0 (or do not pass axis). + + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. + numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. + **kwargs : + Additional keyword arguments passed to the function. + + Returns + ------- + Series or scalar + Unbiased variance over requested axis. + + See Also + -------- + numpy.var : Equivalent function in NumPy. + Series.var : Return unbiased variance over Series values. + Series.std : Return standard deviation over Series values. + DataFrame.std : Return standard deviation of the values over + the requested axis. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "person_id": [0, 1, 2, 3], + ... "age": [21, 25, 62, 43], + ... "height": [1.61, 1.87, 1.49, 2.01], + ... } + ... ).set_index("person_id") + >>> df + age height + person_id + 0 21 1.61 + 1 25 1.87 + 2 62 1.49 + 3 43 2.01 + + >>> df.var() + age 352.916667 + height 0.056367 + dtype: float64 + + Alternatively, ``ddof=0`` can be set to normalize by N instead of N-1: + + >>> df.var(ddof=0) + age 264.687500 + height 0.042275 + dtype: float64 + """ result = super().var( axis=axis, skipna=skipna, ddof=ddof, numeric_only=numeric_only, **kwargs ) @@ -12047,7 +12310,6 @@ def std( ) -> Series | Any: ... @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="std") - @doc(make_doc("std", ndim=2)) def std( self, axis: Axis | None = 0, skipna: bool = True, ddof: int = 1, numeric_only: bool = False, **kwargs, ) -> Series | Any: + """ + Return sample standard deviation over requested axis. + + Normalized by N-1 by default. This can be changed using the ddof argument. 
+ + Parameters + ---------- + axis : {index (0), columns (1)} + For `Series` this parameter is unused and defaults to 0. + + .. warning:: + + The behavior of DataFrame.std with ``axis=None`` is deprecated, + in a future version this will reduce over both axes and return a scalar. + To retain the old behavior, pass axis=0 (or do not pass axis). + + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. + numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. + **kwargs : dict + Additional keyword arguments to be passed to the function. + + Returns + ------- + Series or scalar + Standard deviation over requested axis. + + See Also + -------- + Series.std : Return standard deviation over Series values. + DataFrame.mean : Return the mean of the values over the requested axis. + DataFrame.median : Return the median of the values over the requested axis. + DataFrame.mode : Get the mode(s) of each element along the requested axis. + DataFrame.sum : Return the sum of the values over the requested axis. + + Notes + ----- + To have the same behaviour as `numpy.std`, use `ddof=0` (instead of the + default `ddof=1`). + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "person_id": [0, 1, 2, 3], + ... "age": [21, 25, 62, 43], + ... "height": [1.61, 1.87, 1.49, 2.01], + ... } + ... ).set_index("person_id") + >>> df + age height + person_id + 0 21 1.61 + 1 25 1.87 + 2 62 1.49 + 3 43 2.01 + + The standard deviation of the columns can be found as follows: + + >>> df.std() + age 18.786076 + height 0.237417 + dtype: float64 + + Alternatively, `ddof=0` can be set to normalize by N instead of N-1: + + >>> df.std(ddof=0) + age 16.269219 + height 0.205609 + dtype: float64 + """ result = super().std( axis=axis, skipna=skipna, ddof=ddof, numeric_only=numeric_only, **kwargs ) @@ -12740,7 +13078,7 @@ def quantile( if len(data.columns) == 0: # GH#23925 _get_numeric_data may have dropped all columns - cols = Index([], name=self.columns.name) + cols = self.columns[:0] dtype = np.float64 if axis == 1: @@ -12940,7 +13278,7 @@ def to_period( >>> idx DatetimeIndex(['2001-03-31', '2002-05-31', '2003-08-31'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) >>> idx.to_period("M") PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]') @@ -13163,10 +13501,10 @@ def isin_(x): # ---------------------------------------------------------------------- # Add plotting methods to DataFrame - plot = CachedAccessor("plot", pandas.plotting.PlotAccessor) + plot = Accessor("plot", pandas.plotting.PlotAccessor) hist = pandas.plotting.hist_frame boxplot = pandas.plotting.boxplot_frame - sparse = CachedAccessor("sparse", SparseFrameAccessor) + sparse = Accessor("sparse", SparseFrameAccessor) # ---------------------------------------------------------------------- # Internal Interface Methods diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 24727bb9d83c1..93068c665a880 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -93,7 +93,10 @@ InvalidIndexError, ) from pandas.errors.cow import _chained_assignment_method_msg -from pandas.util._decorators import doc +from pandas.util._decorators import ( + deprecate_kwarg, + doc, +) from pandas.util._exceptions import find_stack_level from pandas.util._validators import ( 
check_dtype_backend, @@ -155,7 +158,6 @@ Index, MultiIndex, PeriodIndex, - RangeIndex, default_index, ensure_index, ) @@ -1747,11 +1749,15 @@ def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike if `key` matches multiple labels """ axis = self._get_axis_number(axis) - other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis] + first_other_axes = next( + (ax for ax in range(self._AXIS_LEN) if ax != axis), None + ) if self._is_label_reference(key, axis=axis): self._check_label_or_level_ambiguity(key, axis=axis) - values = self.xs(key, axis=other_axes[0])._values + if first_other_axes is None: + raise ValueError("axis matched all axes") + values = self.xs(key, axis=first_other_axes)._values elif self._is_level_reference(key, axis=axis): values = self.axes[axis].get_level_values(key)._values else: @@ -1759,7 +1765,9 @@ def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike # Check for duplicates if values.ndim > 1: - if other_axes and isinstance(self._get_axis(other_axes[0]), MultiIndex): + if first_other_axes is not None and isinstance( + self._get_axis(first_other_axes), MultiIndex + ): multi_message = ( "\n" "For a multi-index, the label must be a " @@ -1843,7 +1851,7 @@ def _drop_labels_or_levels(self, keys, axis: AxisInt = 0): else: # Drop the last level of Index by replacing with # a RangeIndex - dropped.columns = RangeIndex(dropped.columns.size) + dropped.columns = default_index(dropped.columns.size) # Handle dropping index labels if labels_to_drop: @@ -3200,7 +3208,7 @@ class (index) object 32B 'bird' 'bird' 'mammal' 'mammal' Dimensions: (date: 2, animal: 2) Coordinates: - * date (date) datetime64[ns] 2018-01-01 2018-01-02 + * date (date) datetime64[s] 2018-01-01 2018-01-02 * animal (animal) object 'falcon' 'parrot' Data variables: speed (date, animal) int64 350 18 361 15 @@ -4301,6 +4309,8 @@ def _check_copy_deprecation(copy): stacklevel=find_stack_level(), ) + # issue 58667 + @deprecate_kwarg("method", None) @final def reindex_like( self, @@ -4328,6 +4338,8 @@ def reindex_like( Please note: this is only applicable to DataFrames/Series with a monotonically increasing/decreasing index. + .. deprecated:: 3.0.0 + * None (default): don't fill gaps * pad / ffill: propagate last valid observation forward to next valid @@ -5527,7 +5539,7 @@ def head(self, n: int = 5) -> Self: it returns an empty object. When ``n`` is negative, it returns all rows except the last ``|n|`` rows, mirroring the behavior of ``df[:n]``. - If n is larger than the number of rows, this function returns all rows. + If ``n`` is larger than the number of rows, this function returns all rows. Parameters ---------- @@ -5615,7 +5627,7 @@ def tail(self, n: int = 5) -> Self: For negative values of `n`, this function returns all rows except the first `|n|` rows, equivalent to ``df[|n|:]``. - If n is larger than the number of rows, this function returns all rows. + If ``n`` is larger than the number of rows, this function returns all rows. 
Parameters ---------- @@ -6181,7 +6193,7 @@ def dtypes(self): >>> df.dtypes float float64 int int64 - datetime datetime64[ns] + datetime datetime64[s] string object dtype: object """ @@ -6360,7 +6372,7 @@ def astype( # TODO(EA2D): special case not needed with 2D EAs dtype = pandas_dtype(dtype) if isinstance(dtype, ExtensionDtype) and all( - arr.dtype == dtype for arr in self._mgr.arrays + block.values.dtype == dtype for block in self._mgr.blocks ): return self.copy(deep=False) # GH 18099/22869: columnwise conversion to extension dtype @@ -6622,7 +6634,7 @@ def convert_dtypes( dtype_backend: DtypeBackend = "numpy_nullable", ) -> Self: """ - Convert columns to the best possible dtypes using dtypes supporting ``pd.NA``. + Convert columns from numpy dtypes to the best dtypes that support ``pd.NA``. Parameters ---------- @@ -6639,13 +6651,13 @@ def convert_dtypes( If `convert_integer` is also True, preference will be give to integer dtypes if the floats can be faithfully casted to integers. dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' - Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + Back-end data type applied to the resultant :class:`DataFrame` or + :class:`Series` (still experimental). Behaviour is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). + or :class:`Series` (default). * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + DataFrame or Series. .. versionadded:: 2.0 @@ -9258,7 +9270,9 @@ def compare( # reorder axis to keep things organized indices = ( - np.arange(diff.shape[axis]).reshape([2, diff.shape[axis] // 2]).T.flatten() + np.arange(diff.shape[axis]) + .reshape([2, diff.shape[axis] // 2]) + .T.reshape(-1) ) diff = diff.take(indices, axis=axis) @@ -10052,7 +10066,7 @@ def shift( fill_value : object, optional The scalar value to use for newly introduced missing values. the default depends on the dtype of `self`. - For numeric data, ``np.nan`` is used. + For Boolean and numeric NumPy data types, ``np.nan`` is used. For datetime, timedelta, or period data, etc. :attr:`NaT` is used. For extension dtypes, ``self.dtype.na_value`` is used. suffix : str, optional @@ -10640,10 +10654,10 @@ def tz_localize( dates forward or backward with a timedelta object or `'shift_forward'` or `'shift_backward'`. - >>> s = pd.Series( - ... range(2), - ... index=pd.DatetimeIndex(["2015-03-29 02:30:00", "2015-03-29 03:30:00"]), + >>> dti = pd.DatetimeIndex( + ... ["2015-03-29 02:30:00", "2015-03-29 03:30:00"], dtype="M8[ns]" ... ) + >>> s = pd.Series(range(2), index=dti) >>> s.tz_localize("Europe/Warsaw", nonexistent="shift_forward") 2015-03-29 03:00:00+02:00 0 2015-03-29 03:30:00+02:00 1 @@ -11135,9 +11149,9 @@ def _logical_func( if ( self.ndim > 1 and axis == 1 - and len(self._mgr.arrays) > 1 + and len(self._mgr.blocks) > 1 # TODO(EA2D): special-case not needed - and all(x.ndim == 2 for x in self._mgr.arrays) + and all(block.values.ndim == 2 for block in self._mgr.blocks) and not kwargs ): # Fastpath avoiding potentially expensive transpose @@ -12558,7 +12572,7 @@ def make_doc(name: str, ndim: int) -> str: elif name == "median": base_doc = _num_doc desc = "Return the median of the values over the requested axis." 
- see_also = "" + see_also = _stat_func_see_also examples = """ Examples @@ -12599,7 +12613,7 @@ def make_doc(name: str, ndim: int) -> str: elif name == "mean": base_doc = _num_doc desc = "Return the mean of the values over the requested axis." - see_also = "" + see_also = _stat_func_see_also examples = """ Examples @@ -12747,6 +12761,7 @@ def make_doc(name: str, ndim: int) -> str: a 0.0 dtype: float64""" kwargs = {"min_count": ""} + elif name == "kurt": base_doc = _num_doc desc = ( diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 0a048d11d0b4d..eb334e0e57493 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -21,6 +21,7 @@ Union, cast, ) +import warnings import numpy as np @@ -32,6 +33,7 @@ Substitution, doc, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( ensure_int64, @@ -122,6 +124,10 @@ class NamedAgg(NamedTuple): Function to apply to the provided column. If string, the name of a built-in pandas function. + See Also + -------- + DataFrame.groupby : Group DataFrame using a mapper or by a Series of columns. + Examples -------- >>> df = pd.DataFrame({"key": [1, 1, 2], "a": [-1, 0, 1], 1: [10, 11, 12]}) @@ -387,7 +393,7 @@ def _aggregate_multiple_funcs(self, arg, *args, **kwargs) -> DataFrame: raise SpecificationError("nested renamer is not supported") if any(isinstance(x, (tuple, list)) for x in arg): - arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg] + arg = ((x, x) if not isinstance(x, (tuple, list)) else x for x in arg) else: # list of functions / function names columns = (com.get_callable_name(f) or f for f in arg) @@ -680,7 +686,8 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame: b 1 dtype: int64 """ - ids, ngroups = self._grouper.group_info + ids = self._grouper.ids + ngroups = self._grouper.ngroups val = self.obj._values codes, uniques = algorithms.factorize(val, use_na_sentinel=dropna, sort=False) @@ -1206,7 +1213,7 @@ def idxmin(self, skipna: bool = True) -> Series: >>> ser.groupby(["a", "a", "b", "b"]).idxmin() a 2023-01-01 b 2023-02-01 - dtype: datetime64[ns] + dtype: datetime64[s] """ return self._idxmax_idxmin("idxmin", skipna=skipna) @@ -1259,7 +1266,7 @@ def idxmax(self, skipna: bool = True) -> Series: >>> ser.groupby(["a", "a", "b", "b"]).idxmax() a 2023-01-15 b 2023-02-15 - dtype: datetime64[ns] + dtype: datetime64[s] """ return self._idxmax_idxmin("idxmax", skipna=skipna) @@ -2077,7 +2084,7 @@ def _apply_to_column_groupbys(self, func) -> DataFrame: obj = self._obj_with_exclusions columns = obj.columns - sgbs = [ + sgbs = ( SeriesGroupBy( obj.iloc[:, i], selection=colname, @@ -2086,7 +2093,7 @@ def _apply_to_column_groupbys(self, func) -> DataFrame: observed=self.observed, ) for i, colname in enumerate(obj.columns) - ] + ) results = [func(sgb) for sgb in sgbs] if not len(results): @@ -2726,6 +2733,8 @@ def corrwith( """ Compute pairwise correlation. + .. deprecated:: 3.0.0 + Pairwise correlation is computed between rows or columns of DataFrame with rows or columns of Series or DataFrame. 
DataFrames are first aligned along both axes before computing the @@ -2785,6 +2794,11 @@ def corrwith( 2 0.755929 NaN 3 0.576557 NaN """ + warnings.warn( + "DataFrameGroupBy.corrwith is deprecated", + FutureWarning, + stacklevel=find_stack_level(), + ) result = self._op_via_apply( "corrwith", other=other, diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f44ef8c4dbbfa..d45c891d6413b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -11,6 +11,7 @@ class providing the base-class of operations. from collections.abc import ( Hashable, + Iterable, Iterator, Mapping, Sequence, @@ -127,7 +128,6 @@ class providing the base-class of operations. from pandas.core.indexes.api import ( Index, MultiIndex, - RangeIndex, default_index, ) from pandas.core.internals.blocks import ensure_block_shape @@ -758,7 +758,7 @@ def get_converter(s): ) raise ValueError(msg) from err - converters = [get_converter(s) for s in index_sample] + converters = (get_converter(s) for s in index_sample) names = (tuple(f(n) for f, n in zip(converters, name)) for name in names) else: @@ -1263,7 +1263,7 @@ def _set_result_index_ordered( if self._grouper.has_dropped_na: # Add back in any missing rows due to dropna - index here is integral # with values referring to the row of the input so can use RangeIndex - result = result.reindex(RangeIndex(len(index)), axis=0) + result = result.reindex(default_index(len(index)), axis=0) result = result.set_axis(index, axis=0) return result @@ -1333,7 +1333,7 @@ def _wrap_aggregated_output( # enforced in __init__ result = self._insert_inaxis_grouper(result, qs=qs) result = result._consolidate() - result.index = RangeIndex(len(result)) + result.index = default_index(len(result)) else: index = self._grouper.result_index @@ -1359,7 +1359,7 @@ def _wrap_applied_output( @final def _numba_prep(self, data: DataFrame): - ids, ngroups = self._grouper.group_info + ngroups = self._grouper.ngroups sorted_index = self._grouper.result_ilocs sorted_ids = self._grouper._sorted_ids @@ -1875,24 +1875,40 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): else: # i.e. func in base.reduction_kernels + if self.observed: + return self._reduction_kernel_transform( + func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) - # GH#30918 Use _transform_fast only when we know func is an aggregation - # If func is a reduction, we need to broadcast the - # result to the whole group. Compute func result - # and deal with possible broadcasting below. - with com.temp_setattr(self, "as_index", True): - # GH#49834 - result needs groups in the index for - # _wrap_transform_fast_result - if func in ["idxmin", "idxmax"]: - func = cast(Literal["idxmin", "idxmax"], func) - result = self._idxmax_idxmin(func, True, *args, **kwargs) - else: - if engine is not None: - kwargs["engine"] = engine - kwargs["engine_kwargs"] = engine_kwargs - result = getattr(self, func)(*args, **kwargs) + with ( + com.temp_setattr(self, "observed", True), + com.temp_setattr(self, "_grouper", self._grouper.observed_grouper), + ): + return self._reduction_kernel_transform( + func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) + + @final + def _reduction_kernel_transform( + self, func, *args, engine=None, engine_kwargs=None, **kwargs + ): + # GH#30918 Use _transform_fast only when we know func is an aggregation + # If func is a reduction, we need to broadcast the + # result to the whole group. 
Compute func result + # and deal with possible broadcasting below. + with com.temp_setattr(self, "as_index", True): + # GH#49834 - result needs groups in the index for + # _wrap_transform_fast_result + if func in ["idxmin", "idxmax"]: + func = cast(Literal["idxmin", "idxmax"], func) + result = self._idxmax_idxmin(func, True, *args, **kwargs) + else: + if engine is not None: + kwargs["engine"] = engine + kwargs["engine_kwargs"] = engine_kwargs + result = getattr(self, func)(*args, **kwargs) - return self._wrap_transform_fast_result(result) + return self._wrap_transform_fast_result(result) @final def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT: @@ -1952,7 +1968,8 @@ def _cumcount_array(self, ascending: bool = True) -> np.ndarray: this is currently implementing sort=False (though the default is sort=True) for groupby in general """ - ids, ngroups = self._grouper.group_info + ids = self._grouper.ids + ngroups = self._grouper.ngroups sorter = get_group_index_sorter(ids, ngroups) ids, count = ids[sorter], len(ids) @@ -2168,7 +2185,8 @@ def count(self) -> NDFrameT: Freq: MS, dtype: int64 """ data = self._get_data_to_aggregate() - ids, ngroups = self._grouper.group_info + ids = self._grouper.ids + ngroups = self._grouper.ngroups mask = ids != -1 is_series = data.ndim == 1 @@ -2629,7 +2647,7 @@ def _value_counts( } if isinstance(obj, Series): _name = obj.name - keys = [] if _name in in_axis_names else [obj] + keys: Iterable[Series] = [] if _name in in_axis_names else [obj] else: unique_cols = set(obj.columns) if subset is not None: @@ -2649,12 +2667,12 @@ def _value_counts( else: subsetted = unique_cols - keys = [ + keys = ( # Can't use .values because the column label needs to be preserved obj.iloc[:, idx] for idx, _name in enumerate(obj.columns) if _name not in in_axis_names and _name in subsetted - ] + ) groupings = list(self._grouper.groupings) for key in keys: @@ -3823,7 +3841,8 @@ def _fill(self, direction: Literal["ffill", "bfill"], limit: int | None = None): if limit is None: limit = -1 - ids, ngroups = self._grouper.group_info + ids = self._grouper.ids + ngroups = self._grouper.ngroups col_func = partial( libgroupby.group_fillna_indexer, @@ -4344,7 +4363,8 @@ def post_processor( qs = np.array([q], dtype=np.float64) pass_qs = None - ids, ngroups = self._grouper.group_info + ids = self._grouper.ids + ngroups = self._grouper.ngroups if self.dropna: # splitter drops NA groups, we need to do the same ids = ids[ids >= 0] @@ -5021,7 +5041,8 @@ def shift( else: if fill_value is lib.no_default: fill_value = None - ids, ngroups = self._grouper.group_info + ids = self._grouper.ids + ngroups = self._grouper.ngroups res_indexer = np.zeros(len(ids), dtype=np.int64) libgroupby.group_shift_indexer(res_indexer, ids, ngroups, period) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 2d10bd5d00eb2..5f680de77649f 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -34,6 +34,7 @@ from pandas.core.indexes.api import ( Index, MultiIndex, + default_index, ) from pandas.core.series import Series @@ -668,6 +669,28 @@ def groups(self) -> dict[Hashable, Index]: cats = Categorical.from_codes(codes, uniques, validate=False) return self._index.groupby(cats) + @property + def observed_grouping(self) -> Grouping: + if self._observed: + return self + + return self._observed_grouping + + @cache_readonly + def _observed_grouping(self) -> Grouping: + grouping = Grouping( + self._index, + self._orig_grouper, + obj=self.obj, + level=self.level, 
+ sort=self._sort, + observed=True, + in_axis=self.in_axis, + dropna=self._dropna, + uniques=self._uniques, + ) + return grouping + def get_grouper( obj: NDFrameT, @@ -879,7 +902,7 @@ def is_in_obj(gpr) -> bool: if len(groupings) == 0 and len(obj): raise ValueError("No group keys passed!") if len(groupings) == 0: - groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp))) + groupings.append(Grouping(default_index(0), np.array([], dtype=np.intp))) # create the internals grouper grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, dropna=dropna) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index effa94b1606bd..58c27d80ea99a 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -73,7 +73,6 @@ Generator, Hashable, Iterator, - Sequence, ) from pandas.core.generic import NDFrame @@ -581,14 +580,14 @@ class BaseGrouper: def __init__( self, axis: Index, - groupings: Sequence[grouper.Grouping], + groupings: list[grouper.Grouping], sort: bool = True, dropna: bool = True, ) -> None: assert isinstance(axis, Index), axis self.axis = axis - self._groupings: list[grouper.Grouping] = list(groupings) + self._groupings = groupings self._sort = sort self.dropna = dropna @@ -596,10 +595,6 @@ def __init__( def groupings(self) -> list[grouper.Grouping]: return self._groupings - @property - def shape(self) -> Shape: - return tuple(ping.ngroups for ping in self.groupings) - def __iter__(self) -> Iterator[Hashable]: return iter(self.indices) @@ -628,11 +623,15 @@ def _get_splitter(self, data: NDFrame) -> DataSplitter: ------- Generator yielding subsetted objects """ - ids, ngroups = self.group_info - return _get_splitter( + if isinstance(data, Series): + klass: type[DataSplitter] = SeriesSplitter + else: + # i.e. DataFrame + klass = FrameSplitter + + return klass( data, - ids, - ngroups, + self.ngroups, sorted_ids=self._sorted_ids, sort_idx=self.result_ilocs, ) @@ -692,7 +691,8 @@ def size(self) -> Series: """ Compute group sizes. 
""" - ids, ngroups = self.group_info + ids = self.ids + ngroups = self.ngroups out: np.ndarray | list if ngroups: out = np.bincount(ids[ids != -1], minlength=ngroups) @@ -729,12 +729,6 @@ def has_dropped_na(self) -> bool: """ return bool((self.ids < 0).any()) - @cache_readonly - def group_info(self) -> tuple[npt.NDArray[np.intp], int]: - result_index, ids = self.result_index_and_ids - ngroups = len(result_index) - return ids, ngroups - @cache_readonly def codes_info(self) -> npt.NDArray[np.intp]: # return the codes of items in original grouped axis @@ -823,6 +817,19 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]: return result_index, ids + @property + def observed_grouper(self) -> BaseGrouper: + if all(ping._observed for ping in self.groupings): + return self + + return self._observed_grouper + + @cache_readonly + def _observed_grouper(self) -> BaseGrouper: + groupings = [ping.observed_grouping for ping in self.groupings] + grouper = BaseGrouper(self.axis, groupings, sort=self._sort, dropna=self.dropna) + return grouper + def _ob_index_and_ids( self, levels: list[Index], @@ -1110,10 +1117,6 @@ def indices(self): i = bin return indices - @cache_readonly - def group_info(self) -> tuple[npt.NDArray[np.intp], int]: - return self.ids, self.ngroups - @cache_readonly def codes(self) -> list[npt.NDArray[np.intp]]: return [self.ids] @@ -1154,6 +1157,10 @@ def groupings(self) -> list[grouper.Grouping]: ) return [ping] + @property + def observed_grouper(self) -> BinGrouper: + return self + def _is_indexed_like(obj, axes) -> bool: if isinstance(obj, Series): @@ -1174,29 +1181,25 @@ class DataSplitter(Generic[NDFrameT]): def __init__( self, data: NDFrameT, - labels: npt.NDArray[np.intp], ngroups: int, *, sort_idx: npt.NDArray[np.intp], sorted_ids: npt.NDArray[np.intp], ) -> None: self.data = data - self.labels = ensure_platform_int(labels) # _should_ already be np.intp self.ngroups = ngroups self._slabels = sorted_ids self._sort_idx = sort_idx def __iter__(self) -> Iterator: - sdata = self._sorted_data - if self.ngroups == 0: # we are inside a generator, rather than raise StopIteration # we merely return signal the end return starts, ends = lib.generate_slices(self._slabels, self.ngroups) - + sdata = self._sorted_data for start, end in zip(starts, ends): yield self._chop(sdata, slice(start, end)) @@ -1224,20 +1227,3 @@ def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: mgr = sdata._mgr.get_slice(slice_obj, axis=1) df = sdata._constructor_from_mgr(mgr, axes=mgr.axes) return df.__finalize__(sdata, method="groupby") - - -def _get_splitter( - data: NDFrame, - labels: npt.NDArray[np.intp], - ngroups: int, - *, - sort_idx: npt.NDArray[np.intp], - sorted_ids: npt.NDArray[np.intp], -) -> DataSplitter: - if isinstance(data, Series): - klass: type[DataSplitter] = SeriesSplitter - else: - # i.e. DataFrame - klass = FrameSplitter - - return klass(data, labels, ngroups, sort_idx=sort_idx, sorted_ids=sorted_ids) diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py index d108f840a1b4f..083e86500a210 100644 --- a/pandas/core/indexers/objects.py +++ b/pandas/core/indexers/objects.py @@ -48,6 +48,25 @@ class BaseIndexer: """ Base class for window bounds calculations. + Parameters + ---------- + index_array : np.ndarray, default None + Array-like structure representing the indices for the data points. + If None, the default indices are assumed. 
This can be useful for + handling non-uniform indices in data, such as in time series + with irregular timestamps. + window_size : int, default 0 + Size of the moving window. This is the number of observations used + for calculating the statistic. The default is to consider all + observations within the window. + **kwargs + Additional keyword arguments passed to the subclass's methods. + + See Also + -------- + DataFrame.rolling : Provides rolling window calculations on dataframe. + Series.rolling : Provides rolling window calculations on series. + Examples -------- >>> from pandas.api.indexers import BaseIndexer @@ -296,6 +315,26 @@ class FixedForwardWindowIndexer(BaseIndexer): """ Creates window boundaries for fixed-length windows that include the current row. + Parameters + ---------- + index_array : np.ndarray, default None + Array-like structure representing the indices for the data points. + If None, the default indices are assumed. This can be useful for + handling non-uniform indices in data, such as in time series + with irregular timestamps. + window_size : int, default 0 + Size of the moving window. This is the number of observations used + for calculating the statistic. The default is to consider all + observations within the window. + **kwargs + Additional keyword arguments passed to the subclass's methods. + + See Also + -------- + DataFrame.rolling : Provides rolling window calculations. + api.indexers.VariableWindowIndexer : Calculate window bounds based on + variable-sized windows. + Examples -------- >>> df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]}) diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 3dcd1fedc8d64..3cb51f7447677 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -489,10 +489,20 @@ def components(self) -> DataFrame: """ Return a Dataframe of the components of the Timedeltas. + Each row of the DataFrame corresponds to a Timedelta in the original + Series and contains the individual components (days, hours, minutes, + seconds, milliseconds, microseconds, nanoseconds) of the Timedelta. + Returns ------- DataFrame + See Also + -------- + TimedeltaIndex.components : Return a DataFrame of the individual resolution + components of the Timedeltas. + Series.dt.total_seconds : Return the total number of seconds in the duration. + Examples -------- >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit="s")) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index c5e3f3a50e10d..5144e647e73b4 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -130,7 +130,7 @@ def _get_combined_index( # TODO: handle index names! 
indexes = _get_distinct_objs(indexes) if len(indexes) == 0: - index = Index([]) + index: Index = default_index(0) elif len(indexes) == 1: index = indexes[0] elif intersect: @@ -212,20 +212,25 @@ def union_indexes(indexes, sort: bool | None = True) -> Index: if kind == "special": result = indexes[0] - dtis = [x for x in indexes if isinstance(x, DatetimeIndex)] - dti_tzs = [x for x in dtis if x.tz is not None] - if len(dti_tzs) not in [0, len(dtis)]: + num_dtis = 0 + num_dti_tzs = 0 + for idx in indexes: + if isinstance(idx, DatetimeIndex): + num_dtis += 1 + if idx.tz is not None: + num_dti_tzs += 1 + if num_dti_tzs not in [0, num_dtis]: # TODO: this behavior is not tested (so may not be desired), # but is kept in order to keep behavior the same when # deprecating union_many # test_frame_from_dict_with_mixed_indexes raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") - if len(dtis) == len(indexes): + if num_dtis == len(indexes): sort = True result = indexes[0] - elif len(dtis) > 1: + elif num_dtis > 1: # If we have mixed timezones, our casting behavior may depend on # the order of indexes, which we don't want. sort = False diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 048362a28dfd7..15c318e5e9caf 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -142,7 +142,7 @@ nanops, ops, ) -from pandas.core.accessor import CachedAccessor +from pandas.core.accessor import Accessor import pandas.core.algorithms as algos from pandas.core.array_algos.putmask import ( setitem_datetimelike_compat, @@ -326,6 +326,8 @@ class Index(IndexOpsMixin, PandasObject): Parameters ---------- data : array-like (1-dimensional) + An array-like structure containing the data for the index. This could be a + Python list, a NumPy array, or a pandas Series. dtype : str, numpy.dtype, or ExtensionDtype, optional Data type for the output Index. If not specified, this will be inferred from `data`. @@ -460,7 +462,7 @@ def _engine_type( _accessors = {"str"} - str = CachedAccessor("str", StringMethods) + str = Accessor("str", StringMethods) _references = None @@ -488,8 +490,6 @@ def __new__( if not copy and isinstance(data, (ABCSeries, Index)): refs = data._references - is_pandas_object = isinstance(data, (ABCSeries, Index, ExtensionArray)) - # range if isinstance(data, (range, RangeIndex)): result = RangeIndex(start=data, copy=copy, name=name) @@ -506,7 +506,7 @@ def __new__( elif is_ea_or_datetimelike_dtype(data_dtype): pass - elif isinstance(data, (np.ndarray, Index, ABCSeries)): + elif isinstance(data, (np.ndarray, ABCMultiIndex)): if isinstance(data, ABCMultiIndex): data = data._values @@ -516,7 +516,9 @@ def __new__( # they are actually ints, e.g. '0' and 0.0 # should not be coerced data = com.asarray_tuplesafe(data, dtype=_dtype_obj) - + elif isinstance(data, (ABCSeries, Index)): + # GH 56244: Avoid potential inference on object types + pass elif is_scalar(data): raise cls._raise_scalar_data_error(data) elif hasattr(data, "__array__"): @@ -569,19 +571,7 @@ def __new__( klass = cls._dtype_to_subclass(arr.dtype) arr = klass._ensure_array(arr, arr.dtype, copy=False) - result = klass._simple_new(arr, name, refs=refs) - if dtype is None and is_pandas_object and data_dtype == np.object_: - if result.dtype != data_dtype: - warnings.warn( - "Dtype inference on a pandas object " - "(Series, Index, ExtensionArray) is deprecated. The Index " - "constructor will keep the original dtype in the future. 
" - "Call `infer_objects` on the result to get the old " - "behavior.", - FutureWarning, - stacklevel=2, - ) - return result # type: ignore[return-value] + return klass._simple_new(arr, name, refs=refs) @classmethod def _ensure_array(cls, data, dtype, copy: bool): @@ -996,9 +986,16 @@ def ravel(self, order: str_t = "C") -> Self: """ Return a view on self. + Parameters + ---------- + order : {'K', 'A', 'C', 'F'}, default 'C' + Specify the memory layout of the view. This parameter is not + implemented currently. + Returns ------- Index + A view on self. See Also -------- @@ -1014,7 +1011,13 @@ def ravel(self, order: str_t = "C") -> Self: def view(self, cls=None): """ - Return a view on self. + Return a view of the Index with the specified dtype or a new Index instance. + + This method returns a view of the calling Index object if no arguments are + provided. If a dtype is specified through the `cls` argument, it attempts + to return a view of the Index with the specified dtype. Note that viewing + the Index as a different dtype reinterprets the underlying data, which can + lead to unexpected results for non-numeric or incompatible dtype conversions. Parameters ---------- @@ -1027,27 +1030,38 @@ def view(self, cls=None): Returns ------- - numpy.ndarray - A new view of the same data in memory. + Index or ndarray + A view of the Index. If `cls` is None, the returned object is an Index + view with the same dtype as the calling object. If a numeric `cls` is + specified an ndarray view with the new dtype is returned. + + Raises + ------ + ValueError + If attempting to change to a dtype in a way that is not compatible with + the original dtype's memory layout, for example, viewing an 'int64' Index + as 'str'. See Also -------- + Index.copy : Returns a copy of the Index. numpy.ndarray.view : Returns a new view of array with the same data. Examples -------- - >>> s = pd.Series([1, 2, 3], index=["1", "2", "3"]) - >>> s.index.view("object") - array(['1', '2', '3'], dtype=object) + >>> idx = pd.Index([-1, 0, 1]) + >>> idx.view() + Index([-1, 0, 1], dtype='int64') - >>> s = pd.Series([1, 2, 3], index=[-1, 0, 1]) - >>> s.index.view(np.int64) - array([-1, 0, 1]) - >>> s.index.view(np.float32) - array([ nan, nan, 0.e+00, 0.e+00, 1.e-45, 0.e+00], dtype=float32) - >>> s.index.view(np.uint64) + >>> idx.view(np.uint64) array([18446744073709551615, 0, 1], dtype=uint64) + + Viewing as 'int32' or 'float32' reinterprets the memory, which may lead to + unexpected behavior: + + >>> idx.view("float32") + array([ nan, nan, 0.e+00, 0.e+00, 1.e-45, 0.e+00], dtype=float32) """ # we need to see if we are subclassing an # index type here @@ -1799,6 +1813,38 @@ def _get_default_index_names( return names def _get_names(self) -> FrozenList: + """ + Get names on index. + + This method returns a FrozenList containing the names of the object. + It's primarily intended for internal use. + + Returns + ------- + FrozenList + A FrozenList containing the object's names, contains None if the object + does not have a name. + + See Also + -------- + Index.name : Index name as a string, or None for MultiIndex. 
+ + Examples + -------- + >>> idx = pd.Index([1, 2, 3], name="x") + >>> idx.names + FrozenList(['x']) + + >>> idx = pd.Index([1, 2, 3], name=("x", "y")) + >>> idx.names + FrozenList([('x', 'y')]) + + If the index does not have a name set: + + >>> idx = pd.Index([1, 2, 3]) + >>> idx.names + FrozenList([None]) + """ return FrozenList((self.name,)) def _set_names(self, values, *, level=None) -> None: @@ -2576,7 +2622,7 @@ def isna(self) -> npt.NDArray[np.bool_]: ... ) >>> idx DatetimeIndex(['1940-04-25', 'NaT', 'NaT', 'NaT'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) >>> idx.isna() array([False, True, True, True]) """ @@ -3082,7 +3128,7 @@ def _union(self, other: Index, sort: bool | None): # worth making this faster? a very unusual case value_set = set(lvals) - value_list.extend([x for x in rvals if x not in value_set]) + value_list.extend(x for x in rvals if x not in value_set) # If objects are unorderable, we must have object dtype. return np.array(value_list, dtype=object) @@ -3490,10 +3536,22 @@ def get_loc(self, key): Parameters ---------- key : label + The label for which to find the location in the index. Returns ------- int if unique index, slice if monotonic index, else mask + Integer location, slice or boolean mask. + + See Also + -------- + Index.get_slice_bound : Calculate slice bound that corresponds to + given label. + Index.get_indexer : Computes indexer and mask for new index given + the current index. + Index.get_indexer_non_unique : Returns indexer and masks for new index given + the current index. + Index.get_indexer_for : Returns an indexer even when non-unique. Examples -------- @@ -3935,25 +3993,6 @@ def _convert_slice_indexer(self, key: slice, kind: Literal["loc", "getitem"]): # TODO(GH#50617): once Series.__[gs]etitem__ is removed we should be able # to simplify this. - if lib.is_np_dtype(self.dtype, "f"): - # We always treat __getitem__ slicing as label-based - # translate to locations - if kind == "getitem" and is_index_slice and not start == stop and step != 0: - # exclude step=0 from the warning because it will raise anyway - # start/stop both None e.g. [:] or [::-1] won't change. - # exclude start==stop since it will be empty either way, or - # will be [:] or [::-1] which won't change - warnings.warn( - # GH#49612 - "The behavior of obj[i:j] with a float-dtype index is " - "deprecated. In a future version, this will be treated as " - "positional instead of label-based. For label-based slicing, " - "use obj.loc[i:j] instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self.slice_indexer(start, stop, step) - if kind == "getitem": # called from the getitem slicers, validate that we are in fact integers if is_index_slice: @@ -4287,9 +4326,12 @@ def join( Parameters ---------- other : Index + The other index on which join is performed. how : {'left', 'right', 'inner', 'outer'} level : int or level name, default None + It is either the integer position or the name of the level. return_indexers : bool, default False + Whether to return the indexers or not for both the index objects. sort : bool, default False Sort the join keys lexicographically in the result Index. If False, the order of the join keys depends on the join type (how keyword). @@ -4297,6 +4339,14 @@ def join( Returns ------- join_index, (left_indexer, right_indexer) + The new index. + + See Also + -------- + DataFrame.join : Join columns with `other` DataFrame either on index + or on a key. 
+ DataFrame.merge : Merge DataFrame or named Series objects with a + database-style join. Examples -------- @@ -4304,6 +4354,9 @@ def join( >>> idx2 = pd.Index([4, 5, 6]) >>> idx1.join(idx2, how="outer") Index([1, 2, 3, 4, 5, 6], dtype='int64') + >>> idx1.join(other=idx2, how="outer", return_indexers=True) + (Index([1, 2, 3, 4, 5, 6], dtype='int64'), + array([ 0, 1, 2, -1, -1, -1]), array([-1, -1, -1, 0, 1, 2])) """ other = ensure_index(other) sort = sort or how == "outer" @@ -7555,8 +7608,8 @@ def get_unanimous_names(*indexes: Index) -> tuple[Hashable, ...]: list A list representing the unanimous 'names' found. """ - name_tups = [tuple(i.names) for i in indexes] - name_sets = [{*ns} for ns in zip_longest(*name_tups)] + name_tups = (tuple(i.names) for i in indexes) + name_sets = ({*ns} for ns in zip_longest(*name_tups)) names = tuple(ns.pop() if len(ns) == 1 else None for ns in name_sets) return names diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 78f04f57029b1..c276750314a34 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -242,7 +242,7 @@ class DatetimeIndex(DatetimeTimedeltaMixin): >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> idx DatetimeIndex(['2020-01-01 10:00:00+00:00', '2020-02-01 11:00:00+00:00'], - dtype='datetime64[ns, UTC]', freq=None) + dtype='datetime64[s, UTC]', freq=None) """ _typ = "datetimeindex" @@ -473,7 +473,8 @@ def snap(self, freq: Frequency = "S") -> DatetimeIndex: Examples -------- >>> idx = pd.DatetimeIndex( - ... ["2023-01-01", "2023-01-02", "2023-02-01", "2023-02-02"] + ... ["2023-01-01", "2023-01-02", "2023-02-01", "2023-02-02"], + ... dtype="M8[ns]", ... ) >>> idx DatetimeIndex(['2023-01-01', '2023-01-02', '2023-02-01', '2023-02-02'], @@ -1069,6 +1070,13 @@ def bdate_range( Returns ------- DatetimeIndex + Fixed frequency DatetimeIndex. + + See Also + -------- + date_range : Return a fixed frequency DatetimeIndex. + period_range : Return a fixed frequency PeriodIndex. + timedelta_range : Return a fixed frequency TimedeltaIndex. Notes ----- diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 36f181110eccd..359cdf880937b 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -621,13 +621,27 @@ def get_loc(self, key) -> int | slice | np.ndarray: """ Get integer location, slice or boolean mask for requested label. + The `get_loc` method is used to retrieve the integer index, a slice for + slicing objects, or a boolean mask indicating the presence of the label + in the `IntervalIndex`. + Parameters ---------- key : label + The value or range to find in the IntervalIndex. Returns ------- int if unique index, slice if monotonic index, else mask + The position or positions found. This could be a single + number, a range, or an array of true/false values + indicating the position(s) of the label. + + See Also + -------- + IntervalIndex.get_indexer_non_unique : Compute indexer and + mask for new index given the current index. + Index.get_loc : Similar method in the base Index class. Examples -------- @@ -828,18 +842,149 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: @cache_readonly def left(self) -> Index: + """ + Return left bounds of the intervals in the IntervalIndex. + + The left bounds of each interval in the IntervalIndex are + returned as an Index. The datatype of the left bounds is the + same as the datatype of the endpoints of the intervals. 
+ + Returns + ------- + Index + An Index containing the left bounds of the intervals. + + See Also + -------- + IntervalIndex.right : Return the right bounds of the intervals + in the IntervalIndex. + IntervalIndex.mid : Return the mid-point of the intervals in + the IntervalIndex. + IntervalIndex.length : Return the length of the intervals in + the IntervalIndex. + + Examples + -------- + >>> iv_idx = pd.IntervalIndex.from_arrays([1, 2, 3], [4, 5, 6], closed="right") + >>> iv_idx.left + Index([1, 2, 3], dtype='int64') + + >>> iv_idx = pd.IntervalIndex.from_tuples( + ... [(1, 4), (2, 5), (3, 6)], closed="left" + ... ) + >>> iv_idx.left + Index([1, 2, 3], dtype='int64') + """ return Index(self._data.left, copy=False) @cache_readonly def right(self) -> Index: + """ + Return right bounds of the intervals in the IntervalIndex. + + The right bounds of each interval in the IntervalIndex are + returned as an Index. The datatype of the right bounds is the + same as the datatype of the endpoints of the intervals. + + Returns + ------- + Index + An Index containing the right bounds of the intervals. + + See Also + -------- + IntervalIndex.left : Return the left bounds of the intervals + in the IntervalIndex. + IntervalIndex.mid : Return the mid-point of the intervals in + the IntervalIndex. + IntervalIndex.length : Return the length of the intervals in + the IntervalIndex. + + Examples + -------- + >>> iv_idx = pd.IntervalIndex.from_arrays([1, 2, 3], [4, 5, 6], closed="right") + >>> iv_idx.right + Index([4, 5, 6], dtype='int64') + + >>> iv_idx = pd.IntervalIndex.from_tuples( + ... [(1, 4), (2, 5), (3, 6)], closed="left" + ... ) + >>> iv_idx.right + Index([4, 5, 6], dtype='int64') + """ return Index(self._data.right, copy=False) @cache_readonly def mid(self) -> Index: + """ + Return the midpoint of each interval in the IntervalIndex as an Index. + + Each midpoint is calculated as the average of the left and right bounds + of each interval. The midpoints are returned as a pandas Index object. + + Returns + ------- + pandas.Index + An Index containing the midpoints of each interval. + + See Also + -------- + IntervalIndex.left : Return the left bounds of the intervals + in the IntervalIndex. + IntervalIndex.right : Return the right bounds of the intervals + in the IntervalIndex. + IntervalIndex.length : Return the length of the intervals in + the IntervalIndex. + + Notes + ----- + The midpoint is the average of the interval bounds, potentially resulting + in a floating-point number even if bounds are integers. The returned Index + will have a dtype that accurately holds the midpoints. This computation is + the same regardless of whether intervals are open or closed. + + Examples + -------- + >>> iv_idx = pd.IntervalIndex.from_arrays([1, 2, 3], [4, 5, 6]) + >>> iv_idx.mid + Index([2.5, 3.5, 4.5], dtype='float64') + + >>> iv_idx = pd.IntervalIndex.from_tuples([(1, 4), (2, 5), (3, 6)]) + >>> iv_idx.mid + Index([2.5, 3.5, 4.5], dtype='float64') + """ return Index(self._data.mid, copy=False) @property def length(self) -> Index: + """ + Calculate the length of each interval in the IntervalIndex. + + This method returns a new Index containing the lengths of each interval + in the IntervalIndex. The length of an interval is defined as the difference + between its end and its start. + + Returns + ------- + Index + An Index containing the lengths of each interval. + + See Also + -------- + Interval.length : Return the length of the Interval. 
+ + Examples + -------- + >>> intervals = pd.IntervalIndex.from_arrays( + ... [1, 2, 3], [4, 5, 6], closed="right" + ... ) + >>> intervals.length + Index([3, 3, 3], dtype='int64') + + >>> intervals = pd.IntervalIndex.from_tuples([(1, 5), (6, 10), (11, 15)]) + >>> intervals.length + Index([4, 4, 4], dtype='int64') + """ return Index(self._data.length, copy=False) # -------------------------------------------------------------------- @@ -993,6 +1138,7 @@ def interval_range( Returns ------- IntervalIndex + Object with a fixed frequency. See Also -------- diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index a5bcf49c5490b..63908ada0c73e 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -209,8 +209,12 @@ class MultiIndex(Index): level). names : optional sequence of objects Names for each of the index levels. (name is accepted for compat). + dtype : Numpy dtype or pandas type, optional + Data type for the MultiIndex. copy : bool, default False Copy the meta-data. + name : Label + Kept for compatibility with 1-dimensional Index. Should not be used. verify_integrity : bool, default True Check that the levels/codes are consistent and valid. @@ -771,6 +775,11 @@ def dtypes(self) -> Series: """ Return the dtypes as a Series for the underlying MultiIndex. + See Also + -------- + Index.dtype : Return the dtype object of the underlying data. + Series.dtypes : Return the data type of the underlying Series. + Examples -------- >>> idx = pd.MultiIndex.from_product( @@ -826,6 +835,12 @@ def levels(self) -> FrozenList: it filters out all rows of the level C, MultiIndex.levels will still return A, B, C. + See Also + -------- + MultiIndex.codes : The codes of the levels in the MultiIndex. + MultiIndex.get_level_values : Return vector of label values for requested + level. + Examples -------- >>> index = pd.MultiIndex.from_product( @@ -1016,6 +1031,13 @@ def nlevels(self) -> int: """ Integer number of levels in this MultiIndex. + See Also + -------- + MultiIndex.levels : Get the levels of the MultiIndex. + MultiIndex.codes : Get the codes of the MultiIndex. + MultiIndex.from_arrays : Convert arrays to MultiIndex. + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + Examples -------- >>> mi = pd.MultiIndex.from_arrays([["a"], ["b"], ["c"]]) @@ -1134,6 +1156,12 @@ def set_codes( new index (of same type and class...etc) or None The same type as the caller or None if ``inplace=True``. + See Also + -------- + MultiIndex.set_levels : Set new levels on MultiIndex. + MultiIndex.codes : Get the codes of the levels in the MultiIndex. + MultiIndex.levels : Get the levels of the MultiIndex. + Examples -------- >>> idx = pd.MultiIndex.from_tuples( @@ -1387,7 +1415,7 @@ def _formatter_func(self, tup): """ Formats each item in tup according to its level's formatter function. 
""" - formatter_funcs = [level._formatter_func for level in self.levels] + formatter_funcs = (level._formatter_func for level in self.levels) return tuple(func(val) for func, val in zip(formatter_funcs, tup)) def _get_values_for_csv( @@ -1537,7 +1565,7 @@ def _set_names(self, names, *, level=None, validate: bool = True) -> None: if level is None: level = range(self.nlevels) else: - level = [self._get_level_number(lev) for lev in level] + level = (self._get_level_number(lev) for lev in level) # set the name for lev, name in zip(level, names): @@ -1656,7 +1684,7 @@ def duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]: # (previously declared in base class "IndexOpsMixin") _duplicated = duplicated # type: ignore[misc] - def fillna(self, value, downcast=None): + def fillna(self, value): """ fillna is not implemented for MultiIndex """ @@ -3590,6 +3618,11 @@ def truncate(self, before=None, after=None) -> MultiIndex: MultiIndex The truncated MultiIndex. + See Also + -------- + DataFrame.truncate : Truncate a DataFrame before and after some index values. + Series.truncate : Truncate a Series before and after some index values. + Examples -------- >>> mi = pd.MultiIndex.from_arrays([["a", "b", "c"], ["x", "y", "z"]]) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 8af5a56f43c57..29039ffd0217e 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -273,6 +273,11 @@ def timedelta_range( TimedeltaIndex Fixed frequency, with day as the default. + See Also + -------- + date_range : Return a fixed frequency DatetimeIndex. + period_range : Return a fixed frequency PeriodIndex. + Notes ----- Of the four parameters ``start``, ``end``, ``periods``, and ``freq``, diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index b3ae53272cae4..8d1239ff71174 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -159,7 +159,7 @@ def iloc(self) -> _iLocIndexer: """ Purely integer-location based indexing for selection by position. - .. deprecated:: 2.2.0 + .. versionchanged:: 3.0 Returning a tuple from a callable is deprecated. 
@@ -905,7 +905,7 @@ def __setitem__(self, key, value) -> None: key = tuple(com.apply_if_callable(x, self.obj) for x in key) else: maybe_callable = com.apply_if_callable(key, self.obj) - key = self._check_deprecated_callable_usage(key, maybe_callable) + key = self._raise_callable_usage(key, maybe_callable) indexer = self._get_setitem_indexer(key) self._has_valid_setitem_indexer(key) @@ -1164,14 +1164,11 @@ def _contains_slice(x: object) -> bool: def _convert_to_indexer(self, key, axis: AxisInt): raise AbstractMethodError(self) - def _check_deprecated_callable_usage(self, key: Any, maybe_callable: T) -> T: + def _raise_callable_usage(self, key: Any, maybe_callable: T) -> T: # GH53533 if self.name == "iloc" and callable(key) and isinstance(maybe_callable, tuple): - warnings.warn( - "Returning a tuple from a callable with iloc " - "is deprecated and will be removed in a future version", - FutureWarning, - stacklevel=find_stack_level(), + raise ValueError( + "Returning a tuple from a callable with iloc is not allowed.", ) return maybe_callable @@ -1189,7 +1186,7 @@ def __getitem__(self, key): axis = self.axis or 0 maybe_callable = com.apply_if_callable(key, self.obj) - maybe_callable = self._check_deprecated_callable_usage(key, maybe_callable) + maybe_callable = self._raise_callable_usage(key, maybe_callable) return self._getitem_axis(maybe_callable, axis=axis) def _is_scalar_access(self, key: tuple): @@ -1807,10 +1804,10 @@ def _setitem_with_indexer(self, indexer, value, name: str = "iloc") -> None: # if there is only one block/type, still have to take split path # unless the block is one-dimensional or it can hold the value - if not take_split_path and len(self.obj._mgr.arrays) and self.ndim > 1: + if not take_split_path and len(self.obj._mgr.blocks) and self.ndim > 1: # in case of dict, keys are indices val = list(value.values()) if isinstance(value, dict) else value - arr = self.obj._mgr.arrays[0] + arr = self.obj._mgr.blocks[0].values take_split_path = not can_hold_element( arr, extract_array(val, extract_numpy=True) ) @@ -2375,8 +2372,7 @@ def ravel(i): # we have a frame, with multiple indexers on both axes; and a # series, so need to broadcast (see GH5206) if sum_aligners == self.ndim and all(is_sequence(_) for _ in indexer): - # TODO(CoW): copy shouldn't be needed here - ser_values = ser.reindex(obj.axes[0][indexer[0]]).copy()._values + ser_values = ser.reindex(obj.axes[0][indexer[0]])._values # single indexer if len(indexer) > 1 and not multiindex_indexer: @@ -2444,7 +2440,7 @@ def _align_frame(self, indexer, df: DataFrame) -> DataFrame: ax = self.obj.axes[i] if is_sequence(ix) or isinstance(ix, slice): if isinstance(ix, np.ndarray): - ix = ix.ravel() + ix = ix.reshape(-1) if idx is None: idx = ax[ix] elif cols is None: diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index 89c8a4a27ca31..45758379e0bd6 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -6,58 +6,8 @@ ) __all__ = [ - "Block", - "DatetimeTZBlock", - "ExtensionBlock", "make_block", "BlockManager", "SingleBlockManager", "concatenate_managers", ] - - -def __getattr__(name: str): - # GH#55139 - import warnings - - if name == "create_block_manager_from_blocks": - # GH#33892 - warnings.warn( - f"{name} is deprecated and will be removed in a future version. 
" - "Use public APIs instead.", - DeprecationWarning, - # https://github.com/pandas-dev/pandas/pull/55139#pullrequestreview-1720690758 - # on hard-coding stacklevel - stacklevel=2, - ) - from pandas.core.internals.managers import create_block_manager_from_blocks - - return create_block_manager_from_blocks - - if name in [ - "Block", - "ExtensionBlock", - "DatetimeTZBlock", - ]: - warnings.warn( - f"{name} is deprecated and will be removed in a future version. " - "Use public APIs instead.", - DeprecationWarning, - # https://github.com/pandas-dev/pandas/pull/55139#pullrequestreview-1720690758 - # on hard-coding stacklevel - stacklevel=2, - ) - if name == "DatetimeTZBlock": - from pandas.core.internals.blocks import DatetimeTZBlock - - return DatetimeTZBlock - elif name == "ExtensionBlock": - from pandas.core.internals.blocks import ExtensionBlock - - return ExtensionBlock - else: - from pandas.core.internals.blocks import Block - - return Block - - raise AttributeError(f"module 'pandas.core.internals' has no attribute '{name}'") diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py index ef25d7ed5ae9e..04944db2ebd9c 100644 --- a/pandas/core/internals/api.py +++ b/pandas/core/internals/api.py @@ -10,6 +10,7 @@ from __future__ import annotations from typing import TYPE_CHECKING +import warnings import numpy as np @@ -87,15 +88,21 @@ def make_block( - Block.make_block_same_class - Block.__init__ """ + warnings.warn( + # GH#56815 + "make_block is deprecated and will be removed in a future version. " + "Use pd.api.internals.create_dataframe_from_blocks or " + "(recommended) higher-level public APIs instead.", + DeprecationWarning, + stacklevel=2, + ) + if dtype is not None: dtype = pandas_dtype(dtype) values, dtype = extract_pandas_array(values, dtype, ndim) - from pandas.core.internals.blocks import ( - DatetimeTZBlock, - ExtensionBlock, - ) + from pandas.core.internals.blocks import ExtensionBlock if klass is ExtensionBlock and isinstance(values.dtype, PeriodDtype): # GH-44681 changed PeriodArray to be stored in the 2D @@ -107,16 +114,6 @@ def make_block( dtype = dtype or values.dtype klass = get_block_type(dtype) - elif klass is DatetimeTZBlock and not isinstance(values.dtype, DatetimeTZDtype): - # pyarrow calls get here - values = DatetimeArray._simple_new( - # error: Argument "dtype" to "_simple_new" of "DatetimeArray" has - # incompatible type "Union[ExtensionDtype, dtype[Any], None]"; - # expected "Union[dtype[datetime64], DatetimeTZDtype]" - values, - dtype=dtype, # type: ignore[arg-type] - ) - if not isinstance(placement, BlockPlacement): placement = BlockPlacement(placement) @@ -146,48 +143,3 @@ def maybe_infer_ndim(values, placement: BlockPlacement, ndim: int | None) -> int else: ndim = values.ndim return ndim - - -def __getattr__(name: str): - # GH#55139 - import warnings - - if name in [ - "Block", - "ExtensionBlock", - "DatetimeTZBlock", - "create_block_manager_from_blocks", - ]: - # GH#33892 - warnings.warn( - f"{name} is deprecated and will be removed in a future version. 
" - "Use public APIs instead.", - DeprecationWarning, - # https://github.com/pandas-dev/pandas/pull/55139#pullrequestreview-1720690758 - # on hard-coding stacklevel - stacklevel=2, - ) - - if name == "create_block_manager_from_blocks": - from pandas.core.internals.managers import create_block_manager_from_blocks - - return create_block_manager_from_blocks - - elif name == "Block": - from pandas.core.internals.blocks import Block - - return Block - - elif name == "DatetimeTZBlock": - from pandas.core.internals.blocks import DatetimeTZBlock - - return DatetimeTZBlock - - elif name == "ExtensionBlock": - from pandas.core.internals.blocks import ExtensionBlock - - return ExtensionBlock - - raise AttributeError( - f"module 'pandas.core.internals.api' has no attribute '{name}'" - ) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 28d3292a1c65b..cffb1f658a640 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1863,6 +1863,8 @@ def fillna( ) -> list[Block]: if isinstance(self.dtype, IntervalDtype): # Block.fillna handles coercion (test_fillna_interval) + if limit is not None: + raise ValueError("limit must be None") return super().fillna( value=value, limit=limit, @@ -2151,14 +2153,6 @@ class DatetimeLikeBlock(NDArrayBackedExtensionBlock): values: DatetimeArray | TimedeltaArray -class DatetimeTZBlock(DatetimeLikeBlock): - """implement a datetime64 block with a tz attribute""" - - values: DatetimeArray - - __slots__ = () - - # ----------------------------------------------------------------- # Constructor Helpers @@ -2205,7 +2199,7 @@ def get_block_type(dtype: DtypeObj) -> type[Block]: cls : class, subclass of Block """ if isinstance(dtype, DatetimeTZDtype): - return DatetimeTZBlock + return DatetimeLikeBlock elif isinstance(dtype, PeriodDtype): return NDArrayBackedExtensionBlock elif isinstance(dtype, ExtensionDtype): diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index cea52bf8c91b2..23572975a1112 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -192,6 +192,7 @@ def ndarray_to_mgr( ) -> Manager: # used in DataFrame.__init__ # input must be a ndarray, list, Series, Index, ExtensionArray + infer_object = not isinstance(values, (ABCSeries, Index, ExtensionArray)) if isinstance(values, ABCSeries): if columns is None: @@ -287,15 +288,14 @@ def ndarray_to_mgr( # if we don't have a dtype specified, then try to convert objects # on the entire block; this is to convert if we have datetimelike's # embedded in an object type - if dtype is None and is_object_dtype(values.dtype): + if dtype is None and infer_object and is_object_dtype(values.dtype): obj_columns = list(values) maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns] # don't convert (and copy) the objects if no type inference occurs if any(x is not y for x, y in zip(obj_columns, maybe_datetime)): - dvals_list = [ensure_block_shape(dval, 2) for dval in maybe_datetime] block_values = [ - new_block_2d(dvals_list[n], placement=BlockPlacement(n)) - for n in range(len(dvals_list)) + new_block_2d(ensure_block_shape(dval, 2), placement=BlockPlacement(n)) + for n, dval in enumerate(maybe_datetime) ] else: bp = BlockPlacement(slice(len(columns))) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 7c1bcbec1d3f2..79cba9275a119 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -249,7 +249,7 @@ def 
blklocs(self) -> npt.NDArray[np.intp]: def make_empty(self, axes=None) -> Self: """return an empty BlockManager with the items axis of len 0""" if axes is None: - axes = [Index([])] + self.axes[1:] + axes = [default_index(0)] + self.axes[1:] # preserve dtype if possible if self.ndim == 1: @@ -353,6 +353,8 @@ def arrays(self) -> list[ArrayLike]: Warning! The returned arrays don't handle Copy-on-Write, so this should be used with caution (only in read-mode). """ + # TODO: Deprecate, usage in Dask + # https://github.com/dask/dask/blob/484fc3f1136827308db133cd256ba74df7a38d8c/dask/base.py#L1312 return [blk.values for blk in self.blocks] def __repr__(self) -> str: @@ -819,11 +821,13 @@ def reindex_indexer( raise IndexError("Requested axis not found in manager") if axis == 0: - new_blocks = self._slice_take_blocks_ax0( - indexer, - fill_value=fill_value, - only_slice=only_slice, - use_na_proxy=use_na_proxy, + new_blocks = list( + self._slice_take_blocks_ax0( + indexer, + fill_value=fill_value, + only_slice=only_slice, + use_na_proxy=use_na_proxy, + ) ) else: new_blocks = [ @@ -855,7 +859,7 @@ def _slice_take_blocks_ax0( *, use_na_proxy: bool = False, ref_inplace_op: bool = False, - ) -> list[Block]: + ) -> Generator[Block, None, None]: """ Slice/take blocks along axis=0. @@ -873,9 +877,9 @@ def _slice_take_blocks_ax0( ref_inplace_op: bool, default False Don't track refs if True because we operate inplace - Returns - ------- - new_blocks : list of Block + Yields + ------ + Block : New Block """ allow_fill = fill_value is not lib.no_default @@ -890,9 +894,10 @@ def _slice_take_blocks_ax0( # GH#32959 EABlock would fail since we can't make 0-width # TODO(EA2D): special casing unnecessary with 2D EAs if sllen == 0: - return [] + return bp = BlockPlacement(slice(0, sllen)) - return [blk.getitem_block_columns(slobj, new_mgr_locs=bp)] + yield blk.getitem_block_columns(slobj, new_mgr_locs=bp) + return elif not allow_fill or self.ndim == 1: if allow_fill and fill_value is None: fill_value = blk.fill_value @@ -900,25 +905,21 @@ def _slice_take_blocks_ax0( if not allow_fill and only_slice: # GH#33597 slice instead of take, so we get # views instead of copies - blocks = [ - blk.getitem_block_columns( + for i, ml in enumerate(slobj): + yield blk.getitem_block_columns( slice(ml, ml + 1), new_mgr_locs=BlockPlacement(i), ref_inplace_op=ref_inplace_op, ) - for i, ml in enumerate(slobj) - ] - return blocks else: bp = BlockPlacement(slice(0, sllen)) - return [ - blk.take_nd( - slobj, - axis=0, - new_mgr_locs=bp, - fill_value=fill_value, - ) - ] + yield blk.take_nd( + slobj, + axis=0, + new_mgr_locs=bp, + fill_value=fill_value, + ) + return if sl_type == "slice": blknos = self.blknos[slobj] @@ -933,18 +934,15 @@ def _slice_take_blocks_ax0( # When filling blknos, make sure blknos is updated before appending to # blocks list, that way new blkno is exactly len(blocks). 
- blocks = [] group = not only_slice for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=group): if blkno == -1: # If we've got here, fill_value was not lib.no_default - blocks.append( - self._make_na_block( - placement=mgr_locs, - fill_value=fill_value, - use_na_proxy=use_na_proxy, - ) + yield self._make_na_block( + placement=mgr_locs, + fill_value=fill_value, + use_na_proxy=use_na_proxy, ) else: blk = self.blocks[blkno] @@ -959,7 +957,7 @@ def _slice_take_blocks_ax0( for mgr_loc in mgr_locs: newblk = blk.copy(deep=deep) newblk.mgr_locs = BlockPlacement(slice(mgr_loc, mgr_loc + 1)) - blocks.append(newblk) + yield newblk else: # GH#32779 to avoid the performance penalty of copying, @@ -970,7 +968,7 @@ def _slice_take_blocks_ax0( if isinstance(taker, slice): nb = blk.getitem_block_columns(taker, new_mgr_locs=mgr_locs) - blocks.append(nb) + yield nb elif only_slice: # GH#33597 slice instead of take, so we get # views instead of copies @@ -979,12 +977,10 @@ def _slice_take_blocks_ax0( bp = BlockPlacement(ml) nb = blk.getitem_block_columns(slc, new_mgr_locs=bp) # We have np.shares_memory(nb.values, blk.values) - blocks.append(nb) + yield nb else: nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs) - blocks.append(nb) - - return blocks + yield nb def _make_na_block( self, placement: BlockPlacement, fill_value=None, use_na_proxy: bool = False @@ -2068,7 +2064,7 @@ def array(self) -> ArrayLike: """ Quick access to the backing array of the Block. """ - return self.arrays[0] + return self.blocks[0].values # error: Cannot override writeable attribute with read-only property @property diff --git a/pandas/core/methods/selectn.py b/pandas/core/methods/selectn.py index 283acaca2c117..02e7445f1d275 100644 --- a/pandas/core/methods/selectn.py +++ b/pandas/core/methods/selectn.py @@ -29,6 +29,8 @@ ) from pandas.core.dtypes.dtypes import BaseMaskedDtype +from pandas.core.indexes.api import default_index + if TYPE_CHECKING: from pandas._typing import ( DtypeObj, @@ -38,6 +40,7 @@ from pandas import ( DataFrame, + Index, Series, ) else: @@ -199,8 +202,6 @@ def __init__(self, obj: DataFrame, n: int, keep: str, columns: IndexLabel) -> No self.columns = columns def compute(self, method: str) -> DataFrame: - from pandas.core.api import Index - n = self.n frame = self.obj columns = self.columns @@ -227,7 +228,7 @@ def get_indexer(current_indexer: Index, other_indexer: Index) -> Index: original_index = frame.index cur_frame = frame = frame.reset_index(drop=True) cur_n = n - indexer = Index([], dtype=np.int64) + indexer: Index = default_index(0) for i, column in enumerate(columns): # For each column we apply method to cur_frame[column]. diff --git a/pandas/core/ops/docstrings.py b/pandas/core/ops/docstrings.py index 8063a52a02163..0ad6db0aefe9c 100644 --- a/pandas/core/ops/docstrings.py +++ b/pandas/core/ops/docstrings.py @@ -436,6 +436,7 @@ def make_flex_doc(op_name: str, typ: str) -> str: Parameters ---------- other : Series or scalar value + The second operand in this operation. level : int or name Broadcast across a level, matching Index values on the passed MultiIndex level. 
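The `pandas/core/internals/managers.py` hunks above convert `_slice_take_blocks_ax0` from building and returning a list of blocks into a generator, with `reindex_indexer` (the one call site that needs a concrete list) now materializing it via `list(...)`. A self-contained sketch of the same pattern, using hypothetical names rather than pandas internals:

```python
from collections.abc import Generator


def take_blocks(values: list[int], only_positive: bool) -> Generator[int, None, None]:
    """Yield items one at a time instead of appending them to an interim list."""
    for v in values:
        if not only_positive or v > 0:
            yield v  # previously: blocks.append(v) ... return blocks


# Lazy consumers never build the intermediate list:
print(sum(take_blocks([3, -1, 2], only_positive=True)))  # 5

# Eager call sites (cf. reindex_indexer) materialize explicitly:
blocks = list(take_blocks([3, -1, 2], only_positive=False))
print(blocks)  # [3, -1, 2]
```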
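The `pandas/core/indexing.py` hunks further above enforce the 2.2 deprecation: a callable passed to ``iloc`` that returns a tuple now raises instead of warning (`_check_deprecated_callable_usage` became `_raise_callable_usage`). A minimal sketch of the user-facing change; the frame below is illustrative, not taken from the PR's tests:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

# A callable may still return a plain positional indexer:
print(df.iloc[lambda d: [0, 2]])  # rows 0 and 2

# Returning a *tuple* from the callable, a FutureWarning since 2.2 (GH#53533),
# now raises:
try:
    df.iloc[lambda d: (0, 1)]
except ValueError as exc:
    print(exc)  # Returning a tuple from a callable with iloc is not allowed.

# The supported spelling passes the tuple directly:
print(df.iloc[0, 1])  # 4
```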
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index d17e5b475ae57..7055201b5a1ee 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -171,7 +171,7 @@ def concat( Parameters ---------- objs : an iterable or mapping of Series or DataFrame objects - If a mapping is passed, the sorted keys will be used as the `keys` + If a mapping is passed, the keys will be used as the `keys` argument, unless it is passed, in which case the values will be selected (see below). Any None objects will be dropped silently unless they are all None in which case a ValueError will be raised. @@ -560,7 +560,7 @@ def get_result(self): # combine as columns in a frame else: - data = dict(zip(range(len(self.objs)), self.objs)) + data = dict(enumerate(self.objs)) # GH28330 Preserves subclassed objects through concat cons = sample._constructor_expanddim @@ -874,7 +874,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde if isinstance(new_index, MultiIndex): new_levels.extend(new_index.levels) - new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes]) + new_codes.extend(np.tile(lab, kpieces) for lab in new_index.codes) else: new_levels.append(new_index.unique()) single_codes = new_index.unique().get_indexer(new_index) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index b4720306094e9..294de2cf2fe1d 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -5,7 +5,10 @@ import numpy as np -from pandas.core.dtypes.common import is_list_like +from pandas.core.dtypes.common import ( + is_iterator, + is_list_like, +) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.missing import notna @@ -64,9 +67,10 @@ def melt( value_vars : scalar, tuple, list, or ndarray, optional Column(s) to unpivot. If not specified, uses all columns that are not set as `id_vars`. - var_name : scalar, default None + var_name : scalar, tuple, list, or ndarray, optional Name to use for the 'variable' column. If None it uses - ``frame.columns.name`` or 'variable'. + ``frame.columns.name`` or 'variable'. Must be a scalar if columns are not + a MultiIndex. value_name : scalar, default 'value' Name to use for the 'value' column, can't be an existing column label. col_level : scalar, optional @@ -217,7 +221,16 @@ def melt( frame.columns.name if frame.columns.name is not None else "variable" ] elif is_list_like(var_name): - raise ValueError(f"{var_name=} must be a scalar.") + if isinstance(frame.columns, MultiIndex): + if is_iterator(var_name): + var_name = list(var_name) + if len(var_name) > len(frame.columns): + raise ValueError( + f"{var_name=} has {len(var_name)} items, " + f"but the dataframe columns only have {len(frame.columns)} levels."
+ ) + else: + raise ValueError(f"{var_name=} must be a scalar.") else: var_name = [var_name] diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index e6e84c2135b82..2ce77ac19b9c5 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -39,11 +39,7 @@ npt, ) from pandas.errors import MergeError -from pandas.util._decorators import ( - Appender, - Substitution, - cache_readonly, -) +from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.base import ExtensionDtype @@ -95,7 +91,6 @@ ensure_wrapped_if_datetimelike, extract_array, ) -from pandas.core.frame import _merge_doc from pandas.core.indexes.api import default_index from pandas.core.sorting import ( get_group_index, @@ -133,8 +128,6 @@ _known = (np.ndarray, ExtensionArray, Index, ABCSeries) -@Substitution("\nleft : DataFrame or named Series") -@Appender(_merge_doc, indents=0) def merge( left: DataFrame | Series, right: DataFrame | Series, @@ -150,6 +143,210 @@ def merge( indicator: str | bool = False, validate: str | None = None, ) -> DataFrame: + """ + Merge DataFrame or named Series objects with a database-style join. + + A named Series object is treated as a DataFrame with a single named column. + + The join is done on columns or indexes. If joining columns on + columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes + on indexes or indexes on a column or columns, the index will be passed on. + When performing a cross merge, no column specifications to merge on are + allowed. + + .. warning:: + + If both key columns contain rows where the key is a null value, those + rows will be matched against each other. This is different from usual SQL + join behaviour and can lead to unexpected results. + + Parameters + ---------- + left : DataFrame or named Series + First pandas object to merge. + right : DataFrame or named Series + Second pandas object to merge. + how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' + Type of merge to be performed. + + * left: use only keys from left frame, similar to a SQL left outer join; + preserve key order. + * right: use only keys from right frame, similar to a SQL right outer join; + preserve key order. + * outer: use union of keys from both frames, similar to a SQL full outer + join; sort keys lexicographically. + * inner: use intersection of keys from both frames, similar to a SQL inner + join; preserve the order of the left keys. + * cross: creates the cartesian product from both frames, preserves the order + of the left keys. + on : label or list + Column or index level names to join on. These must be found in both + DataFrames. If `on` is None and not merging on indexes then this defaults + to the intersection of the columns in both DataFrames. + left_on : label or list, or array-like + Column or index level names to join on in the left DataFrame. Can also + be an array or list of arrays of the length of the left DataFrame. + These arrays are treated as if they are columns. + right_on : label or list, or array-like + Column or index level names to join on in the right DataFrame. Can also + be an array or list of arrays of the length of the right DataFrame. + These arrays are treated as if they are columns. + left_index : bool, default False + Use the index from the left DataFrame as the join key(s). If it is a + MultiIndex, the number of keys in the other DataFrame (either the index + or a number of columns) must match the number of levels. 
+ right_index : bool, default False + Use the index from the right DataFrame as the join key. Same caveats as + left_index. + sort : bool, default False + Sort the join keys lexicographically in the result DataFrame. If False, + the order of the join keys depends on the join type (how keyword). + suffixes : list-like, default is ("_x", "_y") + A length-2 sequence where each element is optionally a string + indicating the suffix to add to overlapping column names in + `left` and `right` respectively. Pass a value of `None` instead + of a string to indicate that the column name from `left` or + `right` should be left as-is, with no suffix. At least one of the + values must not be None. + copy : bool, default False + If False, avoid copy if possible. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + + .. deprecated:: 3.0.0 + indicator : bool or str, default False + If True, adds a column to the output DataFrame called "_merge" with + information on the source of each row. The column can be given a different + name by providing a string argument. The column will have a Categorical + type with the value of "left_only" for observations whose merge key only + appears in the left DataFrame, "right_only" for observations + whose merge key only appears in the right DataFrame, and "both" + if the observation's merge key is found in both DataFrames. + + validate : str, optional + If specified, checks if merge is of specified type. + + * "one_to_one" or "1:1": check if merge keys are unique in both + left and right datasets. + * "one_to_many" or "1:m": check if merge keys are unique in left + dataset. + * "many_to_one" or "m:1": check if merge keys are unique in right + dataset. + * "many_to_many" or "m:m": allowed, but does not result in checks. + + Returns + ------- + DataFrame + A DataFrame of the two merged objects. + + See Also + -------- + merge_ordered : Merge with optional filling/interpolation. + merge_asof : Merge on nearest keys. + DataFrame.join : Similar method using indices. + + Examples + -------- + >>> df1 = pd.DataFrame( + ... {"lkey": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 5]} + ... ) + >>> df2 = pd.DataFrame( + ... {"rkey": ["foo", "bar", "baz", "foo"], "value": [5, 6, 7, 8]} + ... ) + >>> df1 + lkey value + 0 foo 1 + 1 bar 2 + 2 baz 3 + 3 foo 5 + >>> df2 + rkey value + 0 foo 5 + 1 bar 6 + 2 baz 7 + 3 foo 8 + + Merge df1 and df2 on the lkey and rkey columns. The value columns have + the default suffixes, _x and _y, appended. + + >>> df1.merge(df2, left_on="lkey", right_on="rkey") + lkey value_x rkey value_y + 0 foo 1 foo 5 + 1 foo 1 foo 8 + 2 bar 2 bar 6 + 3 baz 3 baz 7 + 4 foo 5 foo 5 + 5 foo 5 foo 8 + + Merge DataFrames df1 and df2 with specified left and right suffixes + appended to any overlapping columns. + + >>> df1.merge(df2, left_on="lkey", right_on="rkey", suffixes=("_left", "_right")) + lkey value_left rkey value_right + 0 foo 1 foo 5 + 1 foo 1 foo 8 + 2 bar 2 bar 6 + 3 baz 3 baz 7 + 4 foo 5 foo 5 + 5 foo 5 foo 8 + + Merge DataFrames df1 and df2, but raise an exception if the DataFrames have + any overlapping columns.
+ + >>> df1.merge(df2, left_on="lkey", right_on="rkey", suffixes=(False, False)) + Traceback (most recent call last): + ... + ValueError: columns overlap but no suffix specified: + Index(['value'], dtype='object') + + >>> df1 = pd.DataFrame({"a": ["foo", "bar"], "b": [1, 2]}) + >>> df2 = pd.DataFrame({"a": ["foo", "baz"], "c": [3, 4]}) + >>> df1 + a b + 0 foo 1 + 1 bar 2 + >>> df2 + a c + 0 foo 3 + 1 baz 4 + + >>> df1.merge(df2, how="inner", on="a") + a b c + 0 foo 1 3 + + >>> df1.merge(df2, how="left", on="a") + a b c + 0 foo 1 3.0 + 1 bar 2 NaN + + >>> df1 = pd.DataFrame({"left": ["foo", "bar"]}) + >>> df2 = pd.DataFrame({"right": [7, 8]}) + >>> df1 + left + 0 foo + 1 bar + >>> df2 + right + 0 7 + 1 8 + + >>> df1.merge(df2, how="cross") + left right + 0 foo 7 + 1 foo 8 + 2 bar 7 + 3 bar 8 + """ left_df = _validate_operand(left) left._check_copy_deprecation(copy) right_df = _validate_operand(right) @@ -316,7 +513,9 @@ def merge_ordered( Parameters ---------- left : DataFrame or named Series + First pandas object to merge. right : DataFrame or named Series + Second pandas object to merge. on : label or list Field names to join on. Must be found in both DataFrames. left_on : label or list, or array-like @@ -474,7 +673,9 @@ def merge_asof( Parameters ---------- left : DataFrame or named Series + First pandas object to merge. right : DataFrame or named Series + Second pandas object to merge. on : label Field name to join on. Must be found in both DataFrames. The data MUST be ordered. Furthermore this must be a numeric column, @@ -513,6 +714,7 @@ def merge_asof( Returns ------- DataFrame + A DataFrame of the two merged objects. See Also -------- diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index e0126d439a79c..8c2c2053b0554 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -11,10 +11,6 @@ import numpy as np from pandas._libs import lib -from pandas.util._decorators import ( - Appender, - Substitution, -) from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.common import ( @@ -29,7 +25,6 @@ ) import pandas.core.common as com -from pandas.core.frame import _shared_docs from pandas.core.groupby import Grouper from pandas.core.indexes.api import ( Index, @@ -54,10 +49,6 @@ from pandas import DataFrame -# Note: We need to make sure `frame` is imported before `pivot`, otherwise -# _shared_docs['pivot_table'] will not yet exist. TODO: Fix this dependency -@Substitution("\ndata : DataFrame") -@Appender(_shared_docs["pivot_table"], indents=1) def pivot_table( data: DataFrame, values=None, @@ -70,7 +61,178 @@ def pivot_table( margins_name: Hashable = "All", observed: bool = True, sort: bool = True, + **kwargs, ) -> DataFrame: + """ + Create a spreadsheet-style pivot table as a DataFrame. + + The levels in the pivot table will be stored in MultiIndex objects + (hierarchical indexes) on the index and columns of the result DataFrame. + + Parameters + ---------- + data : DataFrame + Input pandas DataFrame object. + values : list-like or scalar, optional + Column or columns to aggregate. + index : column, Grouper, array, or list of the previous + Keys to group by on the pivot table index. If a list is passed, + it can contain any of the other types (except list). If an array is + passed, it must be the same length as the data and will be used in + the same manner as column values. + columns : column, Grouper, array, or list of the previous + Keys to group by on the pivot table column. 
If a list is passed, + it can contain any of the other types (except list). If an array is + passed, it must be the same length as the data and will be used in + the same manner as column values. + aggfunc : function, list of functions, dict, default "mean" + If a list of functions is passed, the resulting pivot table will have + hierarchical columns whose top level are the function names + (inferred from the function objects themselves). + If a dict is passed, the key is column to aggregate and the value is + function or list of functions. If ``margins=True``, aggfunc will be + used to calculate the partial aggregates. + fill_value : scalar, default None + Value to replace missing values with (in the resulting pivot table, + after aggregation). + margins : bool, default False + If ``margins=True``, special ``All`` columns and rows + will be added with partial group aggregates across the categories + on the rows and columns. + dropna : bool, default True + Do not include columns whose entries are all NaN. If True, + rows with a NaN value in any column will be omitted before + computing margins. + margins_name : str, default 'All' + Name of the row / column that will contain the totals + when margins is True. + observed : bool, default True + This only applies if any of the groupers are Categoricals. + If True: only show observed values for categorical groupers. + If False: show all values for categorical groupers. + + .. versionchanged:: 3.0.0 + + The default value is now ``True``. + + sort : bool, default True + Specifies if the result should be sorted. + + .. versionadded:: 1.3.0 + + **kwargs : dict + Optional keyword arguments to pass to ``aggfunc``. + + .. versionadded:: 3.0.0 + + Returns + ------- + DataFrame + An Excel style pivot table. + + See Also + -------- + DataFrame.pivot : Pivot without aggregation that can handle + non-numeric data. + DataFrame.melt : Unpivot a DataFrame from wide to long format, + optionally leaving identifiers set. + wide_to_long : Wide panel to long format. Less flexible but more + user-friendly than melt. + + Notes + ----- + Reference :ref:`the user guide <reshaping.pivot>` for more examples. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], + ... "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], + ... "C": [ + ... "small", + ... "large", + ... "large", + ... "small", + ... "small", + ... "large", + ... "small", + ... "small", + ... "large", + ... ], + ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], + ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], + ... } + ... ) + >>> df + A B C D E + 0 foo one small 1 2 + 1 foo one large 2 4 + 2 foo one large 2 5 + 3 foo two small 3 5 + 4 foo two small 3 6 + 5 bar one large 4 6 + 6 bar one small 5 8 + 7 bar two small 6 9 + 8 bar two large 7 9 + + This first example aggregates values by taking the sum. + + >>> table = pd.pivot_table( + ... df, values="D", index=["A", "B"], columns=["C"], aggfunc="sum" + ... ) + >>> table + C large small + A B + bar one 4.0 5.0 + two 7.0 6.0 + foo one 4.0 1.0 + two NaN 6.0 + + We can also fill missing values using the `fill_value` parameter. + + >>> table = pd.pivot_table( + ... df, values="D", index=["A", "B"], columns=["C"], aggfunc="sum", fill_value=0 + ... ) + >>> table + C large small + A B + bar one 4 5 + two 7 6 + foo one 4 1 + two 0 6 + + The next example aggregates by taking the mean across multiple columns. + + >>> table = pd.pivot_table( + ... 
df, values=["D", "E"], index=["A", "C"], aggfunc={"D": "mean", "E": "mean"} + ... ) + >>> table + D E + A C + bar large 5.500000 7.500000 + small 5.500000 8.500000 + foo large 2.000000 4.500000 + small 2.333333 4.333333 + + We can also calculate multiple types of aggregations for any given + value column. + + >>> table = pd.pivot_table( + ... df, + ... values=["D", "E"], + ... index=["A", "C"], + ... aggfunc={"D": "mean", "E": ["min", "max", "mean"]}, + ... ) + >>> table + D E + mean max mean min + A C + bar large 5.500000 9 7.500000 6 + small 5.500000 9 8.500000 8 + foo large 2.000000 5 4.500000 4 + small 2.333333 6 4.333333 2 + """ index = _convert_by(index) columns = _convert_by(columns) @@ -90,6 +252,7 @@ def pivot_table( margins_name=margins_name, observed=observed, sort=sort, + kwargs=kwargs, ) pieces.append(_table) keys.append(getattr(func, "__name__", func)) @@ -109,6 +272,7 @@ def pivot_table( margins_name, observed, sort, + kwargs, ) return table.__finalize__(data, method="pivot_table") @@ -125,6 +289,7 @@ def __internal_pivot_table( margins_name: Hashable, observed: bool, sort: bool, + kwargs, ) -> DataFrame: """ Helper of :func:`pandas.pivot_table` for any non-list ``aggfunc``. @@ -167,7 +332,7 @@ def __internal_pivot_table( values = list(values) grouped = data.groupby(keys, observed=observed, sort=sort, dropna=dropna) - agged = grouped.agg(aggfunc) + agged = grouped.agg(aggfunc, **kwargs) if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): agged = agged.dropna(how="all") @@ -222,6 +387,7 @@ def __internal_pivot_table( rows=index, cols=columns, aggfunc=aggfunc, + kwargs=kwargs, observed=dropna, margins_name=margins_name, fill_value=fill_value, @@ -247,6 +413,7 @@ def _add_margins( rows, cols, aggfunc, + kwargs, observed: bool, margins_name: Hashable = "All", fill_value=None, @@ -259,7 +426,7 @@ def _add_margins( if margins_name in table.index.get_level_values(level): raise ValueError(msg) - grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name) + grand_margin = _compute_grand_margin(data, values, aggfunc, kwargs, margins_name) if table.ndim == 2: # i.e. 
DataFrame @@ -280,7 +447,15 @@ def _add_margins( elif values: marginal_result_set = _generate_marginal_results( - table, data, values, rows, cols, aggfunc, observed, margins_name + table, + data, + values, + rows, + cols, + aggfunc, + kwargs, + observed, + margins_name, ) if not isinstance(marginal_result_set, tuple): return marginal_result_set @@ -289,7 +464,7 @@ def _add_margins( # no values, and table is a DataFrame assert isinstance(table, ABCDataFrame) marginal_result_set = _generate_marginal_results_without_values( - table, data, rows, cols, aggfunc, observed, margins_name + table, data, rows, cols, aggfunc, kwargs, observed, margins_name ) if not isinstance(marginal_result_set, tuple): return marginal_result_set @@ -326,26 +501,26 @@ def _add_margins( def _compute_grand_margin( - data: DataFrame, values, aggfunc, margins_name: Hashable = "All" + data: DataFrame, values, aggfunc, kwargs, margins_name: Hashable = "All" ): if values: grand_margin = {} for k, v in data[values].items(): try: if isinstance(aggfunc, str): - grand_margin[k] = getattr(v, aggfunc)() + grand_margin[k] = getattr(v, aggfunc)(**kwargs) elif isinstance(aggfunc, dict): if isinstance(aggfunc[k], str): - grand_margin[k] = getattr(v, aggfunc[k])() + grand_margin[k] = getattr(v, aggfunc[k])(**kwargs) else: - grand_margin[k] = aggfunc[k](v) + grand_margin[k] = aggfunc[k](v, **kwargs) else: - grand_margin[k] = aggfunc(v) + grand_margin[k] = aggfunc(v, **kwargs) except TypeError: pass return grand_margin else: - return {margins_name: aggfunc(data.index)} + return {margins_name: aggfunc(data.index, **kwargs)} def _generate_marginal_results( @@ -355,6 +530,7 @@ def _generate_marginal_results( rows, cols, aggfunc, + kwargs, observed: bool, margins_name: Hashable = "All", ): @@ -368,7 +544,11 @@ def _all_key(key): return (key, margins_name) + ("",) * (len(cols) - 1) if len(rows) > 0: - margin = data[rows + values].groupby(rows, observed=observed).agg(aggfunc) + margin = ( + data[rows + values] + .groupby(rows, observed=observed) + .agg(aggfunc, **kwargs) + ) cat_axis = 1 for key, piece in table.T.groupby(level=0, observed=observed): @@ -393,7 +573,7 @@ def _all_key(key): table_pieces.append(piece) # GH31016 this is to calculate margin for each group, and assign # corresponded key as index - transformed_piece = DataFrame(piece.apply(aggfunc)).T + transformed_piece = DataFrame(piece.apply(aggfunc, **kwargs)).T if isinstance(piece.index, MultiIndex): # We are adding an empty level transformed_piece.index = MultiIndex.from_tuples( @@ -423,7 +603,9 @@ def _all_key(key): margin_keys = table.columns if len(cols) > 0: - row_margin = data[cols + values].groupby(cols, observed=observed).agg(aggfunc) + row_margin = ( + data[cols + values].groupby(cols, observed=observed).agg(aggfunc, **kwargs) + ) row_margin = row_margin.stack() # GH#26568. 
Use names instead of indices in case of numeric names @@ -442,6 +624,7 @@ def _generate_marginal_results_without_values( rows, cols, aggfunc, + kwargs, observed: bool, margins_name: Hashable = "All", ): @@ -456,14 +639,16 @@ def _all_key(): return (margins_name,) + ("",) * (len(cols) - 1) if len(rows) > 0: - margin = data.groupby(rows, observed=observed)[rows].apply(aggfunc) + margin = data.groupby(rows, observed=observed)[rows].apply( + aggfunc, **kwargs + ) all_key = _all_key() table[all_key] = margin result = table margin_keys.append(all_key) else: - margin = data.groupby(level=0, observed=observed).apply(aggfunc) + margin = data.groupby(level=0, observed=observed).apply(aggfunc, **kwargs) all_key = _all_key() table[all_key] = margin result = table @@ -474,7 +659,9 @@ def _all_key(): margin_keys = table.columns if len(cols): - row_margin = data.groupby(cols, observed=observed)[cols].apply(aggfunc) + row_margin = data.groupby(cols, observed=observed)[cols].apply( + aggfunc, **kwargs + ) else: row_margin = Series(np.nan, index=result.columns) @@ -495,8 +682,6 @@ def _convert_by(by): return by -@Substitution("\ndata : DataFrame") -@Appender(_shared_docs["pivot"], indents=1) def pivot( data: DataFrame, *, @@ -504,6 +689,152 @@ def pivot( index: IndexLabel | lib.NoDefault = lib.no_default, values: IndexLabel | lib.NoDefault = lib.no_default, ) -> DataFrame: + """ + Return reshaped DataFrame organized by given index / column values. + + Reshape data (produce a "pivot" table) based on column values. Uses + unique values from specified `index` / `columns` to form axes of the + resulting DataFrame. This function does not support data + aggregation; multiple values will result in a MultiIndex in the + columns. See the :ref:`User Guide <reshaping>` for more on reshaping. + + Parameters + ---------- + data : DataFrame + Input pandas DataFrame object. + columns : str or object or a list of str + Column to use to make new frame's columns. + index : str or object or a list of str, optional + Column to use to make new frame's index. If not given, uses existing index. + values : str, object or a list of the previous, optional + Column(s) to use for populating new frame's values. If not + specified, all remaining columns will be used and the result will + have hierarchically indexed columns. + + Returns + ------- + DataFrame + Returns reshaped DataFrame. + + Raises + ------ + ValueError + When there are any `index`, `columns` combinations with multiple + values. Use `DataFrame.pivot_table` when you need to aggregate. + + See Also + -------- + DataFrame.pivot_table : Generalization of pivot that can handle + duplicate values for one index/column pair. + DataFrame.unstack : Pivot based on the index values instead of a + column. + wide_to_long : Wide panel to long format. Less flexible but more + user-friendly than melt. + + Notes + ----- + For finer-tuned control, see hierarchical indexing documentation along + with the related stack/unstack methods. + + Reference :ref:`the user guide <reshaping.pivot>` for more examples. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "foo": ["one", "one", "one", "two", "two", "two"], + ... "bar": ["A", "B", "C", "A", "B", "C"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... "zoo": ["x", "y", "z", "q", "w", "t"], + ... } + ... 
) + >>> df + foo bar baz zoo + 0 one A 1 x + 1 one B 2 y + 2 one C 3 z + 3 two A 4 q + 4 two B 5 w + 5 two C 6 t + + >>> df.pivot(index="foo", columns="bar", values="baz") + bar A B C + foo + one 1 2 3 + two 4 5 6 + + >>> df.pivot(index="foo", columns="bar")["baz"] + bar A B C + foo + one 1 2 3 + two 4 5 6 + + >>> df.pivot(index="foo", columns="bar", values=["baz", "zoo"]) + baz zoo + bar A B C A B C + foo + one 1 2 3 x y z + two 4 5 6 q w t + + You could also assign a list of column names or a list of index names. + + >>> df = pd.DataFrame( + ... { + ... "lev1": [1, 1, 1, 2, 2, 2], + ... "lev2": [1, 1, 2, 1, 1, 2], + ... "lev3": [1, 2, 1, 2, 1, 2], + ... "lev4": [1, 2, 3, 4, 5, 6], + ... "values": [0, 1, 2, 3, 4, 5], + ... } + ... ) + >>> df + lev1 lev2 lev3 lev4 values + 0 1 1 1 1 0 + 1 1 1 2 2 1 + 2 1 2 1 3 2 + 3 2 1 2 4 3 + 4 2 1 1 5 4 + 5 2 2 2 6 5 + + >>> df.pivot(index="lev1", columns=["lev2", "lev3"], values="values") + lev2 1 2 + lev3 1 2 1 2 + lev1 + 1 0.0 1.0 2.0 NaN + 2 4.0 3.0 NaN 5.0 + + >>> df.pivot(index=["lev1", "lev2"], columns=["lev3"], values="values") + lev3 1 2 + lev1 lev2 + 1 1 0.0 1.0 + 2 2.0 NaN + 2 1 4.0 3.0 + 2 NaN 5.0 + + A ValueError is raised if there are any duplicates. + + >>> df = pd.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two"], + ... "bar": ["A", "A", "B", "C"], + ... "baz": [1, 2, 3, 4], + ... } + ... ) + >>> df + foo bar baz + 0 one A 1 + 1 one A 2 + 2 two B 3 + 3 two C 4 + + Notice that the first two rows are the same for our `index` + and `columns` arguments. + + >>> df.pivot(index="foo", columns="bar", values="baz") + Traceback (most recent call last): + ... + ValueError: Index contains duplicate entries, cannot reshape + """ columns_listlike = com.convert_to_list_like(columns) # If columns is None we will create a MultiIndex level with None as name diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 01cc85ceff181..664ac57fcc823 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -42,7 +42,7 @@ from pandas.core.indexes.api import ( Index, MultiIndex, - RangeIndex, + default_index, ) from pandas.core.reshape.concat import concat from pandas.core.series import Series @@ -137,24 +137,24 @@ def __init__( self.removed_level = self.removed_level.take(unique_codes) self.removed_level_full = self.removed_level_full.take(unique_codes) - # Bug fix GH 20601 - # If the data frame is too big, the number of unique index combination - # will cause int32 overflow on windows environments. - # We want to check and raise an warning before this happens - num_rows = np.max([index_level.size for index_level in self.new_index_levels]) - num_columns = self.removed_level.size - - # GH20601: This forces an overflow if the number of cells is too high. - num_cells = num_rows * num_columns - - # GH 26314: Previous ValueError raised was too restrictive for many users. - if get_option("performance_warnings") and num_cells > np.iinfo(np.int32).max: - warnings.warn( - f"The following operation may generate {num_cells} cells " - f"in the resulting pandas object.", - PerformanceWarning, - stacklevel=find_stack_level(), - ) + if get_option("performance_warnings"): + # Bug fix GH 20601 + # If the data frame is too big, the number of unique index combinations + # will cause int32 overflow on windows environments.
+ # We want to check and raise a warning before this happens + num_rows = max(index_level.size for index_level in self.new_index_levels) + num_columns = self.removed_level.size + + # GH20601: This forces an overflow if the number of cells is too high. + # GH 26314: Previous ValueError raised was too restrictive for many users. + num_cells = num_rows * num_columns + if num_cells > np.iinfo(np.int32).max: + warnings.warn( + f"The following operation may generate {num_cells} cells " + f"in the resulting pandas object.", + PerformanceWarning, + stacklevel=find_stack_level(), + ) self._make_selectors() @@ -168,6 +168,9 @@ def _indexer_and_to_sort( v = self.level codes = list(self.index.codes) + if not self.sort: + # Create new codes considering that labels are already sorted + codes = [factorize(code)[0] for code in codes] levs = list(self.index.levels) to_sort = codes[:v] + codes[v + 1 :] + [codes[v]] sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]]) @@ -186,12 +189,9 @@ def sorted_labels(self) -> list[np.ndarray]: return to_sort def _make_sorted_values(self, values: np.ndarray) -> np.ndarray: - if self.sort: - indexer, _ = self._indexer_and_to_sort - - sorted_values = algos.take_nd(values, indexer, axis=0) - return sorted_values - return values + indexer, _ = self._indexer_and_to_sort + sorted_values = algos.take_nd(values, indexer, axis=0) + return sorted_values def _make_selectors(self) -> None: new_levels = self.new_index_levels @@ -394,7 +394,13 @@ def _repeater(self) -> np.ndarray: @cache_readonly def new_index(self) -> MultiIndex | Index: # Does not depend on values or value_columns - result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]] + if self.sort: + labels = self.sorted_labels[:-1] + else: + v = self.level + codes = list(self.index.codes) + labels = codes[:v] + codes[v + 1 :] + result_codes = [lab.take(self.compressor) for lab in labels] # construct the new index if len(self.new_index_levels) == 1: @@ -731,10 +737,10 @@ def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex | Index: if len(columns.levels) <= 2: return columns.levels[0]._rename(name=columns.names[0]) - levs = [ + levs = ( [lev[c] if c >= 0 else None for c in codes] for lev, codes in zip(columns.levels[:-1], columns.codes[:-1]) - ] + ) # Remove duplicate tuples in the MultiIndex. tuples = zip(*levs) @@ -836,7 +842,7 @@ def _convert_level_number(level_num: int, columns: Index): [x._values.astype(dtype, copy=False) for _, x in subset.items()] ) N, K = subset.shape - idx = np.arange(N * K).reshape(K, N).T.ravel() + idx = np.arange(N * K).reshape(K, N).T.reshape(-1) value_slice = value_slice.take(idx) else: value_slice = subset.values @@ -918,20 +924,33 @@ def _reorder_for_extension_array_stack( # idx is an indexer like # [c0r0, c1r0, c2r0, ..., # c0r1, c1r1, c2r1, ...]
- idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel() + idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.reshape(-1) return arr.take(idx) def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: if frame.columns.nunique() != len(frame.columns): raise ValueError("Columns with duplicate values are not supported in stack") - - # If we need to drop `level` from columns, it needs to be in descending order set_levels = set(level) - drop_levnums = sorted(level, reverse=True) stack_cols = frame.columns._drop_level_numbers( [k for k in range(frame.columns.nlevels - 1, -1, -1) if k not in set_levels] ) + + result = stack_reshape(frame, level, set_levels, stack_cols) + + # Construct the correct MultiIndex by combining the frame's index and + # stacked columns. + ratio = 0 if frame.empty else len(result) // len(frame) + + index_levels: list | FrozenList + if isinstance(frame.index, MultiIndex): + index_levels = frame.index.levels + index_codes = list(np.tile(frame.index.codes, (1, ratio))) + else: + codes, uniques = factorize(frame.index, use_na_sentinel=False) + index_levels = [uniques] + index_codes = list(np.tile(codes, (1, ratio))) + if len(level) > 1: # Arrange columns in the order we want to take them, e.g. level=[2, 0, 1] sorter = np.argsort(level) @@ -939,13 +958,72 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: ordered_stack_cols = stack_cols._reorder_ilevels(sorter) else: ordered_stack_cols = stack_cols - - stack_cols_unique = stack_cols.unique() ordered_stack_cols_unique = ordered_stack_cols.unique() + if isinstance(ordered_stack_cols, MultiIndex): + column_levels = ordered_stack_cols.levels + column_codes = ordered_stack_cols.drop_duplicates().codes + else: + column_levels = [ordered_stack_cols_unique] + column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]] + + # error: Incompatible types in assignment (expression has type "list[ndarray[Any, + # dtype[Any]]]", variable has type "FrozenList") + column_codes = [np.repeat(codes, len(frame)) for codes in column_codes] # type: ignore[assignment] + result.index = MultiIndex( + levels=index_levels + column_levels, + codes=index_codes + column_codes, + names=frame.index.names + list(ordered_stack_cols.names), + verify_integrity=False, + ) + + # sort result, but faster than calling sort_index since we know the order we need + len_df = len(frame) + n_uniques = len(ordered_stack_cols_unique) + indexer = np.arange(n_uniques) + idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques) + result = result.take(idxs) + + # Reshape/rename if needed and dropna + if result.ndim == 2 and frame.columns.nlevels == len(level): + if len(result.columns) == 0: + result = Series(index=result.index) + else: + result = result.iloc[:, 0] + if result.ndim == 1: + result.name = None + + return result + + +def stack_reshape( + frame: DataFrame, level: list[int], set_levels: set[int], stack_cols: Index +) -> Series | DataFrame: + """Reshape the data of a frame for stack. + + This function takes care of most of the work that stack needs to do. Caller + will sort the result once the appropriate index is set. + + Parameters + ---------- + frame : DataFrame + DataFrame that is to be stacked. + level : list of ints + Levels of the columns to stack. + set_levels : set of ints + Same as level, but as a set. + stack_cols : Index + Columns of the result when the DataFrame is stacked. + + Returns + ------- + The data behind the stacked DataFrame.
+ """ + # If we need to drop `level` from columns, it needs to be in descending order + drop_levnums = sorted(level, reverse=True) # Grab data for each unique index to be stacked buf = [] - for idx in stack_cols_unique: + for idx in stack_cols.unique(): if len(frame.columns) == 1: data = frame.copy() else: @@ -969,13 +1047,11 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: if data.ndim == 1: data.name = 0 else: - data.columns = RangeIndex(len(data.columns)) + data.columns = default_index(len(data.columns)) buf.append(data) - result: Series | DataFrame if len(buf) > 0 and not frame.empty: result = concat(buf, ignore_index=True) - ratio = len(result) // len(frame) else: # input is empty if len(level) < frame.columns.nlevels: @@ -984,7 +1060,6 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: else: new_columns = [0] result = DataFrame(columns=new_columns, dtype=frame._values.dtype) - ratio = 0 if len(level) < frame.columns.nlevels: # concat column order may be different from dropping the levels @@ -992,46 +1067,4 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: if not result.columns.equals(desired_columns): result = result[desired_columns] - # Construct the correct MultiIndex by combining the frame's index and - # stacked columns. - index_levels: list | FrozenList - if isinstance(frame.index, MultiIndex): - index_levels = frame.index.levels - index_codes = list(np.tile(frame.index.codes, (1, ratio))) - else: - codes, uniques = factorize(frame.index, use_na_sentinel=False) - index_levels = [uniques] - index_codes = list(np.tile(codes, (1, ratio))) - if isinstance(ordered_stack_cols, MultiIndex): - column_levels = ordered_stack_cols.levels - column_codes = ordered_stack_cols.drop_duplicates().codes - else: - column_levels = [ordered_stack_cols.unique()] - column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]] - # error: Incompatible types in assignment (expression has type "list[ndarray[Any, - # dtype[Any]]]", variable has type "FrozenList") - column_codes = [np.repeat(codes, len(frame)) for codes in column_codes] # type: ignore[assignment] - result.index = MultiIndex( - levels=index_levels + column_levels, - codes=index_codes + column_codes, - names=frame.index.names + list(ordered_stack_cols.names), - verify_integrity=False, - ) - - # sort result, but faster than calling sort_index since we know the order we need - len_df = len(frame) - n_uniques = len(ordered_stack_cols_unique) - indexer = np.arange(n_uniques) - idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques) - result = result.take(idxs) - - # Reshape/rename if needed and dropna - if result.ndim == 2 and frame.columns.nlevels == len(level): - if len(result.columns) == 0: - result = Series(index=result.index) - else: - result = result.iloc[:, 0] - if result.ndim == 1: - result.name = None - return result diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 1499afbde56d3..d780433386395 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -289,6 +289,7 @@ def qcut( Parameters ---------- x : 1d ndarray or Series + Input Numpy array or pandas Series object to be discretized. q : int or list-like of float Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles. @@ -313,6 +314,11 @@ def qcut( bins : ndarray of floats Returned only if `retbins` is True. 
+ See Also + -------- + cut : Bin values into discrete intervals. + Series.quantile : Return value at the given quantile. + Notes ----- Out of bounds values will be NA in the resulting Categorical object diff --git a/pandas/core/series.py b/pandas/core/series.py index 8a26d52bb5df1..3d1bd8ebb03cb 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -101,7 +101,7 @@ ops, roperator, ) -from pandas.core.accessor import CachedAccessor +from pandas.core.accessor import Accessor from pandas.core.apply import SeriesApply from pandas.core.arrays import ExtensionArray from pandas.core.arrays.arrow import ( @@ -246,7 +246,7 @@ class Series(base.IndexOpsMixin, NDFrame): # type: ignore[misc] ---------- data : array-like, Iterable, dict, or scalar value Contains data stored in Series. If data is a dict, argument order is - maintained. + maintained. Unordered sets are not supported. index : array-like or Index (1d) Values must be hashable and have the same length as `data`. Non-unique index values are allowed. Will default to @@ -262,6 +262,11 @@ class Series(base.IndexOpsMixin, NDFrame): # type: ignore[misc] copy : bool, default False Copy input data. Only affects Series or 1d ndarray input. See examples. + See Also + -------- + DataFrame : Two-dimensional, size-mutable, potentially heterogeneous tabular data. + Index : Immutable sequence used for indexing and alignment. + Notes ----- Please reference the :ref:`User Guide <basics.series>` for more information. @@ -289,7 +294,7 @@ class Series(base.IndexOpsMixin, NDFrame): # type: ignore[misc] z NaN dtype: float64 - Note that the Index is first build with the keys from the dictionary. + Note that the Index is first built with the keys from the dictionary. After this the Series is reindexed with the given Index values, hence we get all NaN as a result. @@ -384,10 +389,6 @@ def __init__( self.name = name return - is_pandas_object = isinstance(data, (Series, Index, ExtensionArray)) - data_dtype = getattr(data, "dtype", None) - original_dtype = dtype - if isinstance(data, (ExtensionArray, np.ndarray)): if copy is not False: if dtype is None or astype_is_view(data.dtype, pandas_dtype(dtype)): @@ -433,7 +434,6 @@ def __init__( data = data.astype(dtype) refs = data._references - data = data._values copy = False elif isinstance(data, np.ndarray): @@ -507,17 +507,6 @@ def __init__( self.name = name self._set_axis(0, index) - if original_dtype is None and is_pandas_object and data_dtype == np.object_: - if self.dtype != data_dtype: - warnings.warn( - "Dtype inference on a pandas object " - "(Series, Index, ExtensionArray) is deprecated. The Series " - "constructor will keep the original dtype in the future. " - "Call `infer_objects` on the result to get the old behavior.", - FutureWarning, - stacklevel=find_stack_level(), - ) - def _init_dict( self, data: Mapping, index: Index | None = None, dtype: DtypeObj | None = None ): @@ -621,6 +610,13 @@ def dtype(self) -> DtypeObj: """ Return the dtype object of the underlying data. + See Also + -------- + Series.dtypes : Return the dtype object of the underlying data. + Series.astype : Cast a pandas object to a specified dtype ``dtype``. + Series.convert_dtypes : Convert columns to the best possible dtypes using dtypes + supporting pd.NA. + Examples -------- >>> s = pd.Series([1, 2, 3]) @@ -1479,6 +1475,13 @@ def to_string( str or None String representation of Series if ``buf=None``, otherwise None. + See Also + -------- + Series.to_dict : Convert Series to dict object. + Series.to_frame : Convert Series to DataFrame object.
+ Series.to_markdown : Print Series in Markdown-friendly format. + Series.to_timestamp : Cast to DatetimeIndex of Timestamps. + Examples -------- >>> ser = pd.Series([1, 2, 3]).to_string() @@ -1945,6 +1948,16 @@ def mode(self, dropna: bool = True) -> Series: Series Modes of the Series in sorted order. + See Also + -------- + DataFrame.mode : Get the mode(s) of each element along the selected axis. + Series.sum : Sum of the values. + Series.median : Median of the values. + Series.std : Standard deviation of the values. + Series.var : Variance of the values. + Series.min : Minimum value. + Series.max : Maximum value. + Examples -------- >>> s = pd.Series([2, 4, 2, 2, 4, None]) @@ -2031,14 +2044,14 @@ def unique(self) -> ArrayLike: >>> pd.Series([pd.Timestamp("2016-01-01") for _ in range(3)]).unique() ['2016-01-01 00:00:00'] - Length: 1, dtype: datetime64[ns] + Length: 1, dtype: datetime64[s] >>> pd.Series( ... [pd.Timestamp("2016-01-01", tz="US/Eastern") for _ in range(3)] ... ).unique() ['2016-01-01 00:00:00-05:00'] - Length: 1, dtype: datetime64[ns, US/Eastern] + Length: 1, dtype: datetime64[s, US/Eastern] An Categorical will return categories in the order of appearance and with the same dtype. @@ -3146,6 +3159,7 @@ def combine_first(self, other) -> Series: other = other.reindex(keep_other) if this.dtype.kind == "M" and other.dtype.kind != "M": + # TODO: try to match resos? other = to_datetime(other) combined = concat([this, other]) combined = combined.reindex(new_index) @@ -3946,26 +3960,44 @@ def nsmallest( """ return selectn.SelectNSeries(self, n=n, keep=keep).nsmallest() - @doc( - klass=_shared_doc_kwargs["klass"], - extra_params=dedent( - """copy : bool, default True - Whether to copy underlying data. + def swaplevel( + self, i: Level = -2, j: Level = -1, copy: bool | lib.NoDefault = lib.no_default + ) -> Series: + """ + Swap levels i and j in a :class:`MultiIndex`. - .. note:: - The `copy` keyword will change behavior in pandas 3.0. - `Copy-on-Write - <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__ - will be enabled by default, which means that all methods with a - `copy` keyword will use a lazy copy mechanism to defer the copy and - ignore the `copy` keyword. The `copy` keyword will be removed in a - future version of pandas. + Default is to swap the two innermost levels of the index. + + Parameters + ---------- + i, j : int or str + Levels of the indices to be swapped. Can pass level name as string. + copy : bool, default True + Whether to copy underlying data. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy + and ignore the `copy` keyword. The `copy` keyword will be + removed in a future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + + Returns + ------- + Series + Series with levels swapped in MultiIndex. + + See Also + -------- + DataFrame.swaplevel : Swap levels i and j in a :class:`DataFrame`. + Series.reorder_levels : Rearrange index levels using input order. + MultiIndex.swaplevel : Swap levels i and j in a :class:`MultiIndex`.
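The inlined `swaplevel` docstring above can be exercised with a small MultiIndex; a minimal sketch (data invented for illustration, not from the patch):

```python
import pandas as pd

idx = pd.MultiIndex.from_tuples(
    [("Final exam", "History"), ("Final exam", "Geography"), ("Coursework", "History")],
    names=["Exam", "Subject"],
)
s = pd.Series(["A", "B", "A"], index=idx)

# The defaults (i=-2, j=-1) swap the two innermost levels
print(s.swaplevel().index.names)  # ['Subject', 'Exam']

# Levels can also be addressed by name instead of position
print(s.swaplevel("Exam", "Subject").index.names)
```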
- You can already get the future behavior and improvements through - enabling copy on write ``pd.options.mode.copy_on_write = True``""" - ), - examples=dedent( - """\ Examples -------- >>> s = pd.Series( @@ -4015,29 +4047,7 @@ def nsmallest( Geography Final exam February B History Coursework March A Geography Coursework April C - dtype: object""" - ), - ) - def swaplevel( - self, i: Level = -2, j: Level = -1, copy: bool | lib.NoDefault = lib.no_default - ) -> Series: - """ - Swap levels i and j in a :class:`MultiIndex`. - - Default is to swap the two innermost levels of the index. - - Parameters - ---------- - i, j : int or str - Levels of the indices to be swapped. Can pass level name as string. - {extra_params} - - Returns - ------- - {klass} - {klass} with levels swapped in MultiIndex. - - {examples} + dtype: object """ self._check_copy_deprecation(copy) assert isinstance(self.index, MultiIndex) @@ -4955,7 +4965,7 @@ def drop( C 2 dtype: int64 - Drop labels B en C + Drop labels B and C >>> s.drop(labels=["B", "C"]) A 0 @@ -5302,6 +5312,7 @@ def case_when( Returns ------- Series + A new Series with values replaced based on the provided conditions. See Also -------- @@ -5731,13 +5742,13 @@ def to_period( # ---------------------------------------------------------------------- # Accessor Methods # ---------------------------------------------------------------------- - str = CachedAccessor("str", StringMethods) - dt = CachedAccessor("dt", CombinedDatetimelikeProperties) - cat = CachedAccessor("cat", CategoricalAccessor) - plot = CachedAccessor("plot", pandas.plotting.PlotAccessor) - sparse = CachedAccessor("sparse", SparseAccessor) - struct = CachedAccessor("struct", StructAccessor) - list = CachedAccessor("list", ListAccessor) + str = Accessor("str", StringMethods) + dt = Accessor("dt", CombinedDatetimelikeProperties) + cat = Accessor("cat", CategoricalAccessor) + plot = Accessor("plot", pandas.plotting.PlotAccessor) + sparse = Accessor("sparse", SparseAccessor) + struct = Accessor("struct", StructAccessor) + list = Accessor("list", ListAccessor) # ---------------------------------------------------------------------- # Add plotting methods to Series @@ -5893,7 +5904,6 @@ def _flex_method(self, other, op, *, level=None, fill_value=None, axis: Axis = 0 return op(self, other) - @Appender(ops.make_flex_doc("eq", "series")) def eq( self, other, @@ -5901,6 +5911,63 @@ def eq( fill_value: float | None = None, axis: Axis = 0, ) -> Series: + """ + Return Equal to of series and other, element-wise (binary operator `eq`). + + Equivalent to ``series == other``, but with support to substitute a fill_value + for missing data in either one of the inputs. + + Parameters + ---------- + other : Series or scalar value + The second operand in this operation. + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. + fill_value : None or float value, default None (NaN) + Fill existing missing (NaN) values, and any new element needed for + successful Series alignment, with this value before computation. + If data in both corresponding Series locations is missing + the result of filling (at that location) will be missing. + axis : {0 or 'index'} + Unused. Parameter needed for compatibility with DataFrame. + + Returns + ------- + Series + The result of the operation. + + See Also + -------- + Series.ge : Return elementwise Greater than or equal to of series and other. + Series.le : Return elementwise Less than or equal to of series and other. 
+ Series.gt : Return elementwise Greater than of series and other. + Series.lt : Return elementwise Less than of series and other. + + Examples + -------- + >>> a = pd.Series([1, 1, 1, np.nan], index=["a", "b", "c", "d"]) + >>> a + a 1.0 + b 1.0 + c 1.0 + d NaN + dtype: float64 + >>> b = pd.Series([1, np.nan, 1, np.nan], index=["a", "b", "d", "e"]) + >>> b + a 1.0 + b NaN + d 1.0 + e NaN + dtype: float64 + >>> a.eq(b, fill_value=0) + a True + b False + c False + d False + e False + dtype: bool + """ return self._flex_method( other, operator.eq, level=level, fill_value=fill_value, axis=axis ) @@ -5911,8 +5978,68 @@ def ne(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: other, operator.ne, level=level, fill_value=fill_value, axis=axis ) - @Appender(ops.make_flex_doc("le", "series")) def le(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + """ + Return Less than or equal to of series and other, \ + element-wise (binary operator `le`). + + Equivalent to ``series <= other``, but with support to substitute a + fill_value for missing data in either one of the inputs. + + Parameters + ---------- + other : Series or scalar value + The second operand in this operation. + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. + fill_value : None or float value, default None (NaN) + Fill existing missing (NaN) values, and any new element needed for + successful Series alignment, with this value before computation. + If data in both corresponding Series locations is missing + the result of filling (at that location) will be missing. + axis : {0 or 'index'} + Unused. Parameter needed for compatibility with DataFrame. + + Returns + ------- + Series + The result of the operation. + + See Also + -------- + Series.ge : Return elementwise Greater than or equal to of series and other. + Series.lt : Return elementwise Less than of series and other. + Series.gt : Return elementwise Greater than of series and other. + Series.eq : Return elementwise equal to of series and other. + + Examples + -------- + >>> a = pd.Series([1, 1, 1, np.nan, 1], index=['a', 'b', 'c', 'd', 'e']) + >>> a + a 1.0 + b 1.0 + c 1.0 + d NaN + e 1.0 + dtype: float64 + >>> b = pd.Series([0, 1, 2, np.nan, 1], index=['a', 'b', 'c', 'd', 'f']) + >>> b + a 0.0 + b 1.0 + c 2.0 + d NaN + f 1.0 + dtype: float64 + >>> a.le(b, fill_value=0) + a False + b True + c True + d False + e False + f True + dtype: bool + """ return self._flex_method( other, operator.le, level=level, fill_value=fill_value, axis=axis ) @@ -5935,8 +6062,64 @@ def gt(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: other, operator.gt, level=level, fill_value=fill_value, axis=axis ) - @Appender(ops.make_flex_doc("add", "series")) def add(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + """ + Return Addition of series and other, element-wise (binary operator `add`). + + Equivalent to ``series + other``, but with support to substitute a fill_value + for missing data in either one of the inputs. + + Parameters + ---------- + other : Series or scalar value + With which to compute the addition. + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. + fill_value : None or float value, default None (NaN) + Fill existing missing (NaN) values, and any new element needed for + successful Series alignment, with this value before computation. 
+ If data in both corresponding Series locations is missing + the result of filling (at that location) will be missing. + axis : {0 or 'index'} + Unused. Parameter needed for compatibility with DataFrame. + + Returns + ------- + Series + The result of the operation. + + See Also + -------- + Series.radd : Reverse of the Addition operator, see + `Python documentation + `_ + for more details. + + Examples + -------- + >>> a = pd.Series([1, 1, 1, np.nan], index=["a", "b", "c", "d"]) + >>> a + a 1.0 + b 1.0 + c 1.0 + d NaN + dtype: float64 + >>> b = pd.Series([1, np.nan, 1, np.nan], index=["a", "b", "d", "e"]) + >>> b + a 1.0 + b NaN + d 1.0 + e NaN + dtype: float64 + >>> a.add(b, fill_value=0) + a 2.0 + b 1.0 + c 1.0 + d 1.0 + e NaN + dtype: float64 + """ return self._flex_method( other, operator.add, level=level, fill_value=fill_value, axis=axis ) @@ -5961,7 +6144,6 @@ def rsub(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: other, roperator.rsub, level=level, fill_value=fill_value, axis=axis ) - @Appender(ops.make_flex_doc("mul", "series")) def mul( self, other, @@ -5969,6 +6151,69 @@ def mul( fill_value: float | None = None, axis: Axis = 0, ) -> Series: + """ + Return Multiplication of series and other, element-wise (binary operator `mul`). + + Equivalent to ``series * other``, but with support to substitute + a fill_value for missing data in either one of the inputs. + + Parameters + ---------- + other : Series or scalar value + With which to compute the multiplication. + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. + fill_value : None or float value, default None (NaN) + Fill existing missing (NaN) values, and any new element needed for + successful Series alignment, with this value before computation. + If data in both corresponding Series locations is missing + the result of filling (at that location) will be missing. + axis : {0 or 'index'} + Unused. Parameter needed for compatibility with DataFrame. + + Returns + ------- + Series + The result of the operation. + + See Also + -------- + Series.rmul : Reverse of the Multiplication operator, see + `Python documentation + `_ + for more details. + + Examples + -------- + >>> a = pd.Series([1, 1, 1, np.nan], index=["a", "b", "c", "d"]) + >>> a + a 1.0 + b 1.0 + c 1.0 + d NaN + dtype: float64 + >>> b = pd.Series([1, np.nan, 1, np.nan], index=["a", "b", "d", "e"]) + >>> b + a 1.0 + b NaN + d 1.0 + e NaN + dtype: float64 + >>> a.multiply(b, fill_value=0) + a 1.0 + b 0.0 + c 0.0 + d 0.0 + e NaN + dtype: float64 + >>> a.mul(5, fill_value=0) + a 5.0 + b 5.0 + c 5.0 + d 0.0 + dtype: float64 + """ return self._flex_method( other, operator.mul, level=level, fill_value=fill_value, axis=axis ) @@ -5981,8 +6226,65 @@ def rmul(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: other, roperator.rmul, level=level, fill_value=fill_value, axis=axis ) - @Appender(ops.make_flex_doc("truediv", "series")) def truediv(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + """ + Return Floating division of series and other, \ + element-wise (binary operator `truediv`). + + Equivalent to ``series / other``, but with support to substitute a + fill_value for missing data in either one of the inputs. + + Parameters + ---------- + other : Series or scalar value + Series with which to compute division. + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. 
+ fill_value : None or float value, default None (NaN) + Fill existing missing (NaN) values, and any new element needed for + successful Series alignment, with this value before computation. + If data in both corresponding Series locations is missing + the result of filling (at that location) will be missing. + axis : {0 or 'index'} + Unused. Parameter needed for compatibility with DataFrame. + + Returns + ------- + Series + The result of the operation. + + See Also + -------- + Series.rtruediv : Reverse of the Floating division operator, see + `Python documentation + `_ + for more details. + + Examples + -------- + >>> a = pd.Series([1, 1, 1, np.nan], index=["a", "b", "c", "d"]) + >>> a + a 1.0 + b 1.0 + c 1.0 + d NaN + dtype: float64 + >>> b = pd.Series([1, np.nan, 1, np.nan], index=["a", "b", "d", "e"]) + >>> b + a 1.0 + b NaN + d 1.0 + e NaN + dtype: float64 + >>> a.divide(b, fill_value=0) + a 1.0 + b inf + c inf + d 0.0 + e NaN + dtype: float64 + """ return self._flex_method( other, operator.truediv, level=level, fill_value=fill_value, axis=axis ) @@ -6010,8 +6312,64 @@ def rfloordiv(self, other, level=None, fill_value=None, axis: Axis = 0) -> Serie other, roperator.rfloordiv, level=level, fill_value=fill_value, axis=axis ) - @Appender(ops.make_flex_doc("mod", "series")) def mod(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + """ + Return Modulo of series and other, element-wise (binary operator `mod`). + + Equivalent to ``series % other``, but with support to substitute a + fill_value for missing data in either one of the inputs. + + Parameters + ---------- + other : Series or scalar value + Series with which to compute modulo. + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. + fill_value : None or float value, default None (NaN) + Fill existing missing (NaN) values, and any new element needed for + successful Series alignment, with this value before computation. + If data in both corresponding Series locations is missing + the result of filling (at that location) will be missing. + axis : {0 or 'index'} + Unused. Parameter needed for compatibility with DataFrame. + + Returns + ------- + Series + The result of the operation. + + See Also + -------- + Series.rmod : Reverse of the Modulo operator, see + `Python documentation + `_ + for more details. + + Examples + -------- + >>> a = pd.Series([1, 1, 1, np.nan], index=["a", "b", "c", "d"]) + >>> a + a 1.0 + b 1.0 + c 1.0 + d NaN + dtype: float64 + >>> b = pd.Series([1, np.nan, 1, np.nan], index=["a", "b", "d", "e"]) + >>> b + a 1.0 + b NaN + d 1.0 + e NaN + dtype: float64 + >>> a.mod(b, fill_value=0) + a 0.0 + b NaN + c NaN + d 0.0 + e NaN + dtype: float64 + """ return self._flex_method( other, operator.mod, level=level, fill_value=fill_value, axis=axis ) @@ -6132,7 +6490,6 @@ def all( ) @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="min") - @doc(make_doc("min", ndim=1)) def min( self, axis: Axis | None = 0, @@ -6140,12 +6497,70 @@ def min( numeric_only: bool = False, **kwargs, ): + """ + Return the minimum of the values over the requested axis. + + If you want the *index* of the minimum, use ``idxmin``. + This is the equivalent of the ``numpy.ndarray`` method ``argmin``. + + Parameters + ---------- + axis : {index (0)} + Axis for the function to be applied on. + For `Series` this parameter is unused and defaults to 0. + + For DataFrames, specifying ``axis=None`` will apply the aggregation + across both axes. + + .. 
versionadded:: 2.0.0 + + skipna : bool, default True + Exclude NA/null values when computing the result. + numeric_only : bool, default False + Include only float, int, boolean columns. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + scalar or Series (if level specified) + The minimum of the values in the Series. + + See Also + -------- + numpy.min : Equivalent numpy function for arrays. + Series.min : Return the minimum. + Series.max : Return the maximum. + Series.idxmin : Return the index of the minimum. + Series.idxmax : Return the index of the maximum. + DataFrame.min : Return the minimum over the requested axis. + DataFrame.max : Return the maximum over the requested axis. + DataFrame.idxmin : Return the index of the minimum over the requested axis. + DataFrame.idxmax : Return the index of the maximum over the requested axis. + + Examples + -------- + >>> idx = pd.MultiIndex.from_arrays( + ... [["warm", "warm", "cold", "cold"], ["dog", "falcon", "fish", "spider"]], + ... names=["blooded", "animal"], + ... ) + >>> s = pd.Series([4, 2, 0, 8], name="legs", index=idx) + >>> s + blooded animal + warm dog 4 + falcon 2 + cold fish 0 + spider 8 + Name: legs, dtype: int64 + + >>> s.min() + 0 + """ return NDFrame.min( self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="max") - @doc(make_doc("max", ndim=1)) def max( self, axis: Axis | None = 0, @@ -6153,12 +6568,70 @@ def max( numeric_only: bool = False, **kwargs, ): + """ + Return the maximum of the values over the requested axis. + + If you want the *index* of the maximum, use ``idxmax``. + This is the equivalent of the ``numpy.ndarray`` method ``argmax``. + + Parameters + ---------- + axis : {index (0)} + Axis for the function to be applied on. + For `Series` this parameter is unused and defaults to 0. + + For DataFrames, specifying ``axis=None`` will apply the aggregation + across both axes. + + .. versionadded:: 2.0.0 + + skipna : bool, default True + Exclude NA/null values when computing the result. + numeric_only : bool, default False + Include only float, int, boolean columns. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + scalar or Series (if level specified) + The maximum of the values in the Series. + + See Also + -------- + numpy.max : Equivalent numpy function for arrays. + Series.min : Return the minimum. + Series.max : Return the maximum. + Series.idxmin : Return the index of the minimum. + Series.idxmax : Return the index of the maximum. + DataFrame.min : Return the minimum over the requested axis. + DataFrame.max : Return the maximum over the requested axis. + DataFrame.idxmin : Return the index of the minimum over the requested axis. + DataFrame.idxmax : Return the index of the maximum over the requested axis. + + Examples + -------- + >>> idx = pd.MultiIndex.from_arrays( + ... [["warm", "warm", "cold", "cold"], ["dog", "falcon", "fish", "spider"]], + ... names=["blooded", "animal"], + ... 
) + >>> s = pd.Series([4, 2, 0, 8], name="legs", index=idx) + >>> s + blooded animal + warm dog 4 + falcon 2 + cold fish 0 + spider 8 + Name: legs, dtype: int64 + + >>> s.max() + 8 + """ return NDFrame.max( self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="sum") - @doc(make_doc("sum", ndim=1)) def sum( self, axis: Axis | None = None, @@ -6167,6 +6640,89 @@ def sum( min_count: int = 0, **kwargs, ): + """ + Return the sum of the values over the requested axis. + + This is equivalent to the method ``numpy.sum``. + + Parameters + ---------- + axis : {index (0)} + Axis for the function to be applied on. + For `Series` this parameter is unused and defaults to 0. + + .. warning:: + + The behavior of DataFrame.sum with ``axis=None`` is deprecated, + in a future version this will reduce over both axes and return a scalar. + To retain the old behavior, pass axis=0 (or do not pass axis). + + .. versionadded:: 2.0.0 + + skipna : bool, default True + Exclude NA/null values when computing the result. + numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. + + min_count : int, default 0 + The required number of valid values to perform the operation. If fewer than + ``min_count`` non-NA values are present the result will be NA. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + scalar or Series (if level specified) + Sum of the values for the requested axis. + + See Also + -------- + numpy.sum : Equivalent numpy function for computing sum. + Series.mean : Mean of the values. + Series.median : Median of the values. + Series.std : Standard deviation of the values. + Series.var : Variance of the values. + Series.min : Minimum value. + Series.max : Maximum value. + + Examples + -------- + >>> idx = pd.MultiIndex.from_arrays( + ... [["warm", "warm", "cold", "cold"], ["dog", "falcon", "fish", "spider"]], + ... names=["blooded", "animal"], + ... ) + >>> s = pd.Series([4, 2, 0, 8], name="legs", index=idx) + >>> s + blooded animal + warm dog 4 + falcon 2 + cold fish 0 + spider 8 + Name: legs, dtype: int64 + + >>> s.sum() + 14 + + By default, the sum of an empty or all-NA Series is ``0``. + + >>> pd.Series([], dtype="float64").sum() # min_count=0 is the default + 0.0 + + This can be controlled with the ``min_count`` parameter. For example, if + you'd like the sum of an empty series to be NaN, pass ``min_count=1``. + + >>> pd.Series([], dtype="float64").sum(min_count=1) + nan + + Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and + empty series identically. + + >>> pd.Series([np.nan]).sum() + 0.0 + + >>> pd.Series([np.nan]).sum(min_count=1) + nan + """ return NDFrame.sum( self, axis=axis, @@ -6196,7 +6752,6 @@ def prod( ) @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="mean") - @doc(make_doc("mean", ndim=1)) def mean( self, axis: Axis | None = 0, @@ -6204,12 +6759,53 @@ def mean( numeric_only: bool = False, **kwargs, ) -> Any: + """ + Return the mean of the values over the requested axis. + + Parameters + ---------- + axis : {index (0)} + Axis for the function to be applied on. + For `Series` this parameter is unused and defaults to 0. + + For DataFrames, specifying ``axis=None`` will apply the aggregation + across both axes. + + .. versionadded:: 2.0.0 + + skipna : bool, default True + Exclude NA/null values when computing the result.
+ numeric_only : bool, default False + Include only float, int, boolean columns. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + scalar or Series (if level specified) + Mean of the values for the requested axis. + + See Also + -------- + numpy.mean : Equivalent numpy function for computing the mean. + Series.sum : Sum of the values. + Series.median : Median of the values. + Series.std : Standard deviation of the values. + Series.var : Variance of the values. + Series.min : Minimum value. + Series.max : Maximum value. + + Examples + -------- + >>> s = pd.Series([1, 2, 3]) + >>> s.mean() + 2.0 + """ return NDFrame.mean( self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="median") - @doc(make_doc("median", ndim=1)) def median( self, axis: Axis | None = 0, @@ -6217,6 +6813,75 @@ def median( numeric_only: bool = False, **kwargs, ) -> Any: + """ + Return the median of the values over the requested axis. + + Parameters + ---------- + axis : {index (0)} + Axis for the function to be applied on. + For `Series` this parameter is unused and defaults to 0. + + For DataFrames, specifying ``axis=None`` will apply the aggregation + across both axes. + + .. versionadded:: 2.0.0 + + skipna : bool, default True + Exclude NA/null values when computing the result. + numeric_only : bool, default False + Include only float, int, boolean columns. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + scalar or Series (if level specified) + Median of the values for the requested axis. + + See Also + -------- + numpy.median : Equivalent numpy function for computing median. + Series.sum : Sum of the values. + Series.median : Median of the values. + Series.std : Standard deviation of the values. + Series.var : Variance of the values. + Series.min : Minimum value. + Series.max : Maximum value. + + Examples + -------- + >>> s = pd.Series([1, 2, 3]) + >>> s.median() + 2.0 + + With a DataFrame + + >>> df = pd.DataFrame({"a": [1, 2], "b": [2, 3]}, index=["tiger", "zebra"]) + >>> df + a b + tiger 1 2 + zebra 2 3 + >>> df.median() + a 1.5 + b 2.5 + dtype: float64 + + Using axis=1 + + >>> df.median(axis=1) + tiger 1.5 + zebra 2.5 + dtype: float64 + + In this case, `numeric_only` should be set to `True` + to avoid getting an error. + + >>> df = pd.DataFrame({"a": [1, 2], "b": ["T", "Z"]}, index=["tiger", "zebra"]) + >>> df.median(numeric_only=True) + a 1.5 + dtype: float64 + """ return NDFrame.median( self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) @@ -6241,7 +6906,6 @@ def sem( ) @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="var") - @doc(make_doc("var", ndim=1)) def var( self, axis: Axis | None = None, @@ -6250,6 +6914,75 @@ def var( numeric_only: bool = False, **kwargs, ): + """ + Return unbiased variance over requested axis. + + Normalized by N-1 by default. This can be changed using the ddof argument. + + Parameters + ---------- + axis : {index (0)} + For `Series` this parameter is unused and defaults to 0. + + .. warning:: + + The behavior of DataFrame.var with ``axis=None`` is deprecated, + in a future version this will reduce over both axes and return a scalar. + To retain the old behavior, pass axis=0 (or do not pass axis). + + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA.
+ ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. + numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. + **kwargs : + Additional keywords passed. + + Returns + ------- + scalar or Series (if level specified) + Unbiased variance over requested axis. + + See Also + -------- + numpy.var : Equivalent function in NumPy. + Series.std : Returns the standard deviation of the Series. + DataFrame.var : Returns the variance of the DataFrame. + DataFrame.std : Return standard deviation of the values over + the requested axis. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "person_id": [0, 1, 2, 3], + ... "age": [21, 25, 62, 43], + ... "height": [1.61, 1.87, 1.49, 2.01], + ... } + ... ).set_index("person_id") + >>> df + age height + person_id + 0 21 1.61 + 1 25 1.87 + 2 62 1.49 + 3 43 2.01 + + >>> df.var() + age 352.916667 + height 0.056367 + dtype: float64 + + Alternatively, ``ddof=0`` can be set to normalize by N instead of N-1: + + >>> df.var(ddof=0) + age 264.687500 + height 0.042275 + dtype: float64 + """ return NDFrame.var( self, axis=axis, @@ -6292,7 +7025,6 @@ def skew( ) @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="kurt") - @doc(make_doc("kurt", ndim=1)) def kurt( self, axis: Axis | None = 0, @@ -6300,6 +7032,54 @@ def kurt( numeric_only: bool = False, **kwargs, ): + """ + Return unbiased kurtosis over requested axis. + + Kurtosis obtained using Fisher's definition of + kurtosis (kurtosis of normal == 0.0). Normalized by N-1. + + Parameters + ---------- + axis : {index (0)} + Axis for the function to be applied on. + For `Series` this parameter is unused and defaults to 0. + + For DataFrames, specifying ``axis=None`` will apply the aggregation + across both axes. + + .. versionadded:: 2.0.0 + + skipna : bool, default True + Exclude NA/null values when computing the result. + numeric_only : bool, default False + Include only float, int, boolean columns. + + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + scalar + Unbiased kurtosis. + + See Also + -------- + Series.skew : Return unbiased skew over requested axis. + Series.var : Return unbiased variance over requested axis. + Series.std : Return unbiased standard deviation over requested axis. + + Examples + -------- + >>> s = pd.Series([1, 2, 2, 3], index=["cat", "dog", "dog", "mouse"]) + >>> s + cat 1 + dog 2 + dog 2 + mouse 3 + dtype: int64 + >>> s.kurt() + 1.5 + """ return NDFrame.kurt( self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index d274c1d7a5aff..7494a43caf004 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -162,6 +162,16 @@ class StringMethods(NoNewAttributesMixin): Patterned after Python's string methods, with some inspiration from R's stringr package. + Parameters + ---------- + data : Series or Index + The content of the Series or Index. + + See Also + -------- + Series.str : Vectorized string functions for Series. + Index.str : Vectorized string functions for Index. + Examples -------- >>> s = pd.Series(["A_Str_Series"]) @@ -2398,7 +2408,11 @@ def translate(self, table): """ Map all characters in the string through the given mapping table. - Equivalent to standard :meth:`str.translate`. 
+ This method is equivalent to the standard :meth:`str.translate` + method for strings. It maps each character in the string to a new + character according to the translation table provided. Unmapped + characters are left unchanged, while characters mapped to None + are removed. Parameters ---------- @@ -2411,6 +2425,14 @@ def translate(self, table): Returns ------- Series or Index + A new Series or Index with translated strings. + + See Also + -------- + Series.str.replace : Replace occurrences of pattern/regex in the + Series with some other string. + Index.str.replace : Replace occurrences of pattern/regex in the + Index with some other string. Examples -------- diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index b01cdb335ec46..c116ef015ae16 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -29,6 +29,7 @@ timezones as libtimezones, ) from pandas._libs.tslibs.conversion import cast_from_unit_vectorized +from pandas._libs.tslibs.dtypes import NpyDatetimeUnit from pandas._libs.tslibs.parsing import ( DateParseError, guess_datetime_format, @@ -524,6 +525,7 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: utc=utc, errors=errors, unit_for_numerics=unit, + creso=NpyDatetimeUnit.NPY_FR_ns.value, ) result = DatetimeIndex(arr, name=name) @@ -873,7 +875,7 @@ def to_datetime( >>> pd.to_datetime(df) 0 2015-02-04 1 2016-03-05 - dtype: datetime64[ns] + dtype: datetime64[s] Using a unix epoch time @@ -903,7 +905,7 @@ def to_datetime( Passing ``errors='coerce'`` will force an out-of-bounds date to :const:`NaT`, in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`. - >>> pd.to_datetime("13000101", format="%Y%m%d", errors="coerce") + >>> pd.to_datetime("invalid for Ymd", format="%Y%m%d", errors="coerce") NaT .. _to_datetime_tz_examples: @@ -916,14 +918,14 @@ def to_datetime( >>> pd.to_datetime(["2018-10-26 12:00:00", "2018-10-26 13:00:15"]) DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) - Timezone-aware inputs *with constant time offset* are converted to timezone-aware :class:`DatetimeIndex`: >>> pd.to_datetime(["2018-10-26 12:00 -0500", "2018-10-26 13:00 -0500"]) DatetimeIndex(['2018-10-26 12:00:00-05:00', '2018-10-26 13:00:00-05:00'], - dtype='datetime64[ns, UTC-05:00]', freq=None) + dtype='datetime64[s, UTC-05:00]', freq=None) - However, timezone-aware inputs *with mixed time offsets* (for example issued from a timezone with daylight savings, such as Europe/Paris) @@ -965,21 +967,21 @@ def to_datetime( >>> pd.to_datetime(["2018-10-26 12:00", "2018-10-26 13:00"], utc=True) DatetimeIndex(['2018-10-26 12:00:00+00:00', '2018-10-26 13:00:00+00:00'], - dtype='datetime64[ns, UTC]', freq=None) + dtype='datetime64[s, UTC]', freq=None) - Timezone-aware inputs are *converted* to UTC (the output represents the exact same datetime, but viewed from the UTC time offset `+00:00`). 
>>> pd.to_datetime(["2018-10-26 12:00 -0530", "2018-10-26 12:00 -0500"], utc=True) DatetimeIndex(['2018-10-26 17:30:00+00:00', '2018-10-26 17:00:00+00:00'], - dtype='datetime64[ns, UTC]', freq=None) + dtype='datetime64[s, UTC]', freq=None) - Inputs can contain both string or datetime, the above rules still apply >>> pd.to_datetime(["2018-10-26 12:00", datetime(2020, 1, 1, 18)], utc=True) DatetimeIndex(['2018-10-26 12:00:00+00:00', '2020-01-01 18:00:00+00:00'], - dtype='datetime64[ns, UTC]', freq=None) + dtype='datetime64[us, UTC]', freq=None) """ if exact is not lib.no_default and format in {"mixed", "ISO8601"}: raise ValueError("Cannot use 'exact' when 'format' is 'mixed' or 'ISO8601'") diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 3b9dd40a92ce8..e120e69dc27cf 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -244,6 +244,7 @@ def hash_array( Parameters ---------- vals : ndarray or ExtensionArray + The input array to hash. encoding : str, default 'utf8' Encoding for data & key when strings. hash_key : str, default _default_hash_key @@ -257,6 +258,11 @@ def hash_array( ndarray[np.uint64, ndim=1] Hashed values, same length as the vals. + See Also + -------- + util.hash_pandas_object : Return a data hash of the Index/Series/DataFrame. + util.hash_tuples : Hash a MultiIndex / listlike-of-tuples efficiently. + Examples -------- >>> pd.util.hash_array(np.array([1, 2, 3])) diff --git a/pandas/core/util/numba_.py b/pandas/core/util/numba_.py index a6079785e7475..d93984d210cb4 100644 --- a/pandas/core/util/numba_.py +++ b/pandas/core/util/numba_.py @@ -2,6 +2,7 @@ from __future__ import annotations +import inspect import types from typing import ( TYPE_CHECKING, @@ -54,10 +55,15 @@ def get_jit_arguments( engine_kwargs = {} nopython = engine_kwargs.get("nopython", True) - if kwargs and nopython: + if kwargs: + # Note: in case numba supports keyword-only arguments in + # a future version, we should remove this check. But this + # seems unlikely to happen soon. + raise NumbaUtilError( - "numba does not support kwargs with nopython=True: " - "https://github.com/numba/numba/issues/2916" + "numba does not support keyword-only arguments: " + "https://github.com/numba/numba/issues/2916, " + "https://github.com/numba/numba/issues/6846" ) nogil = engine_kwargs.get("nogil", False) parallel = engine_kwargs.get("parallel", False) @@ -97,3 +103,47 @@ def jit_user_function(func: Callable) -> Callable: numba_func = numba.extending.register_jitable(func) return numba_func + + +_sentinel = object() + + +def prepare_function_arguments( + func: Callable, args: tuple, kwargs: dict ) -> tuple[tuple, dict]: + """ + Prepare arguments for jitted function. As numba functions do not support kwargs, + we try to move kwargs into args if possible. + + Parameters + ---------- + func : function + user defined function + args : tuple + user input positional arguments + kwargs : dict + user input keyword arguments + + Returns + ------- + tuple[tuple, dict] + args, kwargs + + """ + if not kwargs: + return args, kwargs + + # the udf should have this pattern: def udf(value, *args, **kwargs):... + signature = inspect.signature(func) + arguments = signature.bind(_sentinel, *args, **kwargs) + arguments.apply_defaults() + # Ref: https://peps.python.org/pep-0362/ + # Arguments which could be passed as part of either *args or **kwargs + # will be included only in the BoundArguments.args attribute.
+ args = arguments.args + kwargs = arguments.kwargs + + assert args[0] is _sentinel + args = args[1:] + + return args, kwargs diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index abe853a8aa259..f14954cd9a4b0 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -8,10 +8,7 @@ Literal, ) -from pandas.util._decorators import ( - deprecate_kwarg, - doc, -) +from pandas.util._decorators import doc from pandas.core.indexers.objects import ( BaseIndexer, @@ -709,7 +706,6 @@ def kurt(self, numeric_only: bool = False): aggregation_description="quantile", agg_method="quantile", ) - @deprecate_kwarg(old_arg_name="quantile", new_arg_name="q") def quantile( self, q: float, diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index eb06479fc325e..824cf936b8185 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -227,10 +227,10 @@ def roll_table( stop = end[i] window = values[start:stop] count_nan = np.sum(np.isnan(window), axis=0) - sub_result = numba_func(window, *args) nan_mask = len(window) - count_nan >= minimum_periods + if nan_mask.any(): + result[i, :] = numba_func(window, *args) min_periods_mask[i, :] = nan_mask - result[i, :] = sub_result result = np.where(min_periods_mask, result, np.nan) return result diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index db6078ae636e3..2243d8dd1a613 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -27,10 +27,7 @@ import pandas._libs.window.aggregations as window_aggregations from pandas.compat._optional import import_optional_dependency from pandas.errors import DataError -from pandas.util._decorators import ( - deprecate_kwarg, - doc, -) +from pandas.util._decorators import doc from pandas.core.dtypes.common import ( ensure_float64, @@ -2556,7 +2553,6 @@ def kurt(self, numeric_only: bool = False): aggregation_description="quantile", agg_method="quantile", ) - @deprecate_kwarg(old_arg_name="quantile", new_arg_name="q") def quantile( self, q: float, diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index f01fe8ecef930..c8863e1b39c94 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -229,7 +229,6 @@ class ParserWarning(Warning): 1. `sep` other than a single character (e.g. regex separators) 2. `skipfooter` higher than 0 - 3. `sep=None` with `delim_whitespace=False` The warning can be avoided by adding `engine='python'` as a parameter in `pd.read_csv` and `pd.read_table` methods. diff --git a/pandas/errors/cow.py b/pandas/errors/cow.py index 1e7829c88ae7e..4815a0f5328d7 100644 --- a/pandas/errors/cow.py +++ b/pandas/errors/cow.py @@ -8,7 +8,7 @@ "the assignment in a single step.\n\n" "See the caveats in the documentation: " "https://pandas.pydata.org/pandas-docs/stable/user_guide/" - "indexing.html#returning-a-view-versus-a-copy" + "copy_on_write.html" ) diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index aa20ec237e968..5a0a8c321e629 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -113,9 +113,8 @@ def read_clipboard( if index_length != 0: kwargs.setdefault("index_col", list(range(index_length))) - # Edge case where sep is specified to be None, return to default - if sep is None and kwargs.get("delim_whitespace") is None: - sep = r"\s+" + elif not isinstance(sep, str): + raise ValueError(f"{sep=} must be a string") # Regex separator currently only works with python engine. 
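As a standalone sketch of the `inspect.signature(...).bind` folding that the `prepare_function_arguments` helper introduced earlier in this patch relies on (the udf and its values are invented for illustration):

```python
import inspect

def udf(value, power, offset=0):
    return value ** power + offset

_sentinel = object()

# Bind a placeholder for the data argument, then let bind() fold keyword
# arguments into positional slots wherever the signature allows it (PEP 362)
bound = inspect.signature(udf).bind(_sentinel, 2, offset=5)
bound.apply_defaults()

args = bound.args[1:]   # drop the sentinel, keeping (2, 5)
kwargs = bound.kwargs   # empty: everything was absorbed into args
print(args, kwargs)     # (2, 5) {}
```

With only positional-or-keyword parameters, every caller-supplied keyword ends up in `BoundArguments.args`, which is exactly what lets the jitted function be called without kwargs.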
# Default to python if separator is multi-character (regex) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 6063ac098a4dc..1eb22d4ee9de7 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -240,20 +240,6 @@ For non-standard datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``. Note: A fast-path exists for iso8601-formatted dates. -date_parser : function, optional - Function to use for converting a sequence of string columns to an array of - datetime instances. The default uses ``dateutil.parser.parser`` to do the - conversion. Pandas will try to call `date_parser` in three different ways, - advancing to the next if an exception occurs: 1) Pass one or more arrays - (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the - string values from the columns defined by `parse_dates` into a single array - and pass that; and 3) call `date_parser` once for each row using one or - more strings (corresponding to the columns defined by `parse_dates`) as - arguments. - - .. deprecated:: 2.0.0 - Use ``date_format`` instead, or read in as ``object`` and then apply - :func:`to_datetime` as-needed. date_format : str or dict of column -> format, default ``None`` If used in conjunction with ``parse_dates``, will parse dates according to this format. For anything more complex, @@ -398,7 +384,6 @@ def read_excel( na_filter: bool = ..., verbose: bool = ..., parse_dates: list | dict | bool = ..., - date_parser: Callable | lib.NoDefault = ..., date_format: dict[Hashable, str] | str | None = ..., thousands: str | None = ..., decimal: str = ..., @@ -436,7 +421,6 @@ def read_excel( na_filter: bool = ..., verbose: bool = ..., parse_dates: list | dict | bool = ..., - date_parser: Callable | lib.NoDefault = ..., date_format: dict[Hashable, str] | str | None = ..., thousands: str | None = ..., decimal: str = ..., @@ -474,7 +458,6 @@ def read_excel( na_filter: bool = True, verbose: bool = False, parse_dates: list | dict | bool = False, - date_parser: Callable | lib.NoDefault = lib.no_default, date_format: dict[Hashable, str] | str | None = None, thousands: str | None = None, decimal: str = ".", @@ -521,7 +504,6 @@ def read_excel( na_filter=na_filter, verbose=verbose, parse_dates=parse_dates, - date_parser=date_parser, date_format=date_format, thousands=thousands, decimal=decimal, @@ -726,7 +708,6 @@ def parse( na_values=None, verbose: bool = False, parse_dates: list | dict | bool = False, - date_parser: Callable | lib.NoDefault = lib.no_default, date_format: dict[Hashable, str] | str | None = None, thousands: str | None = None, decimal: str = ".", @@ -795,7 +776,6 @@ def parse( false_values=false_values, na_values=na_values, parse_dates=parse_dates, - date_parser=date_parser, date_format=date_format, thousands=thousands, decimal=decimal, @@ -829,7 +809,6 @@ def _parse_sheet( false_values: Iterable[Hashable] | None = None, na_values=None, parse_dates: list | dict | bool = False, - date_parser: Callable | lib.NoDefault = lib.no_default, date_format: dict[Hashable, str] | str | None = None, thousands: str | None = None, decimal: str = ".", @@ -878,24 +857,23 @@ def _parse_sheet( # a row containing just the index name(s) has_index_names = False if is_list_header and not is_len_one_list_header and index_col is not None: - index_col_list: Sequence[int] + index_col_set: set[int] if isinstance(index_col, int): - index_col_list = [index_col] + index_col_set = {index_col} else: assert isinstance(index_col, Sequence) - index_col_list = index_col + index_col_set 
= set(index_col) # We have to handle mi without names. If any of the entries in the data # columns are not empty, this is a regular row assert isinstance(header, Sequence) if len(header) < len(data): potential_index_names = data[len(header)] - potential_data = [ - x + has_index_names = all( + x == "" or x is None for i, x in enumerate(potential_index_names) - if not control_row[i] and i not in index_col_list - ] - has_index_names = all(x == "" or x is None for x in potential_data) + if not control_row[i] and i not in index_col_set + ) if is_list_like(index_col): # Forward fill values for MultiIndex index. @@ -942,7 +920,6 @@ def _parse_sheet( na_values=na_values, skip_blank_lines=False, # GH 39808 parse_dates=parse_dates, - date_parser=date_parser, date_format=date_format, thousands=thousands, decimal=decimal, @@ -1479,9 +1456,9 @@ def inspect_excel_format( with zipfile.ZipFile(stream) as zf: # Workaround for some third party files that use forward slashes and # lower case names. - component_names = [ + component_names = { name.replace("\\", "/").lower() for name in zf.namelist() - ] + } if "xl/workbook.xml" in component_names: return "xlsx" @@ -1648,7 +1625,6 @@ def parse( nrows: int | None = None, na_values=None, parse_dates: list | dict | bool = False, - date_parser: Callable | lib.NoDefault = lib.no_default, date_format: str | dict[Hashable, str] | None = None, thousands: str | None = None, comment: str | None = None, @@ -1737,20 +1713,6 @@ def parse( ``pd.to_datetime`` after ``pd.read_excel``. Note: A fast-path exists for iso8601-formatted dates. - date_parser : function, optional - Function to use for converting a sequence of string columns to an array of - datetime instances. The default uses ``dateutil.parser.parser`` to do the - conversion. Pandas will try to call `date_parser` in three different ways, - advancing to the next if an exception occurs: 1) Pass one or more arrays - (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the - string values from the columns defined by `parse_dates` into a single array - and pass that; and 3) call `date_parser` once for each row using one or - more strings (corresponding to the columns defined by `parse_dates`) as - arguments. - - .. deprecated:: 2.0.0 - Use ``date_format`` instead, or read in as ``object`` and then apply - :func:`to_datetime` as-needed. date_format : str or dict of column -> format, default ``None`` If used in conjunction with ``parse_dates``, will parse dates according to this format. 
For anything more complex, @@ -1810,7 +1772,6 @@ def parse( nrows=nrows, na_values=na_values, parse_dates=parse_dates, - date_parser=date_parser, date_format=date_format, thousands=thousands, comment=comment, diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 69b514da32857..f79417d11080d 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -122,29 +122,25 @@ def get_sheet_data( table: list[list[Scalar | NaTType]] = [] for sheet_row in sheet_rows: - sheet_cells = [ - x - for x in sheet_row.childNodes - if hasattr(x, "qname") and x.qname in cell_names - ] empty_cells = 0 table_row: list[Scalar | NaTType] = [] - for sheet_cell in sheet_cells: - if sheet_cell.qname == table_cell_name: - value = self._get_cell_value(sheet_cell) - else: - value = self.empty_value - - column_repeat = self._get_column_repeat(sheet_cell) - - # Queue up empty values, writing only if content succeeds them - if value == self.empty_value: - empty_cells += column_repeat - else: - table_row.extend([self.empty_value] * empty_cells) - empty_cells = 0 - table_row.extend([value] * column_repeat) + for sheet_cell in sheet_row.childNodes: + if hasattr(sheet_cell, "qname") and sheet_cell.qname in cell_names: + if sheet_cell.qname == table_cell_name: + value = self._get_cell_value(sheet_cell) + else: + value = self.empty_value + + column_repeat = self._get_column_repeat(sheet_cell) + + # Queue up empty values, writing only if content succeeds them + if value == self.empty_value: + empty_cells += column_repeat + else: + table_row.extend([self.empty_value] * empty_cells) + empty_cells = 0 + table_row.extend([value] * column_repeat) if max_row_len < len(table_row): max_row_len = len(table_row) diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index a444970792e6e..5d39a840336eb 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -128,16 +128,13 @@ def _parse_cell(cell_contents, cell_typ): cell_contents = val return cell_contents - data = [] - nrows = sheet.nrows if file_rows_needed is not None: nrows = min(nrows, file_rows_needed) - for i in range(nrows): - row = [ + return [ + [ _parse_cell(value, typ) for value, typ in zip(sheet.row_values(i), sheet.row_types(i)) ] - data.append(row) - - return data + for i in range(nrows) + ] diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index b42dbaa579ee7..16d4e1f9ea25d 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -6,6 +6,7 @@ TYPE_CHECKING, Any, ) +import warnings from pandas._config import using_pyarrow_string_dtype @@ -107,6 +108,14 @@ def read_feather( type of object stored in file DataFrame object stored in the file. + See Also + -------- + read_csv : Read a comma-separated values (csv) file into a pandas DataFrame. + read_excel : Read an Excel file into a pandas DataFrame. + read_spss : Read an SPSS file into a pandas DataFrame. + read_orc : Load an ORC object into a pandas DataFrame. + read_sas : Read SAS file into a pandas DataFrame. 
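The reworked ODF loop above queues runs of empty cells and only flushes them once a non-empty value follows, so trailing empties never pad the row. A distilled sketch of that repeat-and-queue logic (the cell tuples are invented for illustration; `None` stands in for `self.empty_value`):

```python
# Each cell is (value, repeat_count), mimicking ODF's column-repeat attribute
cells = [("a", 1), (None, 3), ("b", 2), (None, 4)]

table_row = []
empty_cells = 0
for value, repeat in cells:
    if value is None:
        empty_cells += repeat  # queue empties, do not write them yet
    else:
        table_row.extend([None] * empty_cells)  # flush queued empties
        empty_cells = 0
        table_row.extend([value] * repeat)

print(table_row)  # ['a', None, None, None, 'b', 'b'] -- trailing empties dropped
```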
+ Examples -------- >>> df = pd.read_feather("path/to/file.feather") # doctest: +SKIP @@ -123,9 +132,16 @@ def read_feather( path, "rb", storage_options=storage_options, is_text=False ) as handles: if dtype_backend is lib.no_default and not using_pyarrow_string_dtype(): - return feather.read_feather( - handles.handle, columns=columns, use_threads=bool(use_threads) - ) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "make_block is deprecated", + DeprecationWarning, + ) + + return feather.read_feather( + handles.handle, columns=columns, use_threads=bool(use_threads) + ) pa_table = feather.read_table( handles.handle, columns=columns, use_threads=bool(use_threads) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index b2b0d711c6b54..8212b50594842 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -4,7 +4,6 @@ from __future__ import annotations -from contextlib import contextmanager import copy from functools import partial import operator @@ -56,7 +55,6 @@ if TYPE_CHECKING: from collections.abc import ( - Generator, Hashable, Sequence, ) @@ -84,22 +82,6 @@ from pandas import ExcelWriter -try: - import matplotlib as mpl - import matplotlib.pyplot as plt - - has_mpl = True -except ImportError: - has_mpl = False - - -@contextmanager -def _mpl(func: Callable) -> Generator[tuple[Any, Any], None, None]: - if has_mpl: - yield plt, mpl - else: - raise ImportError(f"{func.__name__} requires matplotlib.") - #### # Shared Doc Strings @@ -3800,7 +3782,7 @@ def _validate_apply_axis_arg( f"operations is a Series with 'axis in [0,1]'" ) if isinstance(arg, (Series, DataFrame)): # align indx / cols to data - arg = arg.reindex_like(data, method=None).to_numpy(**dtype) + arg = arg.reindex_like(data).to_numpy(**dtype) else: arg = np.asarray(arg, **dtype) assert isinstance(arg, np.ndarray) # mypy requirement @@ -3832,61 +3814,61 @@ def _background_gradient( else: # else validate gmap against the underlying data gmap = _validate_apply_axis_arg(gmap, "gmap", float, data) - with _mpl(Styler.background_gradient) as (_, _matplotlib): - smin = np.nanmin(gmap) if vmin is None else vmin - smax = np.nanmax(gmap) if vmax is None else vmax - rng = smax - smin - # extend lower / upper bounds, compresses color range - norm = _matplotlib.colors.Normalize(smin - (rng * low), smax + (rng * high)) + smin = np.nanmin(gmap) if vmin is None else vmin + smax = np.nanmax(gmap) if vmax is None else vmax + rng = smax - smin + _matplotlib = import_optional_dependency( + "matplotlib", extra="Styler.background_gradient requires matplotlib." + ) + # extend lower / upper bounds, compresses color range + norm = _matplotlib.colors.Normalize(smin - (rng * low), smax + (rng * high)) + + if cmap is None: + rgbas = _matplotlib.colormaps[_matplotlib.rcParams["image.cmap"]](norm(gmap)) + else: + rgbas = _matplotlib.colormaps.get_cmap(cmap)(norm(gmap)) + + def relative_luminance(rgba) -> float: + """ + Calculate relative luminance of a color. 
+ + The calculation adheres to the W3C standards + (https://www.w3.org/WAI/GL/wiki/Relative_luminance) + + Parameters + ---------- + color : rgb or rgba tuple + + Returns + ------- + float + The relative luminance as a value from 0 to 1 + """ + r, g, b = ( + x / 12.92 if x <= 0.04045 else ((x + 0.055) / 1.055) ** 2.4 + for x in rgba[:3] + ) + return 0.2126 * r + 0.7152 * g + 0.0722 * b - if cmap is None: - rgbas = _matplotlib.colormaps[_matplotlib.rcParams["image.cmap"]]( - norm(gmap) + def css(rgba, text_only) -> str: + if not text_only: + dark = relative_luminance(rgba) < text_color_threshold + text_color = "#f1f1f1" if dark else "#000000" + return ( + f"background-color: {_matplotlib.colors.rgb2hex(rgba)};" + f"color: {text_color};" ) else: - rgbas = _matplotlib.colormaps.get_cmap(cmap)(norm(gmap)) - - def relative_luminance(rgba) -> float: - """ - Calculate relative luminance of a color. - - The calculation adheres to the W3C standards - (https://www.w3.org/WAI/GL/wiki/Relative_luminance) - - Parameters - ---------- - color : rgb or rgba tuple - - Returns - ------- - float - The relative luminance as a value from 0 to 1 - """ - r, g, b = ( - x / 12.92 if x <= 0.04045 else ((x + 0.055) / 1.055) ** 2.4 - for x in rgba[:3] - ) - return 0.2126 * r + 0.7152 * g + 0.0722 * b - - def css(rgba, text_only) -> str: - if not text_only: - dark = relative_luminance(rgba) < text_color_threshold - text_color = "#f1f1f1" if dark else "#000000" - return ( - f"background-color: {_matplotlib.colors.rgb2hex(rgba)};" - f"color: {text_color};" - ) - else: - return f"color: {_matplotlib.colors.rgb2hex(rgba)};" + return f"color: {_matplotlib.colors.rgb2hex(rgba)};" - if data.ndim == 1: - return [css(rgba, text_only) for rgba in rgbas] - else: - return DataFrame( - [[css(rgba, text_only) for rgba in row] for row in rgbas], - index=data.index, - columns=data.columns, - ) + if data.ndim == 1: + return [css(rgba, text_only) for rgba in rgbas] + else: + return DataFrame( + [[css(rgba, text_only) for rgba in row] for row in rgbas], + index=data.index, + columns=data.columns, + ) def _highlight_between( @@ -4124,20 +4106,22 @@ def css_calc(x, left: float, right: float, align: str, color: str | list | tuple rgbas = None if cmap is not None: # use the matplotlib colormap input - with _mpl(Styler.bar) as (_, _matplotlib): - cmap = ( - _matplotlib.colormaps[cmap] - if isinstance(cmap, str) - else cmap # assumed to be a Colormap instance as documented - ) - norm = _matplotlib.colors.Normalize(left, right) - rgbas = cmap(norm(values)) - if data.ndim == 1: - rgbas = [_matplotlib.colors.rgb2hex(rgba) for rgba in rgbas] - else: - rgbas = [ - [_matplotlib.colors.rgb2hex(rgba) for rgba in row] for row in rgbas - ] + _matplotlib = import_optional_dependency( + "matplotlib", extra="Styler.bar requires matplotlib." + ) + cmap = ( + _matplotlib.colormaps[cmap] + if isinstance(cmap, str) + else cmap # assumed to be a Colormap instance as documented + ) + norm = _matplotlib.colors.Normalize(left, right) + rgbas = cmap(norm(values)) + if data.ndim == 1: + rgbas = [_matplotlib.colors.rgb2hex(rgba) for rgba in rgbas] + else: + rgbas = [ + [_matplotlib.colors.rgb2hex(rgba) for rgba in row] for row in rgbas + ] assert isinstance(align, str) # mypy: should now be in [left, right, mid, zero] if data.ndim == 1: diff --git a/pandas/io/html.py b/pandas/io/html.py index 42f5266e7649b..db4c5f8507946 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -1178,7 +1178,7 @@ def read_html( **after** `skiprows` is applied. 
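For orientation, the `relative_luminance` helper shown earlier can be exercised standalone; a minimal sketch (the 0.408 cutoff is assumed here from Styler's default `text_color_threshold`, which this hunk does not show):

```python
def relative_luminance(rgba) -> float:
    """W3C relative luminance of an RGB(A) tuple with channels in [0, 1]."""
    r, g, b = (
        x / 12.92 if x <= 0.04045 else ((x + 0.055) / 1.055) ** 2.4
        for x in rgba[:3]
    )
    return 0.2126 * r + 0.7152 * g + 0.0722 * b

# A dark navy background sits well below the assumed 0.408 threshold,
# so the styler would pair it with the light '#f1f1f1' text color
navy = (0.0, 0.0, 0.5, 1.0)
print(relative_luminance(navy) < 0.408)  # True
```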
This function will *always* return a list of :class:`DataFrame` *or* - it will fail, e.g., it will *not* return an empty list. + it will fail, i.e., it will *not* return an empty list. Examples -------- diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 13d74e935f786..ff01d2f62761b 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -369,18 +369,22 @@ def __init__( msg = "Overlapping names between the index and columns" raise ValueError(msg) - obj = obj.copy() timedeltas = obj.select_dtypes(include=["timedelta"]).columns + copied = False if len(timedeltas): + obj = obj.copy() + copied = True obj[timedeltas] = obj[timedeltas].map(lambda x: x.isoformat()) - # Convert PeriodIndex to datetimes before serializing - if isinstance(obj.index.dtype, PeriodDtype): - obj.index = obj.index.to_timestamp() # exclude index from obj if index=False if not self.index: self.obj = obj.reset_index(drop=True) else: + # Convert PeriodIndex to datetimes before serializing + if isinstance(obj.index.dtype, PeriodDtype): + if not copied: + obj = obj.copy(deep=False) + obj.index = obj.index.to_timestamp() self.obj = obj.reset_index(drop=False) self.date_format = "iso" self.orient = "records" diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index d4b412404c308..b44aecff79779 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -114,7 +114,7 @@ def set_default_names(data): ) return data - data = data.copy() + data = data.copy(deep=False) if data.index.nlevels > 1: data.index.names = com.fill_missing_names(data.index.names) else: diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 476856e8038d6..3bca8ea7ef1df 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -73,7 +73,7 @@ def read_orc( .. versionadded:: 2.0 filesystem : fsspec or pyarrow filesystem, default None - Filesystem object to use when reading the parquet file. + Filesystem object to use when reading the orc file. .. versionadded:: 2.1.0 @@ -85,6 +85,14 @@ def read_orc( DataFrame DataFrame based on the ORC file. + See Also + -------- + read_csv : Read a comma-separated values (csv) file into a pandas DataFrame. + read_excel : Read an Excel file into a pandas DataFrame. + read_spss : Read an SPSS file into a pandas DataFrame. + read_sas : Load a SAS file into a pandas DataFrame. + read_feather : Load a feather-format object into a pandas DataFrame. 
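The feather reader above and the parquet reader below wrap pyarrow's `to_pandas` in the same message-scoped suppression; a minimal standalone sketch of that pattern (the warning-raising helper is invented for illustration):

```python
import warnings

def noisy_conversion():
    warnings.warn("make_block is deprecated", DeprecationWarning)
    return "frame"

# Suppress only the known, message-matched DeprecationWarning; any other
# warning raised inside the block still propagates to the caller as usual
with warnings.catch_warnings():
    warnings.filterwarnings(
        "ignore",
        "make_block is deprecated",
        DeprecationWarning,
    )
    result = noisy_conversion()
print(result)
```

Scoping the filter to the exact message keeps unrelated deprecations visible, which is why the patch repeats this block at each pyarrow call site rather than filtering globally.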
+ Notes ----- Before using this function you should read the :ref:`user guide about ORC ` @@ -99,7 +107,7 @@ def read_orc( -------- >>> result = pd.read_orc("example_pa.orc") # doctest: +SKIP """ - # we require a newer version of pyarrow than we support for parquet + # we require a newer version of pyarrow than we support for orc orc = import_optional_dependency("pyarrow.orc") diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 08983ceed44e5..306b144811898 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -10,7 +10,10 @@ Any, Literal, ) -from warnings import catch_warnings +from warnings import ( + catch_warnings, + filterwarnings, +) from pandas._config import using_pyarrow_string_dtype @@ -271,7 +274,13 @@ def read( filters=filters, **kwargs, ) - result = pa_table.to_pandas(**to_pandas_kwargs) + with catch_warnings(): + filterwarnings( + "ignore", + "make_block is deprecated", + DeprecationWarning, + ) + result = pa_table.to_pandas(**to_pandas_kwargs) if pa_table.schema.metadata: if b"PANDAS_ATTRS" in pa_table.schema.metadata: @@ -384,7 +393,15 @@ def read( try: parquet_file = self.api.ParquetFile(path, **parquet_kwargs) - return parquet_file.to_pandas(columns=columns, filters=filters, **kwargs) + with catch_warnings(): + filterwarnings( + "ignore", + "make_block is deprecated", + DeprecationWarning, + ) + return parquet_file.to_pandas( + columns=columns, filters=filters, **kwargs + ) finally: if handles is not None: handles.close() diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index f8263a65ef5c7..cffdb28e2c9e4 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -174,8 +174,8 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: self.names = list(range(num_cols - len(self.names))) + self.names multi_index_named = False frame.columns = self.names - # we only need the frame not the names - _, frame = self._do_date_conversions(frame.columns, frame) + + frame = self._do_date_conversions(frame.columns, frame) if self.index_col is not None: index_to_set = self.index_col.copy() for i, item in enumerate(self.index_col): @@ -287,17 +287,23 @@ def read(self) -> DataFrame: table = table.cast(new_schema) - if dtype_backend == "pyarrow": - frame = table.to_pandas(types_mapper=pd.ArrowDtype) - elif dtype_backend == "numpy_nullable": - # Modify the default mapping to also - # map null to Int64 (to match other engines) - dtype_mapping = _arrow_dtype_mapping() - dtype_mapping[pa.null()] = pd.Int64Dtype() - frame = table.to_pandas(types_mapper=dtype_mapping.get) - elif using_pyarrow_string_dtype(): - frame = table.to_pandas(types_mapper=arrow_string_types_mapper()) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "make_block is deprecated", + DeprecationWarning, + ) + if dtype_backend == "pyarrow": + frame = table.to_pandas(types_mapper=pd.ArrowDtype) + elif dtype_backend == "numpy_nullable": + # Modify the default mapping to also + # map null to Int64 (to match other engines) + dtype_mapping = _arrow_dtype_mapping() + dtype_mapping[pa.null()] = pd.Int64Dtype() + frame = table.to_pandas(types_mapper=dtype_mapping.get) + elif using_pyarrow_string_dtype(): + frame = table.to_pandas(types_mapper=arrow_string_types_mapper()) - else: - frame = table.to_pandas() + else: + frame = table.to_pandas() return self._finalize_pandas_output(frame) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 
510097aed2a25..c6cc85b9f722b 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -3,9 +3,7 @@ from collections import defaultdict from copy import copy import csv -import datetime from enum import Enum -import itertools from typing import ( TYPE_CHECKING, Any, @@ -24,7 +22,6 @@ ) import pandas._libs.ops as libops from pandas._libs.parsers import STR_NA_VALUES -from pandas._libs.tslibs import parsing from pandas.compat._optional import import_optional_dependency from pandas.errors import ( ParserError, @@ -34,7 +31,6 @@ from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.common import ( - ensure_object, is_bool_dtype, is_dict_like, is_extension_array_dtype, @@ -43,7 +39,6 @@ is_integer_dtype, is_list_like, is_object_dtype, - is_scalar, is_string_dtype, pandas_dtype, ) @@ -58,7 +53,6 @@ DataFrame, DatetimeIndex, StringDtype, - concat, ) from pandas.core import algorithms from pandas.core.arrays import ( @@ -111,7 +105,6 @@ class BadLineHandleMethod(Enum): keep_default_na: bool dayfirst: bool cache_dates: bool - keep_date_col: bool usecols_dtype: str | None def __init__(self, kwds) -> None: @@ -125,12 +118,19 @@ def __init__(self, kwds) -> None: self.index_names: Sequence[Hashable] | None = None self.col_names: Sequence[Hashable] | None = None - self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False)) - self._parse_date_cols: Iterable = [] + parse_dates = kwds.pop("parse_dates", False) + if parse_dates is None or lib.is_bool(parse_dates): + parse_dates = bool(parse_dates) + elif not isinstance(parse_dates, list): + raise TypeError( + "Only booleans and lists are accepted " + "for the 'parse_dates' parameter" + ) + self.parse_dates: bool | list = parse_dates + self._parse_date_cols: set = set() self.date_parser = kwds.pop("date_parser", lib.no_default) self.date_format = kwds.pop("date_format", None) self.dayfirst = kwds.pop("dayfirst", False) - self.keep_date_col = kwds.pop("keep_date_col", False) self.na_values = kwds.get("na_values") self.na_fvalues = kwds.get("na_fvalues") @@ -146,7 +146,6 @@ def __init__(self, kwds) -> None: self.cache_dates = kwds.pop("cache_dates", True) self._date_conv = _make_date_converter( - date_parser=self.date_parser, date_format=self.date_format, dayfirst=self.dayfirst, cache_dates=self.cache_dates, @@ -180,8 +179,6 @@ def __init__(self, kwds) -> None: else: self.index_col = list(self.index_col) - self._name_processed = False - self._first_chunk = True self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"]) @@ -190,7 +187,7 @@ def __init__(self, kwds) -> None: # Normally, this arg would get pre-processed earlier on self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR) - def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterable: + def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> set: """ Check if parse_dates are in columns. @@ -204,7 +201,7 @@ def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterabl Returns ------- - The names of the columns which will get parsed later if a dict or list + The names of the columns which will get parsed later if a list is given as specification. Raises @@ -213,30 +210,15 @@ def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterabl If column to parse_date is not in dataframe. 
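Backing up to the `pandas/io/parquet.py` and `arrow_parser_wrapper.py` hunks above: each wraps a `to_pandas` call so that only pyarrow's "make_block is deprecated" `DeprecationWarning` is muted, and only for the duration of the `with` block. A minimal standalone sketch of the pattern; the `noisy` helper is invented for illustration:

```python
import warnings

def noisy() -> int:
    # stand-in for a call that emits the warning we want to scope out
    warnings.warn("make_block is deprecated", DeprecationWarning)
    return 42

with warnings.catch_warnings():
    warnings.filterwarnings("ignore", "make_block is deprecated", DeprecationWarning)
    value = noisy()  # silenced inside the block
value = noisy()      # warns again out here; unrelated warnings were never muted
```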
""" - cols_needed: Iterable - if is_dict_like(self.parse_dates): - cols_needed = itertools.chain(*self.parse_dates.values()) - elif is_list_like(self.parse_dates): - # a column in parse_dates could be represented - # ColReference = Union[int, str] - # DateGroups = List[ColReference] - # ParseDates = Union[DateGroups, List[DateGroups], - # Dict[ColReference, DateGroups]] - cols_needed = itertools.chain.from_iterable( - col if is_list_like(col) and not isinstance(col, tuple) else [col] - for col in self.parse_dates - ) - else: - cols_needed = [] - - cols_needed = list(cols_needed) + if not isinstance(self.parse_dates, list): + return set() # get only columns that are references using names (str), not by index missing_cols = ", ".join( sorted( { col - for col in cols_needed + for col in self.parse_dates if isinstance(col, str) and col not in columns } ) @@ -246,27 +228,18 @@ def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterabl f"Missing column provided to 'parse_dates': '{missing_cols}'" ) # Convert positions to actual column names - return [ + return { col if (isinstance(col, str) or col in columns) else columns[col] - for col in cols_needed - ] + for col in self.parse_dates + } def close(self) -> None: pass - @final - @property - def _has_complex_date_col(self) -> bool: - return isinstance(self.parse_dates, dict) or ( - isinstance(self.parse_dates, list) - and len(self.parse_dates) > 0 - and isinstance(self.parse_dates[0], list) - ) - @final def _should_parse_dates(self, i: int) -> bool: - if lib.is_bool(self.parse_dates): - return bool(self.parse_dates) + if isinstance(self.parse_dates, bool): + return self.parse_dates else: if self.index_names is not None: name = self.index_names[i] @@ -368,18 +341,9 @@ def _make_index( index: Index | None if not is_index_col(self.index_col) or not self.index_col: index = None - - elif not self._has_complex_date_col: + else: simple_index = self._get_simple_index(alldata, columns) index = self._agg_index(simple_index) - elif self._has_complex_date_col: - if not self._name_processed: - (self.index_names, _, self.index_col) = self._clean_index_names( - list(columns), self.index_col - ) - self._name_processed = True - date_index = self._get_complex_date_index(data, columns) - index = self._agg_index(date_index, try_parse_dates=False) # add names for the index if indexnamerow: @@ -415,34 +379,6 @@ def ix(col): return index - @final - def _get_complex_date_index(self, data, col_names): - def _get_name(icol): - if isinstance(icol, str): - return icol - - if col_names is None: - raise ValueError(f"Must supply column order to use {icol!s} as index") - - for i, c in enumerate(col_names): - if i == icol: - return c - - to_remove = [] - index = [] - for idx in self.index_col: - name = _get_name(idx) - to_remove.append(name) - index.append(data[name]) - - # remove index items from content and columns, don't pop in - # loop - for c in sorted(to_remove, reverse=True): - data.pop(c) - col_names.remove(c) - - return index - @final def _clean_mapping(self, mapping): """converts col numbers to names""" @@ -645,19 +581,7 @@ def _set(x) -> int: if isinstance(self.parse_dates, list): for val in self.parse_dates: - if isinstance(val, list): - for k in val: - noconvert_columns.add(_set(k)) - else: - noconvert_columns.add(_set(val)) - - elif isinstance(self.parse_dates, dict): - for val in self.parse_dates.values(): - if isinstance(val, list): - for k in val: - noconvert_columns.add(_set(k)) - else: - noconvert_columns.add(_set(val)) + 
noconvert_columns.add(_set(val)) elif self.parse_dates: if isinstance(self.index_col, list): @@ -858,36 +782,33 @@ def _do_date_conversions( self, names: Index, data: DataFrame, - ) -> tuple[Sequence[Hashable] | Index, DataFrame]: ... + ) -> DataFrame: ... @overload def _do_date_conversions( self, names: Sequence[Hashable], data: Mapping[Hashable, ArrayLike], - ) -> tuple[Sequence[Hashable], Mapping[Hashable, ArrayLike]]: ... + ) -> Mapping[Hashable, ArrayLike]: ... @final def _do_date_conversions( self, names: Sequence[Hashable] | Index, data: Mapping[Hashable, ArrayLike] | DataFrame, - ) -> tuple[Sequence[Hashable] | Index, Mapping[Hashable, ArrayLike] | DataFrame]: - # returns data, columns - - if self.parse_dates is not None: - data, names = _process_date_conversion( + ) -> Mapping[Hashable, ArrayLike] | DataFrame: + if isinstance(self.parse_dates, list): + return _process_date_conversion( data, self._date_conv, self.parse_dates, self.index_col, self.index_names, names, - keep_date_col=self.keep_date_col, dtype_backend=self.dtype_backend, ) - return names, data + return data @final def _check_data_length( @@ -1120,84 +1041,37 @@ def _get_empty_meta( def _make_date_converter( - date_parser=lib.no_default, dayfirst: bool = False, cache_dates: bool = True, date_format: dict[Hashable, str] | str | None = None, ): - if date_parser is not lib.no_default: - warnings.warn( - "The argument 'date_parser' is deprecated and will " - "be removed in a future version. " - "Please use 'date_format' instead, or read your data in as 'object' dtype " - "and then call 'to_datetime'.", - FutureWarning, - stacklevel=find_stack_level(), + def converter(date_col, col: Hashable): + if date_col.dtype.kind in "Mm": + return date_col + + date_fmt = ( + date_format.get(col) if isinstance(date_format, dict) else date_format ) - if date_parser is not lib.no_default and date_format is not None: - raise TypeError("Cannot use both 'date_parser' and 'date_format'") - - def unpack_if_single_element(arg): - # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 - if isinstance(arg, np.ndarray) and arg.ndim == 1 and len(arg) == 1: - return arg[0] - return arg - - def converter(*date_cols, col: Hashable): - if len(date_cols) == 1 and date_cols[0].dtype.kind in "Mm": - return date_cols[0] - - if date_parser is lib.no_default: - strs = parsing.concat_date_cols(date_cols) - date_fmt = ( - date_format.get(col) if isinstance(date_format, dict) else date_format + + str_objs = lib.ensure_string_array(date_col) + try: + result = tools.to_datetime( + str_objs, + format=date_fmt, + utc=False, + dayfirst=dayfirst, + cache=cache_dates, ) + except (ValueError, TypeError): + # test_usecols_with_parse_dates4 + # test_multi_index_parse_dates + return str_objs - str_objs = ensure_object(strs) - try: - result = tools.to_datetime( - str_objs, - format=date_fmt, - utc=False, - dayfirst=dayfirst, - cache=cache_dates, - ) - except (ValueError, TypeError): - # test_usecols_with_parse_dates4 - return str_objs - - if isinstance(result, DatetimeIndex): - arr = result.to_numpy() - arr.flags.writeable = True - return arr - return result._values - else: - try: - pre_parsed = date_parser( - *(unpack_if_single_element(arg) for arg in date_cols) - ) - try: - result = tools.to_datetime( - pre_parsed, - cache=cache_dates, - ) - except (ValueError, TypeError): - # test_read_csv_with_custom_date_parser - result = pre_parsed - if isinstance(result, datetime.datetime): - raise Exception("scalar parser") - return result - except Exception: - # e.g. 
test_datetime_fractional_seconds - pre_parsed = parsing.try_parse_dates( - parsing.concat_date_cols(date_cols), - parser=date_parser, - ) - try: - return tools.to_datetime(pre_parsed) - except (ValueError, TypeError): - # TODO: not reached in tests 2023-10-27; needed? - return pre_parsed + if isinstance(result, DatetimeIndex): + arr = result.to_numpy() + arr.flags.writeable = True + return arr + return result._values return converter @@ -1228,9 +1102,7 @@ def converter(*date_cols, col: Hashable): "decimal": ".", # 'engine': 'c', "parse_dates": False, - "keep_date_col": False, "dayfirst": False, - "date_parser": lib.no_default, "date_format": None, "usecols": None, # 'iterator': False, @@ -1245,125 +1117,39 @@ def converter(*date_cols, col: Hashable): def _process_date_conversion( - data_dict, + data_dict: Mapping[Hashable, ArrayLike] | DataFrame, converter: Callable, - parse_spec, + parse_spec: list, index_col, index_names, - columns, - keep_date_col: bool = False, + columns: Sequence[Hashable] | Index, dtype_backend=lib.no_default, -) -> tuple[dict, list]: - def _isindex(colspec): - return (isinstance(index_col, list) and colspec in index_col) or ( +) -> Mapping[Hashable, ArrayLike] | DataFrame: + for colspec in parse_spec: + if isinstance(colspec, int) and colspec not in data_dict: + colspec = columns[colspec] + if (isinstance(index_col, list) and colspec in index_col) or ( isinstance(index_names, list) and colspec in index_names - ) - - new_cols = [] - new_data = {} - - orig_names = columns - columns = list(columns) - - date_cols = set() - - if parse_spec is None or isinstance(parse_spec, bool): - return data_dict, columns - - if isinstance(parse_spec, list): - # list of column lists - for colspec in parse_spec: - if is_scalar(colspec) or isinstance(colspec, tuple): - if isinstance(colspec, int) and colspec not in data_dict: - colspec = orig_names[colspec] - if _isindex(colspec): - continue - elif dtype_backend == "pyarrow": - import pyarrow as pa - - dtype = data_dict[colspec].dtype - if isinstance(dtype, ArrowDtype) and ( - pa.types.is_timestamp(dtype.pyarrow_dtype) - or pa.types.is_date(dtype.pyarrow_dtype) - ): - continue - - # Pyarrow engine returns Series which we need to convert to - # numpy array before converter, its a no-op for other parsers - data_dict[colspec] = converter( - np.asarray(data_dict[colspec]), col=colspec - ) - else: - new_name, col, old_names = _try_convert_dates( - converter, colspec, data_dict, orig_names - ) - if new_name in data_dict: - raise ValueError(f"New date column already in dict {new_name}") - new_data[new_name] = col - new_cols.append(new_name) - date_cols.update(old_names) - - elif isinstance(parse_spec, dict): - # dict of new name to column list - for new_name, colspec in parse_spec.items(): - if new_name in data_dict: - raise ValueError(f"Date column {new_name} already in dict") - - _, col, old_names = _try_convert_dates( - converter, - colspec, - data_dict, - orig_names, - target_name=new_name, - ) - - new_data[new_name] = col - - # If original column can be converted to date we keep the converted values - # This can only happen if values are from single column - if len(colspec) == 1: - new_data[colspec[0]] = col - - new_cols.append(new_name) - date_cols.update(old_names) - - if isinstance(data_dict, DataFrame): - data_dict = concat([DataFrame(new_data), data_dict], axis=1) - else: - data_dict.update(new_data) - new_cols.extend(columns) - - if not keep_date_col: - for c in list(date_cols): - data_dict.pop(c) - new_cols.remove(c) - - return 
data_dict, new_cols - + ): + continue + elif dtype_backend == "pyarrow": + import pyarrow as pa + + dtype = data_dict[colspec].dtype + if isinstance(dtype, ArrowDtype) and ( + pa.types.is_timestamp(dtype.pyarrow_dtype) + or pa.types.is_date(dtype.pyarrow_dtype) + ): + continue -def _try_convert_dates( - parser: Callable, colspec, data_dict, columns, target_name: str | None = None -): - colset = set(columns) - colnames = [] - - for c in colspec: - if c in colset: - colnames.append(c) - elif isinstance(c, int) and c not in columns: - colnames.append(columns[c]) - else: - colnames.append(c) + # Pyarrow engine returns Series which we need to convert to + # numpy array before converter, its a no-op for other parsers + result = converter(np.asarray(data_dict[colspec]), col=colspec) + # error: Unsupported target for indexed assignment + # ("Mapping[Hashable, ExtensionArray | ndarray[Any, Any]] | DataFrame") + data_dict[colspec] = result # type: ignore[index] - new_name: tuple | str - if all(isinstance(x, tuple) for x in colnames): - new_name = tuple(map("_".join, zip(*colnames))) - else: - new_name = "_".join([str(x) for x in colnames]) - to_parse = [np.asarray(data_dict[c]) for c in colnames if c in data_dict] - - new_col = parser(*to_parse, col=new_name if target_name is None else target_name) - return new_name, new_col, colnames + return data_dict def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool): @@ -1401,26 +1187,5 @@ def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool): return na_values, na_fvalues -def _validate_parse_dates_arg(parse_dates): - """ - Check whether or not the 'parse_dates' parameter - is a non-boolean scalar. Raises a ValueError if - that is the case. - """ - msg = ( - "Only booleans, lists, and dictionaries are accepted " - "for the 'parse_dates' parameter" - ) - - if not ( - parse_dates is None - or lib.is_bool(parse_dates) - or isinstance(parse_dates, (list, dict)) - ): - raise TypeError(msg) - - return parse_dates - - def is_index_col(col) -> bool: return col is not None and col is not False diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 6e5d36ad39c8a..4de626288aa41 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -166,30 +166,28 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: # error: Cannot determine type of 'names' self.orig_names = self.names # type: ignore[has-type] - if not self._has_complex_date_col: - # error: Cannot determine type of 'index_col' - if self._reader.leading_cols == 0 and is_index_col( - self.index_col # type: ignore[has-type] - ): - self._name_processed = True - ( - index_names, - # error: Cannot determine type of 'names' - self.names, # type: ignore[has-type] - self.index_col, - ) = self._clean_index_names( - # error: Cannot determine type of 'names' - self.names, # type: ignore[has-type] - # error: Cannot determine type of 'index_col' - self.index_col, # type: ignore[has-type] - ) + # error: Cannot determine type of 'index_col' + if self._reader.leading_cols == 0 and is_index_col( + self.index_col # type: ignore[has-type] + ): + ( + index_names, + # error: Cannot determine type of 'names' + self.names, # type: ignore[has-type] + self.index_col, + ) = self._clean_index_names( + # error: Cannot determine type of 'names' + self.names, # type: ignore[has-type] + # error: Cannot determine type of 'index_col' + self.index_col, # type: ignore[has-type] + ) - if self.index_names is None: - 
self.index_names = index_names + if self.index_names is None: + self.index_names = index_names - if self._reader.header is None and not passed_names: - assert self.index_names is not None - self.index_names = [None] * len(self.index_names) + if self._reader.header is None and not passed_names: + assert self.index_names is not None + self.index_names = [None] * len(self.index_names) self._implicit_index = self._reader.leading_cols > 0 @@ -274,9 +272,6 @@ def read( names = self.names # type: ignore[has-type] if self._reader.leading_cols: - if self._has_complex_date_col: - raise NotImplementedError("file structure not yet supported") - # implicit index, no index names arrays = [] @@ -307,12 +302,10 @@ def read( data_tups = sorted(data.items()) data = {k: v for k, (i, v) in zip(names, data_tups)} - column_names, date_data = self._do_date_conversions(names, data) + date_data = self._do_date_conversions(names, data) # maybe create a mi on the columns - column_names = self._maybe_make_multi_index_columns( - column_names, self.col_names - ) + column_names = self._maybe_make_multi_index_columns(names, self.col_names) else: # rename dict keys @@ -335,7 +328,7 @@ def read( data = {k: v for k, (i, v) in zip(names, data_tups)} - names, date_data = self._do_date_conversions(names, data) + date_data = self._do_date_conversions(names, data) index, column_names = self._make_index(date_data, alldata, names) return index, column_names, date_data diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index e2456b165fe60..f7d2aa2419429 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -150,14 +150,9 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None: # get popped off for index self.orig_names: list[Hashable] = list(self.columns) - # needs to be cleaned/refactored - # multiple date column thing turning into a real spaghetti factory - - if not self._has_complex_date_col: - (index_names, self.orig_names, self.columns) = self._get_index_name() - self._name_processed = True - if self.index_names is None: - self.index_names = index_names + index_names, self.orig_names, self.columns = self._get_index_name() + if self.index_names is None: + self.index_names = index_names if self._col_indices is None: self._col_indices = list(range(len(self.columns))) @@ -294,7 +289,7 @@ def read( data, columns = self._exclude_implicit_index(alldata) conv_data = self._convert_data(data) - columns, conv_data = self._do_date_conversions(columns, conv_data) + conv_data = self._do_date_conversions(columns, conv_data) index, result_columns = self._make_index( conv_data, alldata, columns, indexnamerow diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 70f9a68244164..66edbcaa755ed 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -40,7 +40,6 @@ from pandas.core.dtypes.common import ( is_file_like, is_float, - is_hashable, is_integer, is_list_like, pandas_dtype, @@ -118,9 +117,6 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): na_filter: bool skip_blank_lines: bool parse_dates: bool | Sequence[Hashable] | None - infer_datetime_format: bool | lib.NoDefault - keep_date_col: bool | lib.NoDefault - date_parser: Callable | lib.NoDefault date_format: str | dict[Hashable, str] | None dayfirst: bool cache_dates: bool @@ -137,7 +133,6 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): encoding_errors: str | None dialect: str | csv.Dialect | None on_bad_lines: str - 
delim_whitespace: bool | lib.NoDefault low_memory: bool memory_map: bool float_precision: Literal["high", "legacy", "round_trip"] | None @@ -302,19 +297,13 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): performance of reading a large file. skip_blank_lines : bool, default True If ``True``, skip over blank lines rather than interpreting as ``NaN`` values. -parse_dates : bool, None, list of Hashable, list of lists or dict of {{Hashable : \ -list}}, default None +parse_dates : bool, None, list of Hashable, default None The behavior is as follows: * ``bool``. If ``True`` -> try parsing the index. - * ``None``. Behaves like ``True`` if ``date_parser`` or ``date_format`` are - specified. + * ``None``. Behaves like ``True`` if ``date_format`` is specified. * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 each as a separate date column. - * ``list`` of ``list``. e.g. If ``[[1, 3]]`` -> combine columns 1 and 3 and parse - as a single date column. Values are joined with a space before parsing. - * ``dict``, e.g. ``{{'foo' : [1, 3]}}`` -> parse columns 1, 3 as date and call - result 'foo'. Values are joined with a space before parsing. If a column or index cannot be represented as an array of ``datetime``, say because of an unparsable value or a mixture of timezones, the column @@ -323,32 +312,6 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): :func:`~pandas.read_csv`. Note: A fast-path exists for iso8601-formatted dates. -infer_datetime_format : bool, default False - If ``True`` and ``parse_dates`` is enabled, pandas will attempt to infer the - format of the ``datetime`` strings in the columns, and if it can be inferred, - switch to a faster method of parsing them. In some cases this can increase - the parsing speed by 5-10x. - - .. deprecated:: 2.0.0 - A strict version of this argument is now the default, passing it has no effect. - -keep_date_col : bool, default False - If ``True`` and ``parse_dates`` specifies combining multiple columns then - keep the original columns. -date_parser : Callable, optional - Function to use for converting a sequence of string columns to an array of - ``datetime`` instances. The default uses ``dateutil.parser.parser`` to do the - conversion. pandas will try to call ``date_parser`` in three different ways, - advancing to the next if an exception occurs: 1) Pass one or more arrays - (as defined by ``parse_dates``) as arguments; 2) concatenate (row-wise) the - string values from the columns defined by ``parse_dates`` into a single array - and pass that; and 3) call ``date_parser`` once for each row using one or - more strings (corresponding to the columns defined by ``parse_dates``) as - arguments. - - .. deprecated:: 2.0.0 - Use ``date_format`` instead, or read in as ``object`` and then apply - :func:`~pandas.to_datetime` as-needed. date_format : str or dict of column -> format, optional Format to use for parsing dates when used in conjunction with ``parse_dates``. The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See @@ -359,9 +322,9 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): You can also pass: - "ISO8601", to parse any `ISO8601 `_ - time string (not necessarily in exactly the same format); + time string (not necessarily in exactly the same format); - "mixed", to infer the format for each element individually. This is risky, - and you should probably use it along with `dayfirst`. + and you should probably use it along with `dayfirst`. .. 
versionadded:: 2.0.0 dayfirst : bool, default False @@ -434,39 +397,33 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): documentation for more details. on_bad_lines : {{'error', 'warn', 'skip'}} or Callable, default 'error' Specifies what to do upon encountering a bad line (a line with too many fields). - Allowed values are : + Allowed values are: - ``'error'``, raise an Exception when a bad line is encountered. - ``'warn'``, raise a warning when a bad line is encountered and skip that line. - ``'skip'``, skip bad lines without raising or warning when they are encountered. + - Callable, function that will process a single bad line. + - With ``engine='python'``, function with signature + ``(bad_line: list[str]) -> list[str] | None``. + ``bad_line`` is a list of strings split by the ``sep``. + If the function returns ``None``, the bad line will be ignored. + If the function returns a new ``list`` of strings with more elements than + expected, a ``ParserWarning`` will be emitted while dropping extra elements. + - With ``engine='pyarrow'``, function with signature + as described in pyarrow documentation: `invalid_row_handler + `_. .. versionadded:: 1.3.0 .. versionadded:: 1.4.0 - - Callable, function with signature - ``(bad_line: list[str]) -> list[str] | None`` that will process a single - bad line. ``bad_line`` is a list of strings split by the ``sep``. - If the function returns ``None``, the bad line will be ignored. - If the function returns a new ``list`` of strings with more elements than - expected, a ``ParserWarning`` will be emitted while dropping extra elements. - Only supported when ``engine='python'`` + Callable .. versionchanged:: 2.2.0 - - Callable, function with signature - as described in `pyarrow documentation - `_ when ``engine='pyarrow'`` - -delim_whitespace : bool, default False - Specifies whether or not whitespace (e.g. ``' '`` or ``'\\t'``) will be - used as the ``sep`` delimiter. Equivalent to setting ``sep='\\s+'``. If this option - is set to ``True``, nothing should be passed in for the ``delimiter`` - parameter. + Callable for ``engine='pyarrow'`` - .. deprecated:: 2.2.0 - Use ``sep="\\s+"`` instead. low_memory : bool, default True Internally process the file in chunks, resulting in lower memory use while parsing, but possibly mixed type inference. To ensure no mixed @@ -512,12 +469,86 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): Examples -------- >>> pd.{func_name}('data.csv') # doctest: +SKIP + Name Value +0 foo 1 +1 bar 2 +2 #baz 3 + +Index and header can be specified via the `index_col` and `header` arguments. + +>>> pd.{func_name}('data.csv', header=None) # doctest: +SKIP + 0 1 +0 Name Value +1 foo 1 +2 bar 2 +3 #baz 3 + +>>> pd.{func_name}('data.csv', index_col='Value') # doctest: +SKIP + Name +Value +1 foo +2 bar +3 #baz + +Column types are inferred but can be explicitly specified using the dtype argument. + +>>> pd.{func_name}('data.csv', dtype={{'Value': float}}) # doctest: +SKIP + Name Value +0 foo 1.0 +1 bar 2.0 +2 #baz 3.0 + +True, False, and NA values, and thousands separators have defaults, +but can be explicitly specified, too. Supply the values you would like +as strings or lists of strings! + +>>> pd.{func_name}('data.csv', na_values=['foo', 'bar']) # doctest: +SKIP + Name Value +0 NaN 1 +1 NaN 2 +2 #baz 3 + +Comment lines in the input file can be skipped using the `comment` argument. 
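On the reorganized `on_bad_lines` documentation above: with `engine='python'`, the callable receives the split fields of each offending line and can repair or drop it. A short sketch under those documented semantics; the CSV payload and the `fix_row` helper are invented:

```python
import io
import pandas as pd

def fix_row(bad_line: list[str]) -> list[str] | None:
    # keep the first two fields; returning None would skip the row entirely
    return bad_line[:2]

csv = io.StringIO("a,b\n1,2\n3,4,5\n")  # second data row has an extra field
df = pd.read_csv(csv, engine="python", on_bad_lines=fix_row)
print(df)  # two rows: (1, 2) and (3, 4)
```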
+ +>>> pd.{func_name}('data.csv', comment='#') # doctest: +SKIP + Name Value +0 foo 1 +1 bar 2 + +By default, columns with dates will be read as ``object`` rather than ``datetime``. + +>>> df = pd.{func_name}('tmp.csv') # doctest: +SKIP + +>>> df # doctest: +SKIP + col 1 col 2 col 3 +0 10 10/04/2018 Sun 15 Jan 2023 +1 20 15/04/2018 Fri 12 May 2023 + +>>> df.dtypes # doctest: +SKIP +col 1 int64 +col 2 object +col 3 object +dtype: object + +Specific columns can be parsed as dates by using the `parse_dates` and +`date_format` arguments. + +>>> df = pd.{func_name}( +... 'tmp.csv', +... parse_dates=[1, 2], +... date_format={{'col 2': '%d/%m/%Y', 'col 3': '%a %d %b %Y'}}, +... ) # doctest: +SKIP + +>>> df.dtypes # doctest: +SKIP +col 1 int64 +col 2 datetime64[ns] +col 3 datetime64[ns] +dtype: object """ ) class _C_Parser_Defaults(TypedDict): - delim_whitespace: Literal[False] na_filter: Literal[True] low_memory: Literal[True] memory_map: Literal[False] @@ -525,7 +556,6 @@ class _C_Parser_Defaults(TypedDict): _c_parser_defaults: _C_Parser_Defaults = { - "delim_whitespace": False, "na_filter": True, "low_memory": True, "memory_map": False, @@ -551,7 +581,6 @@ class _Fwf_Defaults(TypedDict): "thousands", "memory_map", "dialect", - "delim_whitespace", "quoting", "lineterminator", "converters", @@ -634,13 +663,10 @@ def _read( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], kwds ) -> DataFrame | TextFileReader: """Generic reader of line files.""" - # if we pass a date_parser and parse_dates=False, we should not parse the + # if we pass a date_format and parse_dates=False, we should not parse the # dates GH#44366 if kwds.get("parse_dates", None) is None: - if ( - kwds.get("date_parser", lib.no_default) is lib.no_default - and kwds.get("date_format", None) is None - ): + if kwds.get("date_format", None) is None: kwds["parse_dates"] = False else: kwds["parse_dates"] = True @@ -758,9 +784,6 @@ def read_csv( skip_blank_lines: bool = True, # Datetime Handling parse_dates: bool | Sequence[Hashable] | None = None, - infer_datetime_format: bool | lib.NoDefault = lib.no_default, - keep_date_col: bool | lib.NoDefault = lib.no_default, - date_parser: Callable | lib.NoDefault = lib.no_default, date_format: str | dict[Hashable, str] | None = None, dayfirst: bool = False, cache_dates: bool = True, @@ -783,67 +806,12 @@ def read_csv( # Error Handling on_bad_lines: str = "error", # Internal - delim_whitespace: bool | lib.NoDefault = lib.no_default, low_memory: bool = _c_parser_defaults["low_memory"], memory_map: bool = False, float_precision: Literal["high", "legacy", "round_trip"] | None = None, storage_options: StorageOptions | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame | TextFileReader: - if keep_date_col is not lib.no_default: - # GH#55569 - warnings.warn( - "The 'keep_date_col' keyword in pd.read_csv is deprecated and " - "will be removed in a future version. 
Explicitly remove unwanted " - "columns after parsing instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - keep_date_col = False - - if lib.is_list_like(parse_dates): - # GH#55569 - depr = False - # error: Item "bool" of "bool | Sequence[Hashable] | None" has no - # attribute "__iter__" (not iterable) - if not all(is_hashable(x) for x in parse_dates): # type: ignore[union-attr] - depr = True - elif isinstance(parse_dates, dict) and any( - lib.is_list_like(x) for x in parse_dates.values() - ): - depr = True - if depr: - warnings.warn( - "Support for nested sequences for 'parse_dates' in pd.read_csv " - "is deprecated. Combine the desired columns with pd.to_datetime " - "after parsing instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - if infer_datetime_format is not lib.no_default: - warnings.warn( - "The argument 'infer_datetime_format' is deprecated and will " - "be removed in a future version. " - "A strict version of it is now the default, see " - "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. " - "You can safely remove this argument.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - if delim_whitespace is not lib.no_default: - # GH#55569 - warnings.warn( - "The 'delim_whitespace' keyword in pd.read_csv is deprecated and " - "will be removed in a future version. Use ``sep='\\s+'`` instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - delim_whitespace = False - # locals() should never be modified kwds = locals().copy() del kwds["filepath_or_buffer"] @@ -852,7 +820,6 @@ def read_csv( kwds_defaults = _refine_defaults_read( dialect, delimiter, - delim_whitespace, engine, sep, on_bad_lines, @@ -949,9 +916,6 @@ def read_table( skip_blank_lines: bool = True, # Datetime Handling parse_dates: bool | Sequence[Hashable] | None = None, - infer_datetime_format: bool | lib.NoDefault = lib.no_default, - keep_date_col: bool | lib.NoDefault = lib.no_default, - date_parser: Callable | lib.NoDefault = lib.no_default, date_format: str | dict[Hashable, str] | None = None, dayfirst: bool = False, cache_dates: bool = True, @@ -974,58 +938,12 @@ def read_table( # Error Handling on_bad_lines: str = "error", # Internal - delim_whitespace: bool | lib.NoDefault = lib.no_default, low_memory: bool = _c_parser_defaults["low_memory"], memory_map: bool = False, float_precision: Literal["high", "legacy", "round_trip"] | None = None, storage_options: StorageOptions | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame | TextFileReader: - if keep_date_col is not lib.no_default: - # GH#55569 - warnings.warn( - "The 'keep_date_col' keyword in pd.read_table is deprecated and " - "will be removed in a future version. Explicitly remove unwanted " - "columns after parsing instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - keep_date_col = False - - # error: Item "bool" of "bool | Sequence[Hashable]" has no attribute "__iter__" - if lib.is_list_like(parse_dates) and not all(is_hashable(x) for x in parse_dates): # type: ignore[union-attr] - # GH#55569 - warnings.warn( - "Support for nested sequences for 'parse_dates' in pd.read_table " - "is deprecated. Combine the desired columns with pd.to_datetime " - "after parsing instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - if infer_datetime_format is not lib.no_default: - warnings.warn( - "The argument 'infer_datetime_format' is deprecated and will " - "be removed in a future version. 
" - "A strict version of it is now the default, see " - "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. " - "You can safely remove this argument.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - if delim_whitespace is not lib.no_default: - # GH#55569 - warnings.warn( - "The 'delim_whitespace' keyword in pd.read_table is deprecated and " - "will be removed in a future version. Use ``sep='\\s+'`` instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - delim_whitespace = False - # locals() should never be modified kwds = locals().copy() del kwds["filepath_or_buffer"] @@ -1034,7 +952,6 @@ def read_table( kwds_defaults = _refine_defaults_read( dialect, delimiter, - delim_whitespace, engine, sep, on_bad_lines, @@ -1341,17 +1258,10 @@ def _clean_options( engine = "python" sep = options["delimiter"] - delim_whitespace = options["delim_whitespace"] - if sep is None and not delim_whitespace: - if engine in ("c", "pyarrow"): - fallback_reason = ( - f"the '{engine}' engine does not support " - "sep=None with delim_whitespace=False" - ) - engine = "python" - elif sep is not None and len(sep) > 1: + if sep is not None and len(sep) > 1: if engine == "c" and sep == r"\s+": + # delim_whitespace passed on to pandas._libs.parsers.TextReader result["delim_whitespace"] = True del result["delimiter"] elif engine not in ("python", "python-fwf"): @@ -1362,9 +1272,6 @@ def _clean_options( r"different from '\s+' are interpreted as regex)" ) engine = "python" - elif delim_whitespace: - if "python" in engine: - result["delimiter"] = r"\s+" elif sep is not None: encodeable = True encoding = sys.getfilesystemencoding() or "utf-8" @@ -1671,10 +1578,6 @@ def TextParser(*args, **kwds) -> TextFileReader: comment : str, optional Comment out remainder of line parse_dates : bool, default False - keep_date_col : bool, default False - date_parser : function, optional - - .. deprecated:: 2.0.0 date_format : str or dict of column -> format, default ``None`` .. versionadded:: 2.0.0 @@ -1779,7 +1682,6 @@ def _stringify_na_values(na_values, floatify: bool) -> set[str | float]: def _refine_defaults_read( dialect: str | csv.Dialect | None, delimiter: str | None | lib.NoDefault, - delim_whitespace: bool, engine: CSVEngine | None, sep: str | None | lib.NoDefault, on_bad_lines: str | Callable, @@ -1799,14 +1701,6 @@ def _refine_defaults_read( documentation for more details. delimiter : str or object Alias for sep. - delim_whitespace : bool - Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be - used as the sep. Equivalent to setting ``sep='\\s+'``. If this option - is set to True, nothing should be passed in for the ``delimiter`` - parameter. - - .. deprecated:: 2.2.0 - Use ``sep="\\s+"`` instead. engine : {{'c', 'python'}} Parser engine to use. The C engine is faster while the python engine is currently more feature-complete. @@ -1826,12 +1720,6 @@ def _refine_defaults_read( ------- kwds : dict Input parameters with correct values. - - Raises - ------ - ValueError : - If a delimiter was specified with ``sep`` (or ``delimiter``) and - ``delim_whitespace=True``. """ # fix types for sep, delimiter to Union(str, Any) delim_default = defaults["delimiter"] @@ -1862,12 +1750,6 @@ def _refine_defaults_read( if delimiter is None: delimiter = sep - if delim_whitespace and (delimiter is not lib.no_default): - raise ValueError( - "Specified a delimiter with both sep and " - "delim_whitespace=True; you can only specify one." 
- ) - if delimiter == "\n": raise ValueError( r"Specified \n as separator or delimiter. This forces the python engine " diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 5d325397a81ae..d98c51159eb63 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -125,7 +125,8 @@ npt, ) - from pandas.core.internals import Block + from pandas.core.internals.blocks import Block + # versioning attribute _version = "0.15.2" @@ -2654,7 +2655,7 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): # reverse converts if dtype.startswith("datetime64"): # recreate with tz if indicated - converted = _set_tz(converted, tz) + converted = _set_tz(converted, tz, dtype) elif dtype == "timedelta64": converted = np.asarray(converted, dtype="m8[ns]") @@ -3035,7 +3036,7 @@ def read_array(self, key: str, start: int | None = None, stop: int | None = None if dtype and dtype.startswith("datetime64"): # reconstruct a timezone if indicated tz = getattr(attrs, "tz", None) - ret = _set_tz(ret, tz) + ret = _set_tz(ret, tz, dtype) elif dtype == "timedelta64": ret = np.asarray(ret, dtype="m8[ns]") @@ -4963,7 +4964,9 @@ def _get_tz(tz: tzinfo) -> str | tzinfo: return zone -def _set_tz(values: npt.NDArray[np.int64], tz: str | tzinfo | None) -> DatetimeArray: +def _set_tz( + values: npt.NDArray[np.int64], tz: str | tzinfo | None, datetime64_dtype: str +) -> DatetimeArray: """ Coerce the values to a DatetimeArray with appropriate tz. @@ -4971,11 +4974,13 @@ def _set_tz(values: npt.NDArray[np.int64], tz: str | tzinfo | None) -> DatetimeA ---------- values : ndarray[int64] tz : str, tzinfo, or None + datetime64_dtype : str, e.g. "datetime64[ns]", "datetime64[25s]" """ assert values.dtype == "i8", values.dtype # Argument "tz" to "tz_to_dtype" has incompatible type "str | tzinfo | None"; # expected "tzinfo" - dtype = tz_to_dtype(tz=tz, unit="ns") # type: ignore[arg-type] + unit, _ = np.datetime_data(datetime64_dtype) # parsing dtype: unit, count + dtype = tz_to_dtype(tz=tz, unit=unit) # type: ignore[arg-type] dta = DatetimeArray._from_sequence(values, dtype=dtype) return dta diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index 12d698a4f76a8..6daf4a24781bd 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -124,6 +124,14 @@ def read_sas( DataFrame if iterator=False and chunksize=None, else SAS7BDATReader or XportReader, file format is inferred from file extension. + See Also + -------- + read_csv : Read a comma-separated values (csv) file into a pandas DataFrame. + read_excel : Read an Excel file into a pandas DataFrame. + read_spss : Read an SPSS file into a pandas DataFrame. + read_orc : Load an ORC object into a pandas DataFrame. + read_feather : Load a feather-format object into a pandas DataFrame. + Examples -------- >>> df = pd.read_sas("sas_data.sas7bdat") # doctest: +SKIP diff --git a/pandas/io/spss.py b/pandas/io/spss.py index 2c464cc7e90c4..313ffa79cbd09 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -52,6 +52,14 @@ def read_spss( DataFrame DataFrame based on the SPSS file. + See Also + -------- + read_csv : Read a comma-separated values (csv) file into a pandas DataFrame. + read_excel : Read an Excel file into a pandas DataFrame. + read_sas : Read an SAS file into a pandas DataFrame. + read_orc : Load an ORC object into a pandas DataFrame. + read_feather : Load a feather-format object into a pandas DataFrame. 
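Backing up to the `read_csv`/`read_table` signature cleanups above: with `keep_date_col`, `date_parser`, nested `parse_dates`, and `delim_whitespace` removed, the replacements the old deprecation messages pointed to look roughly like this (file contents invented):

```python
import io
import pandas as pd

# combining several date columns: read them as plain columns, then join
csv = io.StringIO("date,time,value\n2024-01-02,03:04:05,1\n")
df = pd.read_csv(csv)
df["timestamp"] = pd.to_datetime(df["date"] + " " + df["time"])

# delim_whitespace=True becomes an explicit regex separator
ws = io.StringIO("a  b\n1  2\n")
print(pd.read_csv(ws, sep=r"\s+"))
```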
+ Examples -------- >>> df = pd.read_spss("spss_data.sav") # doctest: +SKIP diff --git a/pandas/io/sql.py b/pandas/io/sql.py index c0007c5e7d78c..c8c9fd99d0165 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -157,6 +157,7 @@ def _convert_arrays_to_dataframe( dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", ) -> DataFrame: content = lib.to_object_array_tuples(data) + idx_len = content.shape[0] arrays = convert_object_array( list(content.T), dtype=None, @@ -177,9 +178,9 @@ def _convert_arrays_to_dataframe( result_arrays.append(ArrowExtensionArray(pa_array)) arrays = result_arrays # type: ignore[assignment] if arrays: - df = DataFrame(dict(zip(range(len(columns)), arrays))) - df.columns = columns - return df + return DataFrame._from_arrays( + arrays, columns=columns, index=range(idx_len), verify_integrity=False + ) else: return DataFrame(columns=columns) @@ -1013,7 +1014,7 @@ def _execute_insert_multi(self, conn, keys: list[str], data_iter) -> int: def insert_data(self) -> tuple[list[str], list[np.ndarray]]: if self.index is not None: - temp = self.frame.copy() + temp = self.frame.copy(deep=False) temp.index.names = self.index try: temp.reset_index(inplace=True) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 47d879c022ee6..d1e57ad568ba5 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -91,7 +91,7 @@ _version_error = ( "Version of given Stata file is {version}. pandas supports importing " - "versions 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), " + "versions 103, 104, 105, 108, 110 (Stata 7), 111 (Stata 7SE), 113 (Stata 8/9), " "114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16)," "and 119 (Stata 15/16, over 32,767 variables)." ) @@ -1393,7 +1393,7 @@ def _get_seek_variable_labels(self) -> int: def _read_old_header(self, first_char: bytes) -> None: self._format_version = int(first_char[0]) - if self._format_version not in [104, 105, 108, 111, 113, 114, 115]: + if self._format_version not in [103, 104, 105, 108, 110, 111, 113, 114, 115]: raise ValueError(_version_error.format(version=self._format_version)) self._set_encoding() self._byteorder = ">" if self._read_int8() == 0x1 else "<" @@ -1405,10 +1405,11 @@ def _read_old_header(self, first_char: bytes) -> None: self._data_label = self._get_data_label() - self._time_stamp = self._get_time_stamp() + if self._format_version >= 105: + self._time_stamp = self._get_time_stamp() # descriptors - if self._format_version > 108: + if self._format_version >= 111: typlist = [int(c) for c in self._path_or_buf.read(self._nvar)] else: buf = self._path_or_buf.read(self._nvar) @@ -1599,14 +1600,13 @@ def _read_strls(self) -> None: v_o = self._read_uint64() else: buf = self._path_or_buf.read(12) - # Only tested on little endian file on little endian machine. + # Only tested on little endian machine. 
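On the `_set_tz` change in `pandas/io/pytables.py` above: the stored `datetime64` dtype string now drives the resolution instead of a hard-coded `"ns"`. `np.datetime_data` is the helper doing that split; a quick sketch:

```python
import numpy as np

# (unit, count) pairs recovered from a datetime64 dtype
assert np.datetime_data(np.dtype("datetime64[ns]")) == ("ns", 1)
assert np.datetime_data(np.dtype("datetime64[25s]")) == ("s", 25)
```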
v_size = 2 if self._format_version == 118 else 3 if self._byteorder == "<": buf = buf[0:v_size] + buf[4 : (12 - v_size)] else: - # This path may not be correct, impossible to test - buf = buf[0:v_size] + buf[(4 + v_size) :] - v_o = struct.unpack("Q", buf)[0] + buf = buf[4 - v_size : 4] + buf[(4 + v_size) :] + v_o = struct.unpack(f"{self._byteorder}Q", buf)[0] typ = self._read_uint8() length = self._read_uint32() va = self._path_or_buf.read(length) @@ -3037,6 +3037,8 @@ def __init__( if byteorder is None: byteorder = sys.byteorder self._byteorder = _set_endianness(byteorder) + # Flag whether chosen byteorder matches the system on which we're running + self._native_byteorder = self._byteorder == _set_endianness(sys.byteorder) gso_v_type = "I" # uint32 gso_o_type = "Q" # uint64 @@ -3049,13 +3051,20 @@ def __init__( o_size = 6 else: # version == 119 o_size = 5 - self._o_offet = 2 ** (8 * (8 - o_size)) + if self._native_byteorder: + self._o_offet = 2 ** (8 * (8 - o_size)) + else: + self._o_offet = 2 ** (8 * o_size) self._gso_o_type = gso_o_type self._gso_v_type = gso_v_type def _convert_key(self, key: tuple[int, int]) -> int: v, o = key - return v + self._o_offet * o + if self._native_byteorder: + return v + self._o_offet * o + else: + # v, o will be swapped when applying byteorder + return o + self._o_offet * v def generate_table(self) -> tuple[dict[str, tuple[int, int]], DataFrame]: """ @@ -3532,7 +3541,9 @@ def _convert_strls(self, data: DataFrame) -> DataFrame: ] if convert_cols: - ssw = StataStrLWriter(data, convert_cols, version=self._dta_version) + ssw = StataStrLWriter( + data, convert_cols, version=self._dta_version, byteorder=self._byteorder + ) tab, new_data = ssw.generate_table() data = new_data self._strl_blob = ssw.generate_blob(tab) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index ea5daf02b7252..0a29ab530c2fc 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -791,6 +791,21 @@ class PlotAccessor(PandasObject): If the backend is not the default matplotlib one, the return value will be the object returned by the backend. + See Also + -------- + matplotlib.pyplot.plot : Plot y versus x as lines and/or markers. + DataFrame.hist : Make a histogram. + DataFrame.boxplot : Make a box plot. + DataFrame.plot.scatter : Make a scatter plot with varying marker + point size and color. + DataFrame.plot.hexbin : Make a hexagonal binning plot of + two variables. + DataFrame.plot.kde : Make Kernel Density Estimate plot using + Gaussian kernels. + DataFrame.plot.area : Make a stacked area plot. + DataFrame.plot.bar : Make a bar plot. + DataFrame.plot.barh : Make a horizontal bar plot. + Notes ----- - See matplotlib documentation online for more on this subject @@ -967,10 +982,7 @@ def __call__(self, *args, **kwargs): f"Valid plot kinds: {self._all_kinds}" ) - # The original data structured can be transformed before passed to the - # backend. For example, for DataFrame is common to set the index as the - # `x` parameter, and return a Series with the parameter `y` as values. - data = self._parent.copy() + data = self._parent if isinstance(data, ABCSeries): kwargs["reuse_plot"] = True @@ -990,7 +1002,7 @@ def __call__(self, *args, **kwargs): if is_integer(y) and not holds_integer(data.columns): y = data.columns[y] # converted to series actually. 
copy to not modify - data = data[y].copy() + data = data[y].copy(deep=False) data.index.name = y elif isinstance(data, ABCDataFrame): data_cols = data.columns @@ -1017,8 +1029,7 @@ def __call__(self, *args, **kwargs): except (IndexError, KeyError, TypeError): pass - # don't overwrite - data = data[y].copy() + data = data[y] if isinstance(data, ABCSeries): label_name = label_kw or y @@ -1583,7 +1594,7 @@ def area( See Also -------- - DataFrame.plot : Make plots of DataFrame using matplotlib / pylab. + DataFrame.plot : Make plots of DataFrame using matplotlib. Examples -------- diff --git a/pandas/plotting/_matplotlib/__init__.py b/pandas/plotting/_matplotlib/__init__.py index 75c61da03795a..87f3ca09ad346 100644 --- a/pandas/plotting/_matplotlib/__init__.py +++ b/pandas/plotting/_matplotlib/__init__.py @@ -69,7 +69,7 @@ def plot(data, kind, **kwargs): kwargs["ax"] = getattr(ax, "left_ax", ax) plot_obj = PLOT_CLASSES[kind](data, **kwargs) plot_obj.generate() - plot_obj.draw() + plt.draw_if_interactive() return plot_obj.result diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index 2a28cd94b64e5..6bb10068bee38 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -7,7 +7,7 @@ ) import warnings -from matplotlib.artist import setp +import matplotlib as mpl import numpy as np from pandas._libs import lib @@ -274,13 +274,13 @@ def maybe_color_bp(bp, color_tup, **kwds) -> None: # GH#30346, when users specifying those arguments explicitly, our defaults # for these four kwargs should be overridden; if not, use Pandas settings if not kwds.get("boxprops"): - setp(bp["boxes"], color=color_tup[0], alpha=1) + mpl.artist.setp(bp["boxes"], color=color_tup[0], alpha=1) if not kwds.get("whiskerprops"): - setp(bp["whiskers"], color=color_tup[1], alpha=1) + mpl.artist.setp(bp["whiskers"], color=color_tup[1], alpha=1) if not kwds.get("medianprops"): - setp(bp["medians"], color=color_tup[2], alpha=1) + mpl.artist.setp(bp["medians"], color=color_tup[2], alpha=1) if not kwds.get("capprops"): - setp(bp["caps"], color=color_tup[3], alpha=1) + mpl.artist.setp(bp["caps"], color=color_tup[3], alpha=1) def _grouped_plot_by_column( @@ -311,8 +311,6 @@ def _grouped_plot_by_column( layout=layout, ) - _axes = flatten_axes(axes) - # GH 45465: move the "by" label based on "vert" xlabel, ylabel = kwargs.pop("xlabel", None), kwargs.pop("ylabel", None) if kwargs.get("vert", True): @@ -322,8 +320,7 @@ def _grouped_plot_by_column( ax_values = [] - for i, col in enumerate(columns): - ax = _axes[i] + for ax, col in zip(flatten_axes(axes), columns): gp_col = grouped[col] keys, values = zip(*gp_col) re_plotf = plotf(keys, values, ax, xlabel=xlabel, ylabel=ylabel, **kwargs) @@ -455,7 +452,7 @@ def plot_group(keys, values, ax: Axes, **kwds): if ax is None: rc = {"figure.figsize": figsize} if figsize is not None else {} - with plt.rc_context(rc): + with mpl.rc_context(rc): ax = plt.gca() data = data._get_numeric_data() naxes = len(data.columns) @@ -531,10 +528,8 @@ def boxplot_frame_groupby( figsize=figsize, layout=layout, ) - axes = flatten_axes(axes) - data = {} - for (key, group), ax in zip(grouped, axes): + for (key, group), ax in zip(grouped, flatten_axes(axes)): d = group.boxplot( ax=ax, column=column, fontsize=fontsize, rot=rot, grid=grid, **kwds ) diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index e2121526c16af..fc63d65f1e160 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ 
b/pandas/plotting/_matplotlib/converter.py @@ -14,13 +14,8 @@ ) import warnings +import matplotlib as mpl import matplotlib.dates as mdates -from matplotlib.ticker import ( - AutoLocator, - Formatter, - Locator, -) -from matplotlib.transforms import nonsingular import matplotlib.units as munits import numpy as np @@ -174,7 +169,7 @@ def axisinfo(unit, axis) -> munits.AxisInfo | None: if unit != "time": return None - majloc = AutoLocator() + majloc = mpl.ticker.AutoLocator() # pyright: ignore[reportAttributeAccessIssue] majfmt = TimeFormatter(majloc) return munits.AxisInfo(majloc=majloc, majfmt=majfmt, label="time") @@ -184,7 +179,7 @@ def default_units(x, axis) -> str: # time formatter -class TimeFormatter(Formatter): +class TimeFormatter(mpl.ticker.Formatter): # pyright: ignore[reportAttributeAccessIssue] def __init__(self, locs) -> None: self.locs = locs @@ -430,7 +425,7 @@ def __call__(self): freq = f"{interval}ms" tz = self.tz.tzname(None) st = dmin.replace(tzinfo=None) - ed = dmin.replace(tzinfo=None) + ed = dmax.replace(tzinfo=None) all_dates = date_range(start=st, end=ed, freq=freq, tz=tz).astype(object) try: @@ -561,7 +556,8 @@ def _get_periods_per_ymd(freq: BaseOffset) -> tuple[int, int, int]: return ppd, ppm, ppy -def _daily_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: +@functools.cache +def _daily_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: # error: "BaseOffset" has no attribute "_period_dtype_code" dtype_code = freq._period_dtype_code # type: ignore[attr-defined] @@ -760,7 +756,8 @@ def _second_finder(label_interval: int) -> None: return info -def _monthly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: +@functools.cache +def _monthly_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: _, _, periodsperyear = _get_periods_per_ymd(freq) vmin_orig = vmin @@ -831,7 +828,8 @@ def _monthly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: return info -def _quarterly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: +@functools.cache +def _quarterly_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: _, _, periodsperyear = _get_periods_per_ymd(freq) vmin_orig = vmin (vmin, vmax) = (int(vmin), int(vmax)) @@ -878,7 +876,8 @@ def _quarterly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: return info -def _annual_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: +@functools.cache +def _annual_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: # Note: small difference here vs other finders in adding 1 to vmax (vmin, vmax) = (int(vmin), int(vmax + 1)) span = vmax - vmin + 1 @@ -917,7 +916,7 @@ def get_finder(freq: BaseOffset): raise NotImplementedError(f"Unsupported frequency: {dtype_code}") -class TimeSeries_DateLocator(Locator): +class TimeSeries_DateLocator(mpl.ticker.Locator): # pyright: ignore[reportAttributeAccessIssue] """ Locates the ticks along an axis controlled by a :class:`Series`. @@ -998,7 +997,7 @@ def autoscale(self): if vmin == vmax: vmin -= 1 vmax += 1 - return nonsingular(vmin, vmax) + return mpl.transforms.nonsingular(vmin, vmax) # ------------------------------------------------------------------------- @@ -1006,7 +1005,7 @@ def autoscale(self): # ------------------------------------------------------------------------- -class TimeSeries_DateFormatter(Formatter): +class TimeSeries_DateFormatter(mpl.ticker.Formatter): # pyright: ignore[reportAttributeAccessIssue] """ Formats the ticks along an axis controlled by a :class:`PeriodIndex`. 
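On the `functools.cache` decorators added to the finder functions above: memoization requires every argument to be hashable, which is why the signatures are pinned to plain floats plus a `BaseOffset` (itself hashable). A minimal sketch of the mechanism with an invented `finder`:

```python
import functools

@functools.cache
def finder(vmin: float, vmax: float) -> tuple[float, float]:
    print("computing")   # executes once per distinct (vmin, vmax) pair
    return (vmin, vmax)

finder(0.0, 10.0)        # prints "computing"
finder(0.0, 10.0)        # cache hit, silent
```

One caveat inherent to `functools.cache`: the cached return value is shared across callers, so for the ndarray-returning finders the result must be treated as read-only; whether the surrounding plotting code already guarantees that is not visible in these hunks.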
@@ -1082,7 +1081,7 @@ def __call__(self, x, pos: int | None = 0) -> str: return period.strftime(fmt) -class TimeSeries_TimedeltaFormatter(Formatter): +class TimeSeries_TimedeltaFormatter(mpl.ticker.Formatter): # pyright: ignore[reportAttributeAccessIssue] """ Formats the ticks along an axis controlled by a :class:`TimedeltaIndex`. """ diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index fffeb9b82492f..22be9baf1ff5c 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -107,9 +107,7 @@ def _color_in_style(style: str) -> bool: """ Check if there is a color letter in the style string. """ - from matplotlib.colors import BASE_COLORS - - return not set(BASE_COLORS).isdisjoint(style) + return not set(mpl.colors.BASE_COLORS).isdisjoint(style) class MPLPlot(ABC): @@ -176,8 +174,6 @@ def __init__( style=None, **kwds, ) -> None: - import matplotlib.pyplot as plt - # if users assign an empty list or tuple, raise `ValueError` # similar to current `df.box` and `df.hist` APIs. if by in ([], ()): @@ -238,7 +234,7 @@ def __init__( self.rot = self._default_rot if grid is None: - grid = False if secondary_y else plt.rcParams["axes.grid"] + grid = False if secondary_y else mpl.rcParams["axes.grid"] self.grid = grid self.legend = legend @@ -497,10 +493,6 @@ def _get_nseries(self, data: Series | DataFrame) -> int: def nseries(self) -> int: return self._get_nseries(self.data) - @final - def draw(self) -> None: - self.plt.draw_if_interactive() - @final def generate(self) -> None: self._compute_plot_data() @@ -570,6 +562,8 @@ def axes(self) -> Sequence[Axes]: @final @cache_readonly def _axes_and_fig(self) -> tuple[Sequence[Axes], Figure]: + import matplotlib.pyplot as plt + if self.subplots: naxes = ( self.nseries if isinstance(self.subplots, bool) else len(self.subplots) @@ -584,7 +578,7 @@ def _axes_and_fig(self) -> tuple[Sequence[Axes], Figure]: layout_type=self._layout_type, ) elif self.ax is None: - fig = self.plt.figure(figsize=self.figsize) + fig = plt.figure(figsize=self.figsize) axes = fig.add_subplot(111) else: fig = self.ax.get_figure() @@ -592,7 +586,7 @@ def _axes_and_fig(self) -> tuple[Sequence[Axes], Figure]: fig.set_size_inches(self.figsize) axes = self.ax - axes = flatten_axes(axes) + axes = np.fromiter(flatten_axes(axes), dtype=object) if self.logx is True or self.loglog is True: [a.set_xscale("log") for a in axes] @@ -918,13 +912,6 @@ def _get_ax_legend(ax: Axes): ax = other_ax return ax, leg - @final - @cache_readonly - def plt(self): - import matplotlib.pyplot as plt - - return plt - _need_to_set_index = False @final @@ -1219,9 +1206,9 @@ def _get_errorbars( @final def _get_subplots(self, fig: Figure) -> list[Axes]: if Version(mpl.__version__) < Version("3.8"): - from matplotlib.axes import Subplot as Klass + Klass = mpl.axes.Subplot else: - from matplotlib.axes import Axes as Klass + Klass = mpl.axes.Axes return [ ax @@ -1386,7 +1373,7 @@ def _get_c_values(self, color, color_by_categorical: bool, c_is_column: bool): if c is not None and color is not None: raise TypeError("Specify exactly one of `c` and `color`") if c is None and color is None: - c_values = self.plt.rcParams["patch.facecolor"] + c_values = mpl.rcParams["patch.facecolor"] elif color is not None: c_values = color elif color_by_categorical: @@ -1411,12 +1398,10 @@ def _get_norm_and_cmap(self, c_values, color_by_categorical: bool): cmap = None if color_by_categorical and cmap is not None: - from matplotlib import colors - n_cats = 
len(self.data[c].cat.categories) - cmap = colors.ListedColormap([cmap(i) for i in range(cmap.N)]) + cmap = mpl.colors.ListedColormap([cmap(i) for i in range(cmap.N)]) bounds = np.linspace(0, n_cats, n_cats + 1) - norm = colors.BoundaryNorm(bounds, cmap.N) + norm = mpl.colors.BoundaryNorm(bounds, cmap.N) # TODO: warn that we are ignoring self.norm if user specified it? # Doesn't happen in any tests 2023-11-09 else: @@ -1676,8 +1661,6 @@ def _update_stacker(cls, ax: Axes, stacking_id: int | None, values) -> None: ax._stacker_neg_prior[stacking_id] += values # type: ignore[attr-defined] def _post_plot_logic(self, ax: Axes, data) -> None: - from matplotlib.ticker import FixedLocator - def get_label(i): if is_float(i) and i.is_integer(): i = int(i) @@ -1691,7 +1674,7 @@ def get_label(i): xticklabels = [get_label(x) for x in xticks] # error: Argument 1 to "FixedLocator" has incompatible type "ndarray[Any, # Any]"; expected "Sequence[float]" - ax.xaxis.set_major_locator(FixedLocator(xticks)) # type: ignore[arg-type] + ax.xaxis.set_major_locator(mpl.ticker.FixedLocator(xticks)) # type: ignore[arg-type] ax.set_xticklabels(xticklabels) # If the index is an irregular time series, then by default diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index ca635386be335..2c4d714bf1a0c 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -95,11 +95,12 @@ def _adjust_bins(self, bins: int | np.ndarray | list[np.ndarray]): def _calculate_bins(self, data: Series | DataFrame, bins) -> np.ndarray: """Calculate bins given data""" nd_values = data.infer_objects()._get_numeric_data() - values = np.ravel(nd_values) + values = nd_values.values + if nd_values.ndim == 2: + values = values.reshape(-1) values = values[~isna(values)] - hist, bins = np.histogram(values, bins=bins, range=self._bin_range) - return bins + return np.histogram_bin_edges(values, bins=bins, range=self._bin_range) # error: Signature of "_plot" incompatible with supertype "LinePlot" @classmethod @@ -322,10 +323,7 @@ def _grouped_plot( naxes=naxes, figsize=figsize, sharex=sharex, sharey=sharey, ax=ax, layout=layout ) - _axes = flatten_axes(axes) - - for i, (key, group) in enumerate(grouped): - ax = _axes[i] + for ax, (key, group) in zip(flatten_axes(axes), grouped): if numeric_only and isinstance(group, ABCDataFrame): group = group._get_numeric_data() plotf(group, ax, **kwargs) @@ -557,12 +555,9 @@ def hist_frame( figsize=figsize, layout=layout, ) - _axes = flatten_axes(axes) - can_set_label = "label" not in kwds - for i, col in enumerate(data.columns): - ax = _axes[i] + for ax, col in zip(flatten_axes(axes), data.columns): if legend and can_set_label: kwds["label"] = col ax.hist(data[col].dropna().values, bins=bins, **kwds) diff --git a/pandas/plotting/_matplotlib/misc.py b/pandas/plotting/_matplotlib/misc.py index 1f9212587e05e..4a891ec27e8cb 100644 --- a/pandas/plotting/_matplotlib/misc.py +++ b/pandas/plotting/_matplotlib/misc.py @@ -3,8 +3,7 @@ import random from typing import TYPE_CHECKING -from matplotlib import patches -import matplotlib.lines as mlines +import matplotlib as mpl import numpy as np from pandas.core.dtypes.missing import notna @@ -129,7 +128,7 @@ def scatter_matrix( def _get_marker_compat(marker): - if marker not in mlines.lineMarkers: + if marker not in mpl.lines.lineMarkers: return "o" return marker @@ -190,10 +189,10 @@ def normalize(series): ) ax.legend() - ax.add_patch(patches.Circle((0.0, 0.0), radius=1.0, facecolor="none")) + 
ax.add_patch(mpl.patches.Circle((0.0, 0.0), radius=1.0, facecolor="none")) for xy, name in zip(s, df.columns): - ax.add_patch(patches.Circle(xy, radius=0.025, facecolor="gray")) + ax.add_patch(mpl.patches.Circle(xy, radius=0.025, facecolor="gray")) if xy[0] < 0.0 and xy[1] < 0.0: ax.text( diff --git a/pandas/plotting/_matplotlib/style.py b/pandas/plotting/_matplotlib/style.py index d725d53bd21ec..962f9711d9916 100644 --- a/pandas/plotting/_matplotlib/style.py +++ b/pandas/plotting/_matplotlib/style.py @@ -260,9 +260,7 @@ def _get_colors_from_color_type(color_type: str, num_colors: int) -> list[Color] def _get_default_colors(num_colors: int) -> list[Color]: """Get `num_colors` of default colors from matplotlib rc params.""" - import matplotlib.pyplot as plt - - colors = [c["color"] for c in plt.rcParams["axes.prop_cycle"]] + colors = [c["color"] for c in mpl.rcParams["axes.prop_cycle"]] return colors[0:num_colors] diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index d438f521c0dbc..d95ccad2da565 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -333,7 +333,7 @@ def format_dateaxis( default, changing the limits of the x axis will intelligently change the positions of the ticks. """ - from matplotlib import pylab + import matplotlib.pyplot as plt # handle index specific formatting # Note: DatetimeIndex does not use this @@ -365,4 +365,4 @@ def format_dateaxis( else: raise TypeError("index type not supported") - pylab.draw_if_interactive() + plt.draw_if_interactive() diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index 50cfdbd967ea7..f9c370b2486fd 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -5,8 +5,7 @@ from typing import TYPE_CHECKING import warnings -from matplotlib import ticker -import matplotlib.table +import matplotlib as mpl import numpy as np from pandas.util._exceptions import find_stack_level @@ -19,7 +18,10 @@ ) if TYPE_CHECKING: - from collections.abc import Iterable + from collections.abc import ( + Generator, + Iterable, + ) from matplotlib.axes import Axes from matplotlib.axis import Axis @@ -77,7 +79,7 @@ def table( # error: Argument "cellText" to "table" has incompatible type "ndarray[Any, # Any]"; expected "Sequence[Sequence[str]] | None" - return matplotlib.table.table( + return mpl.table.table( ax, cellText=cellText, # type: ignore[arg-type] rowLabels=rowLabels, @@ -232,7 +234,7 @@ def create_subplots( else: if is_list_like(ax): if squeeze: - ax = flatten_axes(ax) + ax = np.fromiter(flatten_axes(ax), dtype=object) if layout is not None: warnings.warn( "When passing multiple axes, layout keyword is ignored.", @@ -261,7 +263,7 @@ def create_subplots( if squeeze: return fig, ax else: - return fig, flatten_axes(ax) + return fig, np.fromiter(flatten_axes(ax), dtype=object) else: warnings.warn( "To output multiple subplots, the figure containing " @@ -327,10 +329,10 @@ def _remove_labels_from_axis(axis: Axis) -> None: # set_visible will not be effective if # minor axis has NullLocator and NullFormatter (default) - if isinstance(axis.get_minor_locator(), ticker.NullLocator): - axis.set_minor_locator(ticker.AutoLocator()) - if isinstance(axis.get_minor_formatter(), ticker.NullFormatter): - axis.set_minor_formatter(ticker.FormatStrFormatter("")) + if isinstance(axis.get_minor_locator(), mpl.ticker.NullLocator): + axis.set_minor_locator(mpl.ticker.AutoLocator()) + if 
isinstance(axis.get_minor_formatter(), mpl.ticker.NullFormatter): + axis.set_minor_formatter(mpl.ticker.FormatStrFormatter("")) for t in axis.get_minorticklabels(): t.set_visible(False) @@ -440,12 +442,13 @@ def handle_shared_axes( _remove_labels_from_axis(ax.yaxis) -def flatten_axes(axes: Axes | Iterable[Axes]) -> np.ndarray: +def flatten_axes(axes: Axes | Iterable[Axes]) -> Generator[Axes, None, None]: if not is_list_like(axes): - return np.array([axes]) + yield axes # type: ignore[misc] elif isinstance(axes, (np.ndarray, ABCIndex)): - return np.asarray(axes).ravel() - return np.array(axes) + yield from np.asarray(axes).reshape(-1) + else: + yield from axes # type: ignore[misc] def set_ticks_props( @@ -455,17 +458,15 @@ def set_ticks_props( ylabelsize: int | None = None, yrot=None, ): - import matplotlib.pyplot as plt - for ax in flatten_axes(axes): if xlabelsize is not None: - plt.setp(ax.get_xticklabels(), fontsize=xlabelsize) + mpl.artist.setp(ax.get_xticklabels(), fontsize=xlabelsize) # type: ignore[arg-type] if xrot is not None: - plt.setp(ax.get_xticklabels(), rotation=xrot) + mpl.artist.setp(ax.get_xticklabels(), rotation=xrot) # type: ignore[arg-type] if ylabelsize is not None: - plt.setp(ax.get_yticklabels(), fontsize=ylabelsize) + mpl.artist.setp(ax.get_yticklabels(), fontsize=ylabelsize) # type: ignore[arg-type] if yrot is not None: - plt.setp(ax.get_yticklabels(), rotation=yrot) + mpl.artist.setp(ax.get_yticklabels(), rotation=yrot) # type: ignore[arg-type] return axes diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index af7ddf39283c0..d8455f44ef0d1 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -33,6 +33,7 @@ def table(ax: Axes, data: DataFrame | Series, **kwargs) -> Table: Parameters ---------- ax : Matplotlib axes object + The axes on which to draw the table. data : DataFrame or Series Data for table contents. **kwargs @@ -43,6 +44,12 @@ def table(ax: Axes, data: DataFrame | Series, **kwargs) -> Table: Returns ------- matplotlib table object + The created table as a matplotlib Table object. + + See Also + -------- + DataFrame.plot : Make plots of DataFrame using matplotlib. + matplotlib.pyplot.table : Create a table from data in a Matplotlib plot. Examples -------- @@ -472,6 +479,7 @@ def parallel_coordinates( Parameters ---------- frame : DataFrame + The DataFrame to be plotted. class_column : str Column name containing class names. cols : list, optional @@ -498,6 +506,13 @@ def parallel_coordinates( Returns ------- matplotlib.axes.Axes + The matplotlib axes containing the parallel coordinates plot. + + See Also + -------- + plotting.andrews_curves : Generate a matplotlib plot for visualizing clusters + of multivariate data. + plotting.radviz : Plot a multidimensional dataset in 2D. Examples -------- @@ -591,6 +606,12 @@ def autocorrelation_plot(series: Series, ax: Axes | None = None, **kwargs) -> Ax Returns ------- matplotlib.axes.Axes + The matplotlib axes containing the autocorrelation plot. + + See Also + -------- + Series.autocorr : Compute the lag-N autocorrelation for a Series. + plotting.lag_plot : Lag plot for time series. Examples -------- @@ -617,6 +638,14 @@ class _Options(dict): the same as the plot function parameters, but is stored in a canonical format that makes it easy to breakdown into groups later. + See Also + -------- + plotting.register_matplotlib_converters : Register pandas formatters and + converters with matplotlib. + plotting.bootstrap_plot : Bootstrap plot on mean, median and mid-range statistics. 
+ plotting.autocorrelation_plot : Autocorrelation plot for time series. + plotting.lag_plot : Lag plot for time series. + Examples -------- diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 2501ca6c5e1c4..939997f44c1a9 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -63,16 +63,60 @@ def test_apply(float_frame, engine, request): @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("raw", [True, False]) -def test_apply_args(float_frame, axis, raw, engine, request): - if engine == "numba": - mark = pytest.mark.xfail(reason="numba engine doesn't support args") - request.node.add_marker(mark) +@pytest.mark.parametrize("nopython", [True, False]) +def test_apply_args(float_frame, axis, raw, engine, nopython): + engine_kwargs = {"nopython": nopython} result = float_frame.apply( - lambda x, y: x + y, axis, args=(1,), raw=raw, engine=engine + lambda x, y: x + y, + axis, + args=(1,), + raw=raw, + engine=engine, + engine_kwargs=engine_kwargs, ) expected = float_frame + 1 tm.assert_frame_equal(result, expected) + # GH:58712 + result = float_frame.apply( + lambda x, a, b: x + a + b, + args=(1,), + b=2, + raw=raw, + engine=engine, + engine_kwargs=engine_kwargs, + ) + expected = float_frame + 3 + tm.assert_frame_equal(result, expected) + + if engine == "numba": + # keyword-only arguments are not supported in numba + with pytest.raises( + pd.errors.NumbaUtilError, + match="numba does not support keyword-only arguments", + ): + float_frame.apply( + lambda x, a, *, b: x + a + b, + args=(1,), + b=2, + raw=raw, + engine=engine, + engine_kwargs=engine_kwargs, + ) + + with pytest.raises( + pd.errors.NumbaUtilError, + match="numba does not support keyword-only arguments", + ): + float_frame.apply( + lambda *x, b: x[0] + x[1] + b, + args=(1,), + b=2, + raw=raw, + engine=engine, + engine_kwargs=engine_kwargs, + ) + def test_apply_categorical_func(): # GH 9573 @@ -1489,7 +1533,7 @@ def test_apply_dtype(col): def test_apply_mutating(): # GH#35462 case where applied func pins a new BlockManager to a row - df = DataFrame({"a": range(100), "b": range(100, 200)}) + df = DataFrame({"a": range(10), "b": range(10, 20)}) df_orig = df.copy() def func(row): diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index b5ad1094f5bf5..3137d3ff50954 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -118,15 +118,15 @@ def test_dict_nested_renaming_depr(method): def test_missing_column(method, func): # GH 40004 obj = DataFrame({"A": [1]}) - match = re.escape("Column(s) ['B'] do not exist") - with pytest.raises(KeyError, match=match): + msg = r"Label\(s\) \['B'\] do not exist" + with pytest.raises(KeyError, match=msg): getattr(obj, method)(func) def test_transform_mixed_column_name_dtypes(): # GH39025 df = DataFrame({"a": ["1"]}) - msg = r"Column\(s\) \[1, 'b'\] do not exist" + msg = r"Label\(s\) \[1, 'b'\] do not exist" with pytest.raises(KeyError, match=msg): df.transform({"a": int, 1: str, "b": int}) @@ -359,3 +359,15 @@ def test_transform_reducer_raises(all_reductions, frame_or_series, op_wrapper): msg = "Function did not transform" with pytest.raises(ValueError, match=msg): obj.transform(op) + + +def test_transform_missing_labels_raises(): + # GH 58474 + df = DataFrame({"foo": [2, 4, 6], "bar": [1, 2, 3]}, index=["A", "B", "C"]) + msg = r"Label\(s\) \['A', 'B'\] do not exist" + with pytest.raises(KeyError, match=msg): + 
df.transform({"A": lambda x: x + 2, "B": lambda x: x * 2}, axis=0) + + msg = r"Label\(s\) \['bar', 'foo'\] do not exist" + with pytest.raises(KeyError, match=msg): + df.transform({"foo": lambda x: x + 2, "bar": lambda x: x * 2}, axis=1) diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index 50cf0f0ed3e84..e224b07a1097b 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas.compat import WASM + from pandas.core.dtypes.common import is_number from pandas import ( @@ -28,7 +30,7 @@ ], ) @pytest.mark.parametrize("how", ["agg", "apply"]) -def test_apply_with_string_funcs(request, float_frame, func, kwds, how): +def test_apply_with_string_funcs(float_frame, func, kwds, how): result = getattr(float_frame, how)(func, **kwds) expected = getattr(float_frame, func)(**kwds) tm.assert_series_equal(result, expected) @@ -54,6 +56,7 @@ def test_apply_np_reducer(op, how): tm.assert_series_equal(result, expected) +@pytest.mark.skipif(WASM, reason="No fp exception support in wasm") @pytest.mark.parametrize( "op", ["abs", "ceil", "cos", "cumsum", "exp", "log", "sqrt", "square"] ) @@ -284,7 +287,7 @@ def test_transform_groupby_kernel_frame(request, float_frame, op): # same thing, but ensuring we have multiple blocks assert "E" not in float_frame.columns float_frame["E"] = float_frame["A"].copy() - assert len(float_frame._mgr.arrays) > 1 + assert len(float_frame._mgr.blocks) > 1 ones = np.ones(float_frame.shape[0]) gb2 = float_frame.groupby(ones) diff --git a/pandas/tests/arithmetic/common.py b/pandas/tests/arithmetic/common.py index d7a8b0510b50f..0730729e2fd94 100644 --- a/pandas/tests/arithmetic/common.py +++ b/pandas/tests/arithmetic/common.py @@ -20,13 +20,16 @@ def assert_cannot_add(left, right, msg="cannot add"): """ - Helper to assert that left and right cannot be added. + Helper function to assert that two objects cannot be added. Parameters ---------- left : object + The first operand. right : object + The second operand. msg : str, default "cannot add" + The error message expected in the TypeError. """ with pytest.raises(TypeError, match=msg): left + right @@ -36,13 +39,17 @@ def assert_cannot_add(left, right, msg="cannot add"): def assert_invalid_addsub_type(left, right, msg=None): """ - Helper to assert that left and right can be neither added nor subtracted. + Helper function to assert that two objects can + neither be added nor subtracted. Parameters ---------- left : object + The first operand. right : object + The second operand. msg : str or None, default None + The error message expected in the TypeError. 
""" with pytest.raises(TypeError, match=msg): left + right diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index da36b61d465a1..f9807310460b4 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -1080,7 +1080,7 @@ def test_dt64arr_add_dtlike_raises(self, tz_naive_fixture, box_with_array): @pytest.mark.parametrize("freq", ["h", "D", "W", "2ME", "MS", "QE", "B", None]) @pytest.mark.parametrize("dtype", [None, "uint8"]) def test_dt64arr_addsub_intlike( - self, request, dtype, index_or_series_or_array, freq, tz_naive_fixture + self, dtype, index_or_series_or_array, freq, tz_naive_fixture ): # GH#19959, GH#19123, GH#19012 # GH#55860 use index_or_series_or_array instead of box_with_array diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index 18f1993c198df..539df9d61a7b2 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -1361,7 +1361,12 @@ def test_period_add_timestamp_raises(self, box_with_array): arr + ts with pytest.raises(TypeError, match=msg): ts + arr - msg = "cannot add PeriodArray and DatetimeArray" + if box_with_array is pd.DataFrame: + # TODO: before implementing resolution-inference we got the same + # message with DataFrame and non-DataFrame. Why did that change? + msg = "cannot add PeriodArray and Timestamp" + else: + msg = "cannot add PeriodArray and DatetimeArray" with pytest.raises(TypeError, match=msg): arr + Series([ts]) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 0ecb8f9bef468..4583155502374 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -8,6 +8,7 @@ import numpy as np import pytest +from pandas.compat import WASM from pandas.errors import OutOfBoundsDatetime import pandas as pd @@ -1741,6 +1742,7 @@ def test_td64_div_object_mixed_result(self, box_with_array): # ------------------------------------------------------------------ # __floordiv__, __rfloordiv__ + @pytest.mark.skipif(WASM, reason="no fp exception support in wasm") def test_td64arr_floordiv_td64arr_with_nat(self, box_with_array): # GH#35529 box = box_with_array diff --git a/pandas/tests/arrays/categorical/test_map.py b/pandas/tests/arrays/categorical/test_map.py index 763ca9180e53a..585b207c9b241 100644 --- a/pandas/tests/arrays/categorical/test_map.py +++ b/pandas/tests/arrays/categorical/test_map.py @@ -134,16 +134,3 @@ def test_map_with_dict_or_series(na_action): result = cat.map(mapper, na_action=na_action) # Order of categories in result can be different tm.assert_categorical_equal(result, expected) - - -def test_map_na_action_no_default_deprecated(): - # GH51645 - cat = Categorical(["a", "b", "c"]) - msg = ( - "The default value of 'ignore' for the `na_action` parameter in " - "pandas.Categorical.map is deprecated and will be " - "changed to 'None' in a future version. 
Please set na_action to the " - "desired value to avoid seeing this warning" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - cat.map(lambda x: x) diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index 9d4b78ce9944e..e3cb9664e19f2 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -121,7 +121,7 @@ def test_compare_categorical_with_missing(self, a1, a2, categories): @pytest.mark.parametrize( "na_value, dtype", [ - (pd.NaT, "datetime64[ns]"), + (pd.NaT, "datetime64[s]"), (None, "float64"), (np.nan, "float64"), (pd.NA, "float64"), diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 857509e18fa8e..97d57163ed079 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -125,7 +125,7 @@ def test_dt64_array(dtype_unit): ( pd.DatetimeIndex(["2000", "2001"]), None, - DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"), + DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[s]"), ), ( ["2000", "2001"], @@ -301,11 +301,11 @@ def test_array_copy(): # datetime ( [pd.Timestamp("2000"), pd.Timestamp("2001")], - DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"), + DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[s]"), ), ( [datetime.datetime(2000, 1, 1), datetime.datetime(2001, 1, 1)], - DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"), + DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[us]"), ), ( np.array([1, 2], dtype="M8[ns]"), @@ -321,7 +321,7 @@ def test_array_copy(): ( [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2001", tz="CET")], DatetimeArray._from_sequence( - ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET", unit="ns") + ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET", unit="s") ), ), ( @@ -330,7 +330,7 @@ def test_array_copy(): datetime.datetime(2001, 1, 1, tzinfo=cet), ], DatetimeArray._from_sequence( - ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz=cet, unit="ns") + ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz=cet, unit="us") ), ), # timedelta diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 8650be62ae7eb..63d60c78da482 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -764,29 +764,14 @@ def test_iter_zoneinfo_fold(self, tz): assert left.utcoffset() == right2.utcoffset() @pytest.mark.parametrize( - "freq, freq_depr", - [ - ("2ME", "2M"), - ("2SME", "2SM"), - ("2SME", "2sm"), - ("2QE", "2Q"), - ("2QE-SEP", "2Q-SEP"), - ("1YE", "1Y"), - ("2YE-MAR", "2Y-MAR"), - ("2ME", "2m"), - ("2QE-SEP", "2q-sep"), - ("2YE", "2y"), - ], + "freq", + ["2M", "2SM", "2sm", "2Q", "2Q-SEP", "1Y", "2Y-MAR", "2m", "2q-sep", "2y"], ) - def test_date_range_frequency_M_Q_Y_A_deprecated(self, freq, freq_depr): - # GH#9586, GH#54275 - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." 
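The replacement test below captures the enforcement half of this change: offset aliases such as "2M" or "2q-sep", which merely warned in pandas 2.x, now raise outright. A quick sketch of the behavior the new test asserts (assuming the pandas 3.0 semantics this diff enforces):

import pandas as pd

pd.date_range("1/1/2000", periods=4, freq="2ME")  # current month-end alias

try:
    pd.date_range("1/1/2000", periods=4, freq="2M")  # removed 2.x alias
except ValueError as err:
    print(err)  # per the test expectation: "Invalid frequency: 2M"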
+ def test_date_range_frequency_M_Q_Y_raises(self, freq): + msg = f"Invalid frequency: {freq}" - expected = pd.date_range("1/1/2000", periods=4, freq=freq) - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - result = pd.date_range("1/1/2000", periods=4, freq=freq_depr) - tm.assert_index_equal(result, expected) + with pytest.raises(ValueError, match=msg): + pd.date_range("1/1/2000", periods=4, freq=freq) @pytest.mark.parametrize("freq_depr", ["2H", "2CBH", "2MIN", "2S", "2mS", "2Us"]) def test_date_range_uppercase_frequency_deprecated(self, freq_depr): @@ -800,7 +785,7 @@ def test_date_range_uppercase_frequency_deprecated(self, freq_depr): tm.assert_index_equal(result, expected) @pytest.mark.parametrize( - "freq_depr", + "freq", [ "2ye-mar", "2ys", @@ -811,17 +796,21 @@ def test_date_range_uppercase_frequency_deprecated(self, freq_depr): "2bms", "2cbme", "2me", - "2w", ], ) - def test_date_range_lowercase_frequency_deprecated(self, freq_depr): + def test_date_range_lowercase_frequency_raises(self, freq): + msg = f"Invalid frequency: {freq}" + + with pytest.raises(ValueError, match=msg): + pd.date_range("1/1/2000", periods=4, freq=freq) + + def test_date_range_lowercase_frequency_deprecated(self): # GH#9586, GH#54939 - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " - f"future version, please use '{freq_depr.upper()[1:]}' instead." + depr_msg = "'w' is deprecated and will be removed in a future version" - expected = pd.date_range("1/1/2000", periods=4, freq=freq_depr.upper()) + expected = pd.date_range("1/1/2000", periods=4, freq="2W") with tm.assert_produces_warning(FutureWarning, match=depr_msg): - result = pd.date_range("1/1/2000", periods=4, freq=freq_depr) + result = pd.date_range("1/1/2000", periods=4, freq="2w") tm.assert_index_equal(result, expected) @pytest.mark.parametrize("freq", ["1A", "2A-MAR", "2a-mar"]) diff --git a/pandas/tests/base/test_constructors.py b/pandas/tests/base/test_constructors.py index f3ac60f672ee1..c4b02423f8cf0 100644 --- a/pandas/tests/base/test_constructors.py +++ b/pandas/tests/base/test_constructors.py @@ -146,10 +146,12 @@ def test_constructor_datetime_outofbound( # No dtype specified (dtype inference) # datetime64[non-ns] raise error, other cases result in object dtype # and preserve original data - if a.dtype.kind == "M": + result = constructor(a) + if a.dtype.kind == "M" or isinstance(a[0], np.datetime64): # Can't fit in nanosecond bounds -> get the nearest supported unit - result = constructor(a) assert result.dtype == "M8[s]" + elif isinstance(a[0], datetime): + assert result.dtype == "M8[us]", result.dtype else: result = constructor(a) if using_infer_string and "object-string" in request.node.callspec.id: diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index ad35742a7b337..dd6bf3c7521f8 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -270,7 +270,7 @@ def test_numpy_array_all_dtypes(any_numpy_dtype): ), ], ) -def test_array(arr, attr, index_or_series, request): +def test_array(arr, attr, index_or_series): box = index_or_series result = box(arr, copy=False).array @@ -383,7 +383,7 @@ def test_to_numpy_copy(arr, as_series, using_infer_string): @pytest.mark.parametrize("as_series", [True, False]) -def test_to_numpy_dtype(as_series, unit): +def test_to_numpy_dtype(as_series): tz = "US/Eastern" obj = pd.DatetimeIndex(["2000", "2001"], tz=tz) if as_series: @@ -412,7 +412,7 @@ def test_to_numpy_dtype(as_series, unit): 
[Timestamp("2000"), Timestamp("2000"), pd.NaT], None, Timestamp("2000"), - [np.datetime64("2000-01-01T00:00:00.000000000")] * 3, + [np.datetime64("2000-01-01T00:00:00", "s")] * 3, ), ], ) @@ -454,7 +454,7 @@ def test_to_numpy_na_value_numpy_dtype( [(0, Timestamp("2021")), (0, Timestamp("2022")), (1, Timestamp("2000"))], None, Timestamp("2000"), - [np.datetime64("2000-01-01T00:00:00.000000000")] * 3, + [np.datetime64("2000-01-01T00:00:00", "s")] * 3, ), ], ) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index ac40e48f3d523..c72abfeb9f3e7 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -47,11 +47,6 @@ def test_value_counts(index_or_series_obj): # i.e IntegerDtype expected = expected.astype("Int64") - # TODO(GH#32514): Order of entries with the same count is inconsistent - # on CI (gh-32449) - if obj.duplicated().any(): - result = result.sort_index() - expected = expected.sort_index() tm.assert_series_equal(result, expected) @@ -89,11 +84,6 @@ def test_value_counts_null(null_obj, index_or_series_obj): expected.index.name = obj.name result = obj.value_counts() - if obj.duplicated().any(): - # TODO(GH#32514): - # Order of entries with the same count is inconsistent on CI (gh-32449) - expected = expected.sort_index() - result = result.sort_index() if not isinstance(result.dtype, np.dtype): if getattr(obj.dtype, "storage", "") == "pyarrow": @@ -106,11 +96,8 @@ def test_value_counts_null(null_obj, index_or_series_obj): expected[null_obj] = 3 result = obj.value_counts(dropna=False) - if obj.duplicated().any(): - # TODO(GH#32514): - # Order of entries with the same count is inconsistent on CI (gh-32449) - expected = expected.sort_index() - result = result.sort_index() + expected = expected.sort_index() + result = result.sort_index() tm.assert_series_equal(result, expected) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index d8e5908b0c58f..d52f33fe80434 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -1984,9 +1984,8 @@ def test_set_inplace(): tm.assert_series_equal(result_view["A"], expected) -class TestValidate: - @pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0]) - def test_validate_bool_args(self, value): - msg = 'For argument "inplace" expected type bool, received type' - with pytest.raises(ValueError, match=msg): - pd.eval("2+2", inplace=value) +@pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0]) +def test_validate_bool_args(value): + msg = 'For argument "inplace" expected type bool, received type' + with pytest.raises(ValueError, match=msg): + pd.eval("2+2", inplace=value) diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index 2d959bb16e7d5..d1e4104e16465 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -6,7 +6,6 @@ from pandas.compat.pyarrow import pa_version_under12p0 import pandas.util._test_decorators as td -import pandas as pd from pandas import ( DataFrame, Series, @@ -79,8 +78,7 @@ def test_astype_different_target_dtype(dtype): def test_astype_numpy_to_ea(): ser = Series([1, 2, 3]) - with pd.option_context("mode.copy_on_write", True): - result = ser.astype("Int64") + result = ser.astype("Int64") assert np.shares_memory(get_array(ser), get_array(result)) diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index 
bc931b53b37d0..eb5177e393936 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -228,12 +228,12 @@ def test_dataframe_from_series_or_index_different_dtype(index_or_series): assert df._mgr._has_no_reference(0) -def test_dataframe_from_series_infer_datetime(): +def test_dataframe_from_series_dont_infer_datetime(): ser = Series([Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype=object) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - df = DataFrame(ser) - assert not np.shares_memory(get_array(ser), get_array(df, 0)) - assert df._mgr._has_no_reference(0) + df = DataFrame(ser) + assert df.dtypes.iloc[0] == np.dtype(object) + assert np.shares_memory(get_array(ser), get_array(df, 0)) + assert not df._mgr._has_no_reference(0) @pytest.mark.parametrize("index", [None, [0, 1, 2]]) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index f4282c9c7ac3a..db18cd4aef14e 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -830,7 +830,11 @@ def test_maybe_convert_objects_datetime_overflow_safe(self, dtype): out = lib.maybe_convert_objects(arr, convert_non_numeric=True) # no OutOfBoundsDatetime/OutOfBoundsTimedeltas - tm.assert_numpy_array_equal(out, arr) + if dtype == "datetime64[ns]": + expected = np.array(["2363-10-04"], dtype="M8[us]") + else: + expected = arr + tm.assert_numpy_array_equal(out, expected) def test_maybe_convert_objects_mixed_datetimes(self): ts = Timestamp("now") diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 2bfe801c48a77..e924e38ee5030 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -30,8 +30,9 @@ def test_astype_object_frame(self, all_data): blk = result._mgr.blocks[0] assert isinstance(blk, NumpyBlock), type(blk) assert blk.is_object - assert isinstance(result._mgr.arrays[0], np.ndarray) - assert result._mgr.arrays[0].dtype == np.dtype(object) + arr = result._mgr.blocks[0].values + assert isinstance(arr, np.ndarray) + assert arr.dtype == np.dtype(object) # check that we can compare the dtypes comp = result.dtypes == df.dtypes diff --git a/pandas/tests/extension/base/constructors.py b/pandas/tests/extension/base/constructors.py index c32a6a6a115ac..639dc874c9fb9 100644 --- a/pandas/tests/extension/base/constructors.py +++ b/pandas/tests/extension/base/constructors.py @@ -69,7 +69,7 @@ def test_dataframe_constructor_from_dict(self, data, from_series): assert result.shape == (len(data), 1) if hasattr(result._mgr, "blocks"): assert isinstance(result._mgr.blocks[0], EABackedBlock) - assert isinstance(result._mgr.arrays[0], ExtensionArray) + assert isinstance(result._mgr.blocks[0].values, ExtensionArray) def test_dataframe_from_series(self, data): result = pd.DataFrame(pd.Series(data)) @@ -77,7 +77,7 @@ def test_dataframe_from_series(self, data): assert result.shape == (len(data), 1) if hasattr(result._mgr, "blocks"): assert isinstance(result._mgr.blocks[0], EABackedBlock) - assert isinstance(result._mgr.arrays[0], ExtensionArray) + assert isinstance(result._mgr.blocks[0].values, ExtensionArray) def test_series_given_mismatched_index_raises(self, data): msg = r"Length of values \(3\) does not match length of index \(5\)" diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 935edce32a0ab..3fa2f50bf4930 100644 --- a/pandas/tests/extension/base/getitem.py +++ 
b/pandas/tests/extension/base/getitem.py @@ -450,7 +450,7 @@ def test_loc_len1(self, data): df = pd.DataFrame({"A": data}) res = df.loc[[0], "A"] assert res.ndim == 1 - assert res._mgr.arrays[0].ndim == 1 + assert res._mgr.blocks[0].ndim == 1 if hasattr(res._mgr, "blocks"): assert res._mgr._block.ndim == 1 diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 225a3301b8b8c..b951d4c35d208 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -299,6 +299,20 @@ def test_factorize_empty(self, data): tm.assert_numpy_array_equal(codes, expected_codes) tm.assert_extension_array_equal(uniques, expected_uniques) + def test_fillna_limit_frame(self, data_missing): + # GH#58001 + df = pd.DataFrame({"A": data_missing.take([0, 1, 0, 1])}) + expected = pd.DataFrame({"A": data_missing.take([1, 1, 0, 1])}) + result = df.fillna(value=data_missing[1], limit=1) + tm.assert_frame_equal(result, expected) + + def test_fillna_limit_series(self, data_missing): + # GH#58001 + ser = pd.Series(data_missing.take([0, 1, 0, 1])) + expected = pd.Series(data_missing.take([1, 1, 0, 1])) + result = ser.fillna(value=data_missing[1], limit=1) + tm.assert_series_equal(result, expected) + def test_fillna_copy_frame(self, data_missing): arr = data_missing.take([1, 1]) df = pd.DataFrame({"A": arr}) diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 489cd15644d04..24be94443c5ba 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -29,7 +29,7 @@ def test_concat(self, data, in_frame): assert dtype == data.dtype if hasattr(result._mgr, "blocks"): assert isinstance(result._mgr.blocks[0], EABackedBlock) - assert isinstance(result._mgr.arrays[0], ExtensionArray) + assert isinstance(result._mgr.blocks[0].values, ExtensionArray) @pytest.mark.parametrize("in_frame", [True, False]) def test_concat_all_na_block(self, data_missing, in_frame): diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index 3fb2fc09eaa79..a455b21b9932a 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -202,6 +202,22 @@ def test_setitem_integer_array(self, data, idx, box_in_series): arr[idx] = arr[0] tm.assert_equal(arr, expected) + @pytest.mark.parametrize( + "idx", + [[0, 0, 1], pd.array([0, 0, 1], dtype="Int64"), np.array([0, 0, 1])], + ids=["list", "integer-array", "numpy-array"], + ) + def test_setitem_integer_array_with_repeats(self, data, idx, box_in_series): + arr = data[:5].copy() + expected = data.take([2, 3, 2, 3, 4]) + + if box_in_series: + arr = pd.Series(arr) + expected = pd.Series(expected) + + arr[idx] = [arr[2], arr[2], arr[3]] + tm.assert_equal(arr, expected) + @pytest.mark.parametrize( "idx, box_in_series", [ diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 504bafc145108..6f18761f77138 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -152,6 +152,22 @@ def test_fillna_with_none(self, data_missing): with pytest.raises(TypeError, match=msg): super().test_fillna_with_none(data_missing) + def test_fillna_limit_frame(self, data_missing): + # GH#58001 + msg = "ExtensionArray.fillna added a 'copy' keyword" + with tm.assert_produces_warning( + DeprecationWarning, match=msg, check_stacklevel=False + ): + super().test_fillna_limit_frame(data_missing) 
+ + def test_fillna_limit_series(self, data_missing): + # GH#58001 + msg = "ExtensionArray.fillna added a 'copy' keyword" + with tm.assert_produces_warning( + DeprecationWarning, match=msg, check_stacklevel=False + ): + super().test_fillna_limit_series(data_missing) + @pytest.mark.parametrize("dropna", [True, False]) def test_value_counts(self, all_data, dropna): all_data = all_data[:10]
diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 22ac9627f6cda..4bc9562f1895d 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -156,6 +156,16 @@ def test_fillna_with_none(self, data_missing): with pytest.raises(AssertionError): super().test_fillna_with_none(data_missing) + @pytest.mark.xfail(reason="fill value is a dictionary, takes incorrect code path") + def test_fillna_limit_frame(self, data_missing): + # GH#58001 + super().test_fillna_limit_frame(data_missing) + + @pytest.mark.xfail(reason="fill value is a dictionary, takes incorrect code path") + def test_fillna_limit_series(self, data_missing): + # GH#58001 + super().test_fillna_limit_series(data_missing) + @pytest.mark.parametrize( "limit_area, input_ilocs, expected_ilocs", [
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 7d31fe6085c3a..f2e9d2321f33e 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2905,6 +2905,31 @@ def test_dt_components(): tm.assert_frame_equal(result, expected) +def test_dt_components_large_values(): + ser = pd.Series( + [ + pd.Timedelta("365 days 23:59:59.999000"), + None, + ], + dtype=ArrowDtype(pa.duration("ns")), + ) + result = ser.dt.components + expected = pd.DataFrame( + [[365, 23, 59, 59, 999, 0, 0], [None, None, None, None, None, None, None]], + columns=[ + "days", + "hours", + "minutes", + "seconds", + "milliseconds", + "microseconds", + "nanoseconds", + ], + dtype="int32[pyarrow]", + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("skipna", [True, False]) def test_boolean_reduce_series_all_null(all_boolean_reductions, skipna): # GH51624 @@ -3445,7 +3470,7 @@ def test_arrow_floor_division_large_divisor(dtype): def test_string_to_datetime_parsing_cast(): # GH 56266 string_dates = ["2020-01-01 04:30:00", "2020-01-02 00:00:00", "2020-01-03 00:00:00"] - result = pd.Series(string_dates, dtype="timestamp[ns][pyarrow]") + result = pd.Series(string_dates, dtype="timestamp[s][pyarrow]") expected = pd.Series( ArrowExtensionArray(pa.array(pd.to_datetime(string_dates), from_pandas=True)) )
diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 6900d6d67f9d9..ec979ac6d22dc 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -84,6 +84,16 @@ class TestIntervalArray(base.ExtensionTests): def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: return op_name in ["min", "max"] + def test_fillna_limit_frame(self, data_missing): + # GH#58001 + with pytest.raises(ValueError, match="limit must be None"): + super().test_fillna_limit_frame(data_missing) + + def test_fillna_limit_series(self, data_missing): + # GH#58001 + with pytest.raises(ValueError, match="limit must be None"): + super().test_fillna_limit_series(data_missing) + @pytest.mark.xfail( reason="Raises with incorrect message bc it disallows *all* listlikes " "instead of just wrong-length listlikes" )
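The overrides above and below all exercise the new GH#58001 base tests, which pin down the limit contract of fillna: at most limit missing slots per column are filled, scanning in order. A minimal illustration with a plain float Series, mirroring data_missing.take([0, 1, 0, 1]) -> take([1, 1, 0, 1]):

import pandas as pd

ser = pd.Series([None, 1.0, None, 1.0])
print(ser.fillna(value=1.0, limit=1).tolist())
# [1.0, 1.0, nan, 1.0] -- only the first missing slot is filled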
diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index ca79c13ed44e4..79cfb736941d6 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -205,6 +205,18 @@ def test_shift_fill_value(self, data): # np.array shape inference. Shift implementation fails. super().test_shift_fill_value(data) + @skip_nested + def test_fillna_limit_frame(self, data_missing): + # GH#58001 + # The "scalar" for this array isn't a scalar. + super().test_fillna_limit_frame(data_missing) + + @skip_nested + def test_fillna_limit_series(self, data_missing): + # GH#58001 + # The "scalar" for this array isn't a scalar. + super().test_fillna_limit_series(data_missing) + @skip_nested def test_fillna_copy_frame(self, data_missing): # The "scalar" for this array isn't a scalar.
diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 5595a9ca44d05..56c023d99bb1c 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -264,6 +264,16 @@ def test_fillna_frame(self, data_missing): tm.assert_frame_equal(result, expected) + def test_fillna_limit_frame(self, data_missing): + # GH#58001 + with pytest.raises(ValueError, match="limit must be None"): + super().test_fillna_limit_frame(data_missing) + + def test_fillna_limit_series(self, data_missing): + # GH#58001 + with pytest.raises(ValueError, match="limit must be None"): + super().test_fillna_limit_series(data_missing) + _combine_le_expected_dtype = "Sparse[bool]" def test_fillna_copy_frame(self, data_missing):
diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index e07024b2e2a09..8da7ac635f293 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -17,9 +17,9 @@ def datetime_frame() -> DataFrame: Columns are ['A', 'B', 'C', 'D'] """ return DataFrame( - np.random.default_rng(2).standard_normal((100, 4)), + np.random.default_rng(2).standard_normal((10, 4)), columns=Index(list("ABCD"), dtype=object), - index=date_range("2000-01-01", periods=100, freq="B"), + index=date_range("2000-01-01", periods=10, freq="B"), )
diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 66fc234e79b4d..35e143fcedf7b 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -39,7 +39,7 @@ def test_from_records_with_datetimes(self): expected = DataFrame({"EXPIRY": [datetime(2005, 3, 1, 0, 0), None]}) arrdata = [np.array([datetime(2005, 3, 1, 0, 0), None])] - dtypes = [("EXPIRY", " d indexer_obj = indexer.astype(object) @@ -724,6 +724,14 @@ def test_getitem_setitem_boolean_multi(self): expected.loc[[0, 2], [1]] = 5 tm.assert_frame_equal(df, expected) + def test_getitem_float_label_positional(self): + # GH 53338 + index = Index([1.5, 2]) + df = DataFrame(range(2), index=index) + result = df[1:2] + expected = DataFrame([1], index=[2.0]) + tm.assert_frame_equal(result, expected) + def test_getitem_setitem_float_labels(self): index = Index([1.5, 2, 3, 4, 5]) df = DataFrame(np.random.default_rng(2).standard_normal((5, 5)), index=index) @@ -748,12 +756,6 @@ def test_getitem_setitem_float_labels(self): expected = df.iloc[0:2] tm.assert_frame_equal(result, expected) - expected = df.iloc[0:2] - msg = r"The behavior of obj\[i:j\] with a float-dtype index" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df[1:2] - tm.assert_frame_equal(result, expected) - # #2727 index = Index([1.0, 2.5, 3.5,
4.5, 5.0]) df = DataFrame(np.random.default_rng(2).standard_normal((5, 5)), index=index) @@ -1019,13 +1021,13 @@ def test_single_element_ix_dont_upcast(self, float_frame): result = df.loc[[0], "b"] tm.assert_series_equal(result, expected) - def test_iloc_callable_tuple_return_value(self): - # GH53769 + def test_iloc_callable_tuple_return_value_raises(self): + # GH53769: Enforced pandas 3.0 df = DataFrame(np.arange(40).reshape(10, 4), index=range(0, 20, 2)) - msg = "callable with iloc" - with tm.assert_produces_warning(FutureWarning, match=msg): + msg = "Returning a tuple from" + with pytest.raises(ValueError, match=msg): df.iloc[lambda _: (0,)] - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(ValueError, match=msg): df.iloc[lambda _: (0,)] = 1 def test_iloc_row(self): diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index ed81e8c8b8129..15cdc6566b570 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -162,7 +162,7 @@ def test_setitem_dt64_index_empty_columns(self): def test_setitem_timestamp_empty_columns(self): # GH#19843 df = DataFrame(index=range(3)) - df["now"] = Timestamp("20130101", tz="UTC").as_unit("ns") + df["now"] = Timestamp("20130101", tz="UTC") expected = DataFrame( [[Timestamp("20130101", tz="UTC")]] * 3, index=[0, 1, 2], columns=["now"] @@ -340,8 +340,8 @@ def test_setitem_dt64tz(self, timezone_frame): # assert that A & C are not sharing the same base (e.g. they # are copies) # Note: This does not hold with Copy on Write (because of lazy copying) - v1 = df._mgr.arrays[1] - v2 = df._mgr.arrays[2] + v1 = df._mgr.blocks[1].values + v2 = df._mgr.blocks[2].values tm.assert_extension_array_equal(v1, v2) v1base = v1._ndarray.base v2base = v2._ndarray.base @@ -782,20 +782,18 @@ def test_loc_setitem_ea_dtype(self): df.iloc[:, 0] = Series([11], dtype="Int64") tm.assert_frame_equal(df, expected) - def test_setitem_object_inferring(self): + def test_setitem_index_object_dtype_not_inferring(self): # GH#56102 idx = Index([Timestamp("2019-12-31")], dtype=object) df = DataFrame({"a": [1]}) - with tm.assert_produces_warning(FutureWarning, match="infer"): - df.loc[:, "b"] = idx - with tm.assert_produces_warning(FutureWarning, match="infer"): - df["c"] = idx + df.loc[:, "b"] = idx + df["c"] = idx expected = DataFrame( { "a": [1], - "b": Series([Timestamp("2019-12-31")], dtype="datetime64[ns]"), - "c": Series([Timestamp("2019-12-31")], dtype="datetime64[ns]"), + "b": idx, + "c": idx, } ) tm.assert_frame_equal(df, expected) @@ -840,6 +838,7 @@ def test_setitem_object_array_of_tzaware_datetimes(self, idx, expected): # object array of datetimes with a tz df["B"] = idx.to_pydatetime() result = df["B"] + expected = expected.dt.as_unit("us") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_asfreq.py b/pandas/tests/frame/methods/test_asfreq.py index fb288e19c6e82..1c3c41e2e0299 100644 --- a/pandas/tests/frame/methods/test_asfreq.py +++ b/pandas/tests/frame/methods/test_asfreq.py @@ -236,32 +236,30 @@ def test_asfreq_2ME(self, freq, freq_half): "freq, freq_depr", [ ("2ME", "2M"), + ("2ME", "2m"), ("2QE", "2Q"), ("2QE-SEP", "2Q-SEP"), ("1BQE", "1BQ"), ("2BQE-SEP", "2BQ-SEP"), - ("1YE", "1Y"), + ("2BQE-SEP", "2bq-sep"), + ("1YE", "1y"), ("2YE-MAR", "2Y-MAR"), ], ) - def test_asfreq_frequency_M_Q_Y_deprecated(self, freq, freq_depr): - # GH#9586, #55978 - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - 
f"in a future version, please use '{freq[1:]}' instead." + def test_asfreq_frequency_M_Q_Y_raises(self, freq, freq_depr): + msg = f"Invalid frequency: {freq_depr}" index = date_range("1/1/2000", periods=4, freq=f"{freq[1:]}") df = DataFrame({"s": Series([0.0, 1.0, 2.0, 3.0], index=index)}) - expected = df.asfreq(freq=freq) - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - result = df.asfreq(freq=freq_depr) - tm.assert_frame_equal(result, expected) + with pytest.raises(ValueError, match=msg): + df.asfreq(freq=freq_depr) @pytest.mark.parametrize( "freq, error_msg", [ ( "2MS", - "MS is not supported as period frequency", + "Invalid frequency: 2MS", ), ( offsets.MonthBegin(), diff --git a/pandas/tests/frame/methods/test_at_time.py b/pandas/tests/frame/methods/test_at_time.py index 1ebe9920933d1..126899826fac3 100644 --- a/pandas/tests/frame/methods/test_at_time.py +++ b/pandas/tests/frame/methods/test_at_time.py @@ -97,7 +97,7 @@ def test_at_time_raises(self, frame_or_series): def test_at_time_axis(self, axis): # issue 8839 - rng = date_range("1/1/2000", "1/5/2000", freq="5min") + rng = date_range("1/1/2000", "1/2/2000", freq="5min") ts = DataFrame(np.random.default_rng(2).standard_normal((len(rng), len(rng)))) ts.index, ts.columns = rng, rng diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 8aeab5dacd8b4..99c8ddc643fee 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -199,7 +199,7 @@ def test_combine_first_align_nan(self): # GH 7509 (not fixed) dfa = DataFrame([[pd.Timestamp("2011-01-01"), 2]], columns=["a", "b"]) dfb = DataFrame([[4], [5]], columns=["b"]) - assert dfa["a"].dtype == "datetime64[ns]" + assert dfa["a"].dtype == "datetime64[s]" assert dfa["b"].dtype == "int64" res = dfa.combine_first(dfb) @@ -208,7 +208,7 @@ def test_combine_first_align_nan(self): columns=["a", "b"], ) tm.assert_frame_equal(res, exp) - assert res["a"].dtype == "datetime64[ns]" + assert res["a"].dtype == "datetime64[s]" # TODO: this must be int64 assert res["b"].dtype == "int64" @@ -226,13 +226,13 @@ def test_combine_first_timezone(self, unit): df1 = DataFrame( columns=["UTCdatetime", "abc"], data=data1, - index=pd.date_range("20140627", periods=1), + index=pd.date_range("20140627", periods=1, unit=unit), ) data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC").as_unit(unit) df2 = DataFrame( columns=["UTCdatetime", "xyz"], data=data2, - index=pd.date_range("20140628", periods=1), + index=pd.date_range("20140628", periods=1, unit=unit), ) res = df2[["UTCdatetime"]].combine_first(df1) exp = DataFrame( @@ -244,7 +244,7 @@ def test_combine_first_timezone(self, unit): "abc": [pd.Timestamp("2010-01-01 01:01:00", tz="UTC"), pd.NaT], }, columns=["UTCdatetime", "abc"], - index=pd.date_range("20140627", periods=2, freq="D"), + index=pd.date_range("20140627", periods=2, freq="D", unit=unit), dtype=f"datetime64[{unit}, UTC]", ) assert res["UTCdatetime"].dtype == f"datetime64[{unit}, UTC]" @@ -288,18 +288,17 @@ def test_combine_first_timezone3(self, unit): exp = DataFrame({"DATE": exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) - # FIXME: parametrizing over unit breaks on non-nano - def test_combine_first_timezone4(self): + def test_combine_first_timezone4(self, unit): # different tz - dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="US/Eastern") + dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="US/Eastern", unit=unit) df1 = 
DataFrame({"DATE": dts1}) - dts2 = pd.date_range("2015-01-03", "2015-01-05") + dts2 = pd.date_range("2015-01-03", "2015-01-05", unit=unit) df2 = DataFrame({"DATE": dts2}) # if df1 doesn't have NaN, keep its dtype res = df1.combine_first(df2) tm.assert_frame_equal(res, df1) - assert res["DATE"].dtype == "datetime64[ns, US/Eastern]" + assert res["DATE"].dtype == f"datetime64[{unit}, US/Eastern]" def test_combine_first_timezone5(self, unit): dts1 = pd.date_range("2015-01-01", "2015-01-02", tz="US/Eastern", unit=unit) @@ -420,7 +419,11 @@ def test_combine_first_timestamp_bug(scalar1, scalar2, nulls_fixture): common_dtype = find_common_type([frame.dtypes["b"], other.dtypes["b"]]) - if is_dtype_equal(common_dtype, "object") or frame.dtypes["b"] == other.dtypes["b"]: + if ( + is_dtype_equal(common_dtype, "object") + or frame.dtypes["b"] == other.dtypes["b"] + or frame.dtypes["b"].kind == frame.dtypes["b"].kind == "M" + ): val = scalar1 else: val = na_value diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 53aa44f264c7a..aeaf80f285f9d 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -214,7 +214,7 @@ def test_corr_item_cache(self): df["B"] = range(10)[::-1] ser = df["A"] # populate item_cache - assert len(df._mgr.arrays) == 2 # i.e. 2 blocks + assert len(df._mgr.blocks) == 2 _ = df.corr(numeric_only=True) @@ -285,7 +285,7 @@ def test_corrwith(self, datetime_frame, dtype): b = datetime_frame.add(noise, axis=0) # make sure order does not matter - b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][10:]) + b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][len(a) // 2 :]) del b["B"] colcorr = a.corrwith(b, axis=0) @@ -301,7 +301,7 @@ def test_corrwith(self, datetime_frame, dtype): dropped = a.corrwith(b, axis=1, drop=True) assert a.index[-1] not in dropped.index - # non time-series data + def test_corrwith_non_timeseries_data(self): index = ["a", "b", "c", "d", "e"] columns = ["one", "two", "three", "four"] df1 = DataFrame( diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index e858c123e4dae..1b852343266aa 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -47,7 +47,7 @@ def test_fillna_on_column_view(self): assert np.isnan(arr[:, 0]).all() # i.e. 
we didn't create a new 49-column block - assert len(df._mgr.arrays) == 1 + assert len(df._mgr.blocks) == 1 assert np.shares_memory(df.values, arr) def test_fillna_datetime(self, datetime_frame): @@ -60,9 +60,6 @@ def test_fillna_datetime(self, datetime_frame): padded = datetime_frame.ffill() assert np.isnan(padded.loc[padded.index[:5], "A"]).all() - assert ( - padded.loc[padded.index[-5:], "A"] == padded.loc[padded.index[-5], "A"] - ).all() msg = r"missing 1 required positional argument: 'value'" with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/frame/methods/test_infer_objects.py b/pandas/tests/frame/methods/test_infer_objects.py index a824a615b5c29..c7cdcd177403b 100644 --- a/pandas/tests/frame/methods/test_infer_objects.py +++ b/pandas/tests/frame/methods/test_infer_objects.py @@ -25,7 +25,7 @@ def test_infer_objects(self): assert df["a"].dtype == "int64" assert df["b"].dtype == "float64" - assert df["c"].dtype == "M8[ns]" + assert df["c"].dtype == "M8[us]" assert df["d"].dtype == "object" expected = DataFrame( diff --git a/pandas/tests/frame/methods/test_map.py b/pandas/tests/frame/methods/test_map.py index fe9661a3edc1b..9850de14b2092 100644 --- a/pandas/tests/frame/methods/test_map.py +++ b/pandas/tests/frame/methods/test_map.py @@ -158,14 +158,15 @@ def test_map_box(): tm.assert_frame_equal(result, expected) -def test_frame_map_dont_convert_datetime64(): - df = DataFrame({"x1": [datetime(1996, 1, 1)]}) +def test_frame_map_dont_convert_datetime64(unit): + dtype = f"M8[{unit}]" + df = DataFrame({"x1": [datetime(1996, 1, 1)]}, dtype=dtype) df = df.map(lambda x: x + BDay()) df = df.map(lambda x: x + BDay()) result = df.x1.dtype - assert result == "M8[ns]" + assert result == dtype def test_map_function_runs_once(): diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 32ae4c0ff2f50..f35b77da0b547 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -710,14 +710,14 @@ def test_quantile_empty_no_columns(self, interp_method): result = df.quantile( 0.5, numeric_only=True, interpolation=interpolation, method=method ) - expected = Series([], index=[], name=0.5, dtype=np.float64) + expected = Series([], name=0.5, dtype=np.float64) expected.index.name = "captain tightpants" tm.assert_series_equal(result, expected) result = df.quantile( [0.5], numeric_only=True, interpolation=interpolation, method=method ) - expected = DataFrame([], index=[0.5], columns=[]) + expected = DataFrame([], index=[0.5]) expected.columns.name = "captain tightpants" tm.assert_frame_equal(result, expected) @@ -926,3 +926,12 @@ def test_datelike_numeric_only(self, expected_data, expected_index, axis): expected_data, name=0.5, index=Index(expected_index), dtype=np.float64 ) tm.assert_series_equal(result, expected) + + +def test_multi_quantile_numeric_only_retains_columns(): + df = DataFrame(list("abc")) + result = df.quantile([0.5, 0.7], numeric_only=True) + expected = DataFrame(index=[0.5, 0.7]) + tm.assert_frame_equal( + result, expected, check_index_type=True, check_column_type=True + ) diff --git a/pandas/tests/frame/methods/test_reindex_like.py b/pandas/tests/frame/methods/test_reindex_like.py index ce68ec28eec3d..03968dcbb6314 100644 --- a/pandas/tests/frame/methods/test_reindex_like.py +++ b/pandas/tests/frame/methods/test_reindex_like.py @@ -22,9 +22,11 @@ def test_reindex_like(self, float_frame): def test_reindex_like_methods(self, method, expected_values): df = DataFrame({"x": 
list(range(5))}) - result = df.reindex_like(df, method=method, tolerance=0) + with tm.assert_produces_warning(FutureWarning): + result = df.reindex_like(df, method=method, tolerance=0) tm.assert_frame_equal(df, result) - result = df.reindex_like(df, method=method, tolerance=[0, 0, 0, 0]) + with tm.assert_produces_warning(FutureWarning): + result = df.reindex_like(df, method=method, tolerance=[0, 0, 0, 0]) tm.assert_frame_equal(df, result) def test_reindex_like_subclass(self): diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 72c1a123eac98..4e490e9e344ba 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -320,7 +320,7 @@ def test_shift_categorical1(self, frame_or_series): def get_cat_values(ndframe): # For Series we could just do ._values; for DataFrame # we may be able to do this if we ever have 2D Categoricals - return ndframe._mgr.arrays[0] + return ndframe._mgr.blocks[0].values cat = get_cat_values(obj) @@ -560,7 +560,7 @@ def test_shift_dt64values_int_fill_deprecated(self): # same thing but not consolidated; pre-2.0 we got different behavior df3 = DataFrame({"A": ser}) df3["B"] = ser - assert len(df3._mgr.arrays) == 2 + assert len(df3._mgr.blocks) == 2 result = df3.shift(1, axis=1, fill_value=0) tm.assert_frame_equal(result, expected) @@ -621,7 +621,7 @@ def test_shift_dt64values_axis1_invalid_fill(self, vals, as_cat): # same thing but not consolidated df3 = DataFrame({"A": ser}) df3["B"] = ser - assert len(df3._mgr.arrays) == 2 + assert len(df3._mgr.blocks) == 2 result = df3.shift(-1, axis=1, fill_value="foo") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 66a35c6f486a4..44794906b8e60 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -33,7 +33,7 @@ def read_csv(self, path, **kwargs): return read_csv(path, **params) - def test_to_csv_from_csv1(self, temp_file, float_frame, datetime_frame): + def test_to_csv_from_csv1(self, temp_file, float_frame): path = str(temp_file) float_frame.iloc[:5, float_frame.columns.get_loc("A")] = np.nan @@ -42,12 +42,16 @@ def test_to_csv_from_csv1(self, temp_file, float_frame, datetime_frame): float_frame.to_csv(path, header=False) float_frame.to_csv(path, index=False) + def test_to_csv_from_csv1_datetime(self, temp_file, datetime_frame): + path = str(temp_file) # test roundtrip # freq does not roundtrip datetime_frame.index = datetime_frame.index._with_freq(None) datetime_frame.to_csv(path) recons = self.read_csv(path, parse_dates=True) - tm.assert_frame_equal(datetime_frame, recons) + expected = datetime_frame.copy() + expected.index = expected.index.as_unit("s") + tm.assert_frame_equal(expected, recons) datetime_frame.to_csv(path, index_label="index") recons = self.read_csv(path, index_col=None, parse_dates=True) @@ -59,7 +63,8 @@ def test_to_csv_from_csv1(self, temp_file, float_frame, datetime_frame): recons = self.read_csv(path, index_col=None, parse_dates=True) tm.assert_almost_equal(datetime_frame.values, recons.values) - # corner case + def test_to_csv_from_csv1_corner_case(self, temp_file): + path = str(temp_file) dm = DataFrame( { "s1": Series(range(3), index=np.arange(3, dtype=np.int64)), @@ -146,9 +151,11 @@ def test_to_csv_from_csv5(self, temp_file, timezone_frame): lambda c: to_datetime(result[c]) .dt.tz_convert("UTC") .dt.tz_convert(timezone_frame[c].dt.tz) + .dt.as_unit("ns") ) 
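# The .dt.as_unit("ns") cast added to the converter above is needed because
# datetimes parsed back from CSV are no longer assumed to be nanosecond
# resolution; the round-trip comes back at a coarser inferred unit (the tests
# above cast the round-tripped index to "s" for the same reason), so the test
# casts before comparing with the original "ns" frame. A minimal standalone
# sketch of that assumed behavior, illustration only:
import io

import pandas as pd

buf = io.StringIO("ts\n2013-01-01 00:00:00\n")
parsed = pd.read_csv(buf, parse_dates=["ts"])
print(parsed["ts"].dtype)                   # inferred resolution, e.g. datetime64[s]
print(parsed["ts"].dt.as_unit("ns").dtype)  # datetime64[ns] after the explicit cast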
result["B"] = converter("B") result["C"] = converter("C") + result["A"] = result["A"].dt.as_unit("ns") tm.assert_frame_equal(result, timezone_frame) def test_to_csv_cols_reordering(self, temp_file): @@ -230,8 +237,12 @@ def make_dtnat_arr(n, nnat=None): df = DataFrame({"a": s1, "b": s2}) df.to_csv(path, chunksize=chunksize) - recons = self.read_csv(path).apply(to_datetime) - tm.assert_frame_equal(df, recons, check_names=False) + result = self.read_csv(path).apply(to_datetime) + + expected = df[:] + expected["a"] = expected["a"].astype("M8[s]") + expected["b"] = expected["b"].astype("M8[s]") + tm.assert_frame_equal(result, expected, check_names=False) def _return_result_expected( self, @@ -349,6 +360,7 @@ def test_to_csv_nrows(self, nrows): columns=Index(list("abcd"), dtype=object), ) result, expected = self._return_result_expected(df, 1000, "dt", "s") + expected.index = expected.index.astype("M8[ns]") tm.assert_frame_equal(result, expected, check_names=False) @pytest.mark.slow @@ -378,6 +390,10 @@ def test_to_csv_idx_types(self, nrows, r_idx_type, c_idx_type, ncols): r_idx_type, c_idx_type, ) + if r_idx_type in ["dt", "p"]: + expected.index = expected.index.astype("M8[ns]") + if c_idx_type in ["dt", "p"]: + expected.columns = expected.columns.astype("M8[ns]") tm.assert_frame_equal(result, expected, check_names=False) @pytest.mark.slow @@ -562,7 +578,9 @@ def test_to_csv_multiindex(self, temp_file, float_frame, datetime_frame): recons = self.read_csv(path, index_col=[0, 1], parse_dates=True) # TODO to_csv drops column name - tm.assert_frame_equal(tsframe, recons, check_names=False) + expected = tsframe.copy() + expected.index = MultiIndex.from_arrays([old_index.as_unit("s"), new_index[1]]) + tm.assert_frame_equal(recons, expected, check_names=False) # do not load index tsframe.to_csv(path) @@ -738,7 +756,7 @@ def create_cols(name): "foo", index=df_float.index, columns=create_cols("object") ) df_dt = DataFrame( - Timestamp("20010101").as_unit("ns"), + Timestamp("20010101"), index=df_float.index, columns=create_cols("date"), ) @@ -786,9 +804,7 @@ def test_to_csv_dups_cols(self, temp_file): ) df_bool = DataFrame(True, index=df_float.index, columns=range(3)) df_object = DataFrame("foo", index=df_float.index, columns=range(3)) - df_dt = DataFrame( - Timestamp("20010101").as_unit("ns"), index=df_float.index, columns=range(3) - ) + df_dt = DataFrame(Timestamp("20010101"), index=df_float.index, columns=range(3)) df = pd.concat( [df_float, df_int, df_bool, df_object, df_dt], axis=1, ignore_index=True ) @@ -1164,20 +1180,33 @@ def test_to_csv_with_dst_transitions(self, td, temp_file): # we have to reconvert the index as we # don't parse the tz's result = read_csv(path, index_col=0) - result.index = to_datetime(result.index, utc=True).tz_convert("Europe/London") + result.index = ( + to_datetime(result.index, utc=True) + .tz_convert("Europe/London") + .as_unit("ns") + ) tm.assert_frame_equal(result, df) - def test_to_csv_with_dst_transitions_with_pickle(self, temp_file): + @pytest.mark.parametrize( + "start,end", + [ + ["2015-03-29", "2015-03-30"], + ["2015-10-25", "2015-10-26"], + ], + ) + def test_to_csv_with_dst_transitions_with_pickle(self, start, end, temp_file): # GH11619 - idx = date_range("2015-01-01", "2015-12-31", freq="h", tz="Europe/Paris") + idx = date_range(start, end, freq="h", tz="Europe/Paris") idx = idx._with_freq(None) # freq does not round-trip idx._data._freq = None # otherwise there is trouble on unpickle df = DataFrame({"values": 1, "idx": idx}, index=idx) with 
tm.ensure_clean("csv_date_format_with_dst") as path: df.to_csv(path, index=True) result = read_csv(path, index_col=0) - result.index = to_datetime(result.index, utc=True).tz_convert( - "Europe/Paris" + result.index = ( + to_datetime(result.index, utc=True) + .tz_convert("Europe/Paris") + .as_unit("ns") ) result["idx"] = to_datetime(result["idx"], utc=True).astype( "datetime64[ns, Europe/Paris]" diff --git a/pandas/tests/frame/methods/test_truncate.py b/pandas/tests/frame/methods/test_truncate.py index 12077952c2e03..f28f811148c5d 100644 --- a/pandas/tests/frame/methods/test_truncate.py +++ b/pandas/tests/frame/methods/test_truncate.py @@ -60,7 +60,7 @@ def test_truncate(self, datetime_frame, frame_or_series): truncated = ts.truncate(before=ts.index[-1] + ts.index.freq) assert len(truncated) == 0 - msg = "Truncate: 2000-01-06 00:00:00 must be after 2000-05-16 00:00:00" + msg = "Truncate: 2000-01-06 00:00:00 must be after 2000-01-11 00:00:00" with pytest.raises(ValueError, match=msg): ts.truncate( before=ts.index[-1] - ts.index.freq, after=ts.index[0] + ts.index.freq diff --git a/pandas/tests/frame/methods/test_values.py b/pandas/tests/frame/methods/test_values.py index dfece3fc7552b..2de2053bb705f 100644 --- a/pandas/tests/frame/methods/test_values.py +++ b/pandas/tests/frame/methods/test_values.py @@ -256,7 +256,7 @@ def test_private_values_dt64_multiblock(self): df = DataFrame({"A": dta[:4]}, copy=False) df["B"] = dta[4:] - assert len(df._mgr.arrays) == 2 + assert len(df._mgr.blocks) == 2 result = df._values expected = dta.reshape(2, 4).T diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index efbcf8a5cf9dc..3f0e829f66361 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -249,20 +249,19 @@ def f(dtype): with pytest.raises(ValueError, match=msg): f("M8[ns]") - def test_pickle(self, float_string_frame, timezone_frame): - empty_frame = DataFrame() - + def test_pickle_float_string_frame(self, float_string_frame): unpickled = tm.round_trip_pickle(float_string_frame) tm.assert_frame_equal(float_string_frame, unpickled) # buglet float_string_frame._mgr.ndim - # empty + def test_pickle_empty(self): + empty_frame = DataFrame() unpickled = tm.round_trip_pickle(empty_frame) repr(unpickled) - # tz frame + def test_pickle_empty_tz_frame(self, timezone_frame): unpickled = tm.round_trip_pickle(timezone_frame) tm.assert_frame_equal(timezone_frame, unpickled) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 53476c2f7ce38..da0504458cf5d 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -121,7 +121,7 @@ def test_construct_ndarray_with_nas_and_int_dtype(self): def test_construct_from_list_of_datetimes(self): df = DataFrame([datetime.now(), datetime.now()]) - assert df[0].dtype == np.dtype("M8[ns]") + assert df[0].dtype == np.dtype("M8[us]") def test_constructor_from_tzaware_datetimeindex(self): # don't cast a DatetimeIndex WITH a tz, leave as object @@ -180,24 +180,24 @@ def test_datetimelike_values_with_object_dtype(self, kind, frame_or_series): arr = arr[:, 0] obj = frame_or_series(arr, dtype=object) - assert obj._mgr.arrays[0].dtype == object - assert isinstance(obj._mgr.arrays[0].ravel()[0], scalar_type) + assert obj._mgr.blocks[0].values.dtype == object + assert isinstance(obj._mgr.blocks[0].values.ravel()[0], scalar_type) # go through a different path in internals.construction obj = 
frame_or_series(frame_or_series(arr), dtype=object) - assert obj._mgr.arrays[0].dtype == object - assert isinstance(obj._mgr.arrays[0].ravel()[0], scalar_type) + assert obj._mgr.blocks[0].values.dtype == object + assert isinstance(obj._mgr.blocks[0].values.ravel()[0], scalar_type) obj = frame_or_series(frame_or_series(arr), dtype=NumpyEADtype(object)) - assert obj._mgr.arrays[0].dtype == object - assert isinstance(obj._mgr.arrays[0].ravel()[0], scalar_type) + assert obj._mgr.blocks[0].values.dtype == object + assert isinstance(obj._mgr.blocks[0].values.ravel()[0], scalar_type) if frame_or_series is DataFrame: # other paths through internals.construction sers = [Series(x) for x in arr] obj = frame_or_series(sers, dtype=object) - assert obj._mgr.arrays[0].dtype == object - assert isinstance(obj._mgr.arrays[0].ravel()[0], scalar_type) + assert obj._mgr.blocks[0].values.dtype == object + assert isinstance(obj._mgr.blocks[0].values.ravel()[0], scalar_type) def test_series_with_name_not_matching_column(self): # GH#9232 @@ -297,7 +297,7 @@ def test_constructor_dtype_nocast_view_dataframe(self): def test_constructor_dtype_nocast_view_2d_array(self): df = DataFrame([[1, 2], [3, 4]], dtype="int64") df2 = DataFrame(df.values, dtype=df[0].dtype) - assert df2._mgr.arrays[0].flags.c_contiguous + assert df2._mgr.blocks[0].values.flags.c_contiguous @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="conversion copies") def test_1d_object_array_does_not_copy(self): @@ -862,6 +862,8 @@ def create_data(constructor): result_datetime64 = DataFrame(data_datetime64) result_datetime = DataFrame(data_datetime) + assert result_datetime.index.unit == "us" + result_datetime.index = result_datetime.index.as_unit("s") result_Timestamp = DataFrame(data_Timestamp) tm.assert_frame_equal(result_datetime64, expected) tm.assert_frame_equal(result_datetime, expected) @@ -1319,7 +1321,7 @@ def test_constructor_unequal_length_nested_list_column(self): [[Timestamp("2021-01-01")]], [{"x": Timestamp("2021-01-01")}], {"x": [Timestamp("2021-01-01")]}, - {"x": Timestamp("2021-01-01").as_unit("ns")}, + {"x": Timestamp("2021-01-01")}, ], ) def test_constructor_one_element_data_list(self, data): @@ -1887,7 +1889,7 @@ def test_constructor_with_datetimes1(self): ind = date_range(start="2000-01-01", freq="D", periods=10) datetimes = [ts.to_pydatetime() for ts in ind] datetime_s = Series(datetimes) - assert datetime_s.dtype == "M8[ns]" + assert datetime_s.dtype == "M8[us]" def test_constructor_with_datetimes2(self): # GH 2810 @@ -1898,7 +1900,7 @@ def test_constructor_with_datetimes2(self): df["dates"] = dates result = df.dtypes expected = Series( - [np.dtype("datetime64[ns]"), np.dtype("object")], + [np.dtype("datetime64[us]"), np.dtype("object")], index=["datetimes", "dates"], ) tm.assert_series_equal(result, expected) @@ -1918,7 +1920,7 @@ def test_constructor_with_datetimes3(self): df = DataFrame([{"End Date": dt}]) assert df.iat[0, 0] == dt tm.assert_series_equal( - df.dtypes, Series({"End Date": "datetime64[ns, US/Eastern]"}, dtype=object) + df.dtypes, Series({"End Date": "datetime64[us, US/Eastern]"}, dtype=object) ) def test_constructor_with_datetimes4(self): @@ -1971,7 +1973,14 @@ def test_constructor_with_datetimes6(self): def test_constructor_datetimes_with_nulls(self, arr): # gh-15869, GH#11220 result = DataFrame(arr).dtypes - expected = Series([np.dtype("datetime64[ns]")]) + unit = "ns" + if isinstance(arr, np.ndarray): + # inferred from a pydatetime object + unit = "us" + elif not any(isinstance(x, np.datetime64) for y in 
arr for x in y): + # TODO: it is not clear why this case has different behavior + unit = "s" + expected = Series([np.dtype(f"datetime64[{unit}]")]) tm.assert_series_equal(result, expected) @@ -2095,7 +2104,7 @@ def test_constructor_for_list_with_dtypes(self, using_infer_string): np.dtype("int64"), np.dtype("float64"), np.dtype("object") if not using_infer_string else "string", - np.dtype("datetime64[ns]"), + np.dtype("datetime64[us]"), np.dtype("float64"), ], index=list("abcde"), @@ -2280,7 +2289,7 @@ def test_check_dtype_empty_numeric_column(self, dtype): @pytest.mark.parametrize( "dtype", tm.STRING_DTYPES + tm.BYTES_DTYPES + tm.OBJECT_DTYPES ) - def test_check_dtype_empty_string_column(self, request, dtype): + def test_check_dtype_empty_string_column(self, dtype): # GH24386: Ensure dtypes are set correctly for an empty DataFrame. # Empty DataFrame is generated via dictionary data with non-overlapping columns. data = DataFrame({"a": [1, 2]}, columns=["b"], dtype=dtype) @@ -2398,7 +2407,7 @@ class DatetimeSubclass(datetime): pass data = DataFrame({"datetime": [DatetimeSubclass(2020, 1, 1, 1, 1)]}) - assert data.datetime.dtype == "datetime64[ns]" + assert data.datetime.dtype == "datetime64[us]" def test_with_mismatched_index_length_raises(self): # GH#33437 @@ -2484,9 +2493,9 @@ def get_base(obj): def check_views(c_only: bool = False): # Check that the underlying data behind df["c"] is still `c` # after setting with iloc. Since we don't know which entry in - # df._mgr.arrays corresponds to df["c"], we just check that exactly + # df._mgr.blocks corresponds to df["c"], we just check that exactly # one of these arrays is `c`. GH#38939 - assert sum(x is c for x in df._mgr.arrays) == 1 + assert sum(x.values is c for x in df._mgr.blocks) == 1 if c_only: # If we ever stop consolidating in setitem_with_indexer, # this will become unnecessary.
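# Recurring pattern in this file: expected dtypes move from "M8[ns]" to
# "M8[us]" or "M8[s]" because construction now infers the datetime64
# resolution from the inputs instead of defaulting to nanoseconds. A hedged
# sketch of the inference these tests assume (a pydatetime carries
# microseconds, so it maps to "us"; a Timestamp built from a plain date
# string is second resolution), illustration only:
from datetime import datetime

import pandas as pd

print(pd.Series([datetime(1996, 1, 1)]).dtype)  # expected: datetime64[us]
ts = pd.Timestamp("2013-01-01")
print(ts.unit)                                  # expected: "s"
print(pd.Series([ts]).dtype)                    # expected: datetime64[s]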
@@ -2494,17 +2503,17 @@ def check_views(c_only: bool = False): assert ( sum( - get_base(x) is a - for x in df._mgr.arrays - if isinstance(x.dtype, np.dtype) + get_base(x.values) is a + for x in df._mgr.blocks + if isinstance(x.values.dtype, np.dtype) ) == 1 ) assert ( sum( - get_base(x) is b - for x in df._mgr.arrays - if isinstance(x.dtype, np.dtype) + get_base(x.values) is b + for x in df._mgr.blocks + if isinstance(x.values.dtype, np.dtype) ) == 1 ) @@ -2693,21 +2702,14 @@ def test_frame_string_inference_block_dim(self): df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]])) assert df._mgr.blocks[0].ndim == 2 - def test_inference_on_pandas_objects(self): + @pytest.mark.parametrize("klass", [Series, Index]) + def test_inference_on_pandas_objects(self, klass): # GH#56012 - idx = Index([Timestamp("2019-12-31")], dtype=object) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - result = DataFrame(idx, columns=["a"]) - assert result.dtypes.iloc[0] != np.object_ - result = DataFrame({"a": idx}) + obj = klass([Timestamp("2019-12-31")], dtype=object) + result = DataFrame(obj, columns=["a"]) assert result.dtypes.iloc[0] == np.object_ - ser = Series([Timestamp("2019-12-31")], dtype=object) - - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - result = DataFrame(ser, columns=["a"]) - assert result.dtypes.iloc[0] != np.object_ - result = DataFrame({"a": ser}) + result = DataFrame({"a": obj}) assert result.dtypes.iloc[0] == np.object_ def test_dict_keys_returns_rangeindex(self): @@ -2715,6 +2717,21 @@ def test_dict_keys_returns_rangeindex(self): expected = RangeIndex(2) tm.assert_index_equal(result, expected, exact=True) + @pytest.mark.parametrize( + "cons", [Series, Index, DatetimeIndex, DataFrame, pd.array, pd.to_datetime] + ) + def test_construction_datetime_resolution_inference(self, cons): + ts = Timestamp(2999, 1, 1) + ts2 = ts.tz_localize("US/Pacific") + + obj = cons([ts]) + res_dtype = tm.get_dtype(obj) + assert res_dtype == "M8[us]", res_dtype + + obj2 = cons([ts2]) + res_dtype2 = tm.get_dtype(obj2) + assert res_dtype2 == "M8[us, US/Pacific]", res_dtype2 + class TestDataFrameConstructorIndexInference: def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self): @@ -2846,8 +2863,8 @@ def test_construction_preserves_tzaware_dtypes(self, tz): [ np.dtype("datetime64[ns]"), DatetimeTZDtype(tz=tz), - np.dtype("datetime64[ns]"), - DatetimeTZDtype(tz=tz), + np.dtype("datetime64[us]"), + DatetimeTZDtype(tz=tz, unit="us"), ], index=["dr", "dr_tz", "datetimes_naive", "datetimes_with_tz"], ) @@ -2944,7 +2961,8 @@ def test_frame_timeseries_column(self): Timestamp("20130101T10:01:00", tz="US/Eastern"), Timestamp("20130101T10:02:00", tz="US/Eastern"), ] - } + }, + dtype="M8[ns, US/Eastern]", ) tm.assert_frame_equal(result, expected) @@ -2997,9 +3015,9 @@ def test_from_tzaware_mixed_object_array(self): res = DataFrame(arr, columns=["A", "B", "C"]) expected_dtypes = [ - "datetime64[ns]", - "datetime64[ns, US/Eastern]", - "datetime64[ns, CET]", + "datetime64[s]", + "datetime64[s, US/Eastern]", + "datetime64[s, CET]", ] assert (res.dtypes == expected_dtypes).all() @@ -3027,7 +3045,7 @@ def test_construction_from_ndarray_datetimelike(self): # constructed from 2D ndarray arr = np.arange(0, 12, dtype="datetime64[ns]").reshape(4, 3) df = DataFrame(arr) - assert all(isinstance(arr, DatetimeArray) for arr in df._mgr.arrays) + assert all(isinstance(block.values, DatetimeArray) for block in df._mgr.blocks) def 
test_construction_from_ndarray_with_eadtype_mismatched_columns(self): arr = np.random.default_rng(2).standard_normal((10, 2)) @@ -3153,14 +3171,6 @@ def test_from_out_of_bounds_ns_datetime( self, constructor, cls, request, box, frame_or_series ): # scalar that won't fit in nanosecond dt64, but will fit in microsecond - if box is list or (frame_or_series is Series and box is dict): - mark = pytest.mark.xfail( - reason="Timestamp constructor has been updated to cast dt64 to " - "non-nano, but DatetimeArray._from_sequence has not", - strict=True, - ) - request.applymarker(mark) - scalar = datetime(9999, 1, 1) exp_dtype = "M8[us]" # pydatetime objects default to this reso diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 643d342b052a4..ff1bf5632e920 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -202,6 +202,13 @@ def test_eval_simple(self, engine, parser): expected = df["a"] tm.assert_series_equal(expected, res) + def test_extension_array_eval(self, engine, parser): + # GH#58748 + df = DataFrame({"a": pd.array([1, 2, 3]), "b": pd.array([4, 5, 6])}) + result = df.eval("a / b", engine=engine, parser=parser) + expected = Series([0.25, 0.40, 0.50]) + tm.assert_series_equal(result, expected) + class TestDataFrameQueryWithMultiIndex: def test_query_with_named_multiindex(self, parser, engine): diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 03db284d892e3..a3a1da6e57cb0 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1321,6 +1321,21 @@ def test_unstack_sort_false(frame_or_series, dtype): [("two", "z", "b"), ("two", "y", "a"), ("one", "z", "b"), ("one", "y", "a")] ) obj = frame_or_series(np.arange(1.0, 5.0), index=index, dtype=dtype) + + result = obj.unstack(level=0, sort=False) + + if frame_or_series is DataFrame: + expected_columns = MultiIndex.from_tuples([(0, "two"), (0, "one")]) + else: + expected_columns = ["two", "one"] + expected = DataFrame( + [[1.0, 3.0], [2.0, 4.0]], + index=MultiIndex.from_tuples([("z", "b"), ("y", "a")]), + columns=expected_columns, + dtype=dtype, + ) + tm.assert_frame_equal(result, expected) + result = obj.unstack(level=-1, sort=False) if frame_or_series is DataFrame: diff --git a/pandas/tests/frame/test_ufunc.py b/pandas/tests/frame/test_ufunc.py index 88c62da2b0a73..95b315c32dca5 100644 --- a/pandas/tests/frame/test_ufunc.py +++ b/pandas/tests/frame/test_ufunc.py @@ -245,6 +245,7 @@ def test_alignment_deprecation_enforced(): np.add(s2, df1) +@pytest.mark.single_cpu def test_alignment_deprecation_many_inputs_enforced(): # Enforced in 2.0 # https://github.com/pandas-dev/pandas/issues/39184 diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index 0b607d91baf65..b591b1b1092d4 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -93,8 +93,7 @@ def test_get_numeric_data(self, frame_or_series): if isinstance(o, DataFrame): # preserve columns dtype expected.columns = o.columns[:0] - # https://github.com/pandas-dev/pandas/issues/50862 - tm.assert_equal(result.reset_index(drop=True), expected) + tm.assert_equal(result, expected) # get the bool data arr = np.array([True, True, False, True]) @@ -102,6 +101,11 @@ def test_get_numeric_data(self, frame_or_series): result = o._get_numeric_data() tm.assert_equal(result, o) + def test_get_bool_data_empty_preserve_index(self): + expected = Series([], 
dtype="bool") + result = expected._get_bool_data() + tm.assert_series_equal(result, expected, check_index_type=True) + def test_nonzero(self, frame_or_series): # GH 4633 # look at the boolean/nonzero behavior for objects diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 2b9df1b7079da..26602baedb594 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -5,7 +5,6 @@ import datetime import functools from functools import partial -import re import numpy as np import pytest @@ -63,6 +62,32 @@ def test_agg_ser_multi_key(df): tm.assert_series_equal(results, expected) +def test_agg_with_missing_values(): + # GH#58810 + missing_df = DataFrame( + { + "nan": [np.nan, np.nan, np.nan, np.nan], + "na": [pd.NA, pd.NA, pd.NA, pd.NA], + "nat": [pd.NaT, pd.NaT, pd.NaT, pd.NaT], + "none": [None, None, None, None], + "values": [1, 2, 3, 4], + } + ) + + result = missing_df.agg(x=("nan", "min"), y=("na", "min"), z=("values", "sum")) + + expected = DataFrame( + { + "nan": [np.nan, np.nan, np.nan], + "na": [np.nan, np.nan, np.nan], + "values": [np.nan, np.nan, 10.0], + }, + index=["x", "y", "z"], + ) + + tm.assert_frame_equal(result, expected) + + def test_groupby_aggregation_mixed_dtype(): # GH 6212 expected = DataFrame( @@ -816,8 +841,8 @@ def test_agg_relabel_other_raises(self): def test_missing_raises(self): df = DataFrame({"A": [0, 1], "B": [1, 2]}) - match = re.escape("Column(s) ['C'] do not exist") - with pytest.raises(KeyError, match=match): + msg = r"Label\(s\) \['C'\] do not exist" + with pytest.raises(KeyError, match=msg): df.groupby("A").agg(c=("C", "sum")) def test_agg_namedtuple(self): @@ -1663,3 +1688,121 @@ def func(x): msg = "length must not be 0" with pytest.raises(ValueError, match=msg): df.groupby("A", observed=False).agg(func) + + +def test_groupby_aggregation_duplicate_columns_single_dict_value(): + # GH#55041 + df = DataFrame( + [[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]], + columns=["a", "b", "c", "c"], + ) + gb = df.groupby("a") + result = gb.agg({"c": "sum"}) + + expected = DataFrame( + [[7, 9], [5, 6]], columns=["c", "c"], index=Index([1, 2], name="a") + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_duplicate_columns_multiple_dict_values(): + # GH#55041 + df = DataFrame( + [[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]], + columns=["a", "b", "c", "c"], + ) + gb = df.groupby("a") + result = gb.agg({"c": ["sum", "min", "max", "min"]}) + + expected = DataFrame( + [[7, 3, 4, 3, 9, 4, 5, 4], [5, 5, 5, 5, 6, 6, 6, 6]], + columns=MultiIndex( + levels=[["c"], ["sum", "min", "max"]], + codes=[[0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 2, 1, 0, 1, 2, 1]], + ), + index=Index([1, 2], name="a"), + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_duplicate_columns_some_empty_result(): + # GH#55041 + df = DataFrame( + [ + [1, 9843, 43, 54, 7867], + [2, 940, 9, -34, 44], + [1, -34, -546, -549358, 0], + [2, 244, -33, -100, 44], + ], + columns=["a", "b", "b", "c", "c"], + ) + gb = df.groupby("a") + result = gb.agg({"b": [], "c": ["var"]}) + + expected = DataFrame( + [[1.509268e11, 30944844.5], [2.178000e03, 0.0]], + columns=MultiIndex(levels=[["c"], ["var"]], codes=[[0, 0], [0, 0]]), + index=Index([1, 2], name="a"), + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_multi_index_duplicate_columns(): + # GH#55041 + df = DataFrame( + [ + [1, -9843, 43, 54, 7867], + [2, 940, 9, -34, 44], + [1, -34, 546, 
-549358, 0], + [2, 244, -33, -100, 44], + ], + columns=MultiIndex( + levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]], + codes=[[0, 0, 0, 1, 1], [0, 1, 1, 0, 1]], + ), + index=MultiIndex( + levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]], + codes=[[0, 0, 0, 1], [0, 1, 1, 0]], + ), + ) + gb = df.groupby(level=0) + result = gb.agg({("level1.1", "level2.2"): "min"}) + + expected = DataFrame( + [[-9843, 9], [244, -33]], + columns=MultiIndex(levels=[["level1.1"], ["level2.2"]], codes=[[0, 0], [0, 0]]), + index=Index(["level1.1", "level1.2"]), + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_func_list_multi_index_duplicate_columns(): + # GH#55041 + df = DataFrame( + [ + [1, -9843, 43, 54, 7867], + [2, 940, 9, -34, 44], + [1, -34, 546, -549358, 0], + [2, 244, -33, -100, 44], + ], + columns=MultiIndex( + levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]], + codes=[[0, 0, 0, 1, 1], [0, 1, 1, 0, 1]], + ), + index=MultiIndex( + levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]], + codes=[[0, 0, 0, 1], [0, 1, 1, 0]], + ), + ) + gb = df.groupby(level=0) + result = gb.agg({("level1.1", "level2.2"): ["min", "max"]}) + + expected = DataFrame( + [[-9843, 940, 9, 546], [244, 244, -33, -33]], + columns=MultiIndex( + levels=[["level1.1"], ["level2.2"], ["min", "max"]], + codes=[[0, 0, 0, 0], [0, 0, 0, 0], [0, 1, 0, 1]], + ), + index=Index(["level1.1", "level1.2"]), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index aafd06e8f88cf..bf9e82480785c 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -285,7 +285,7 @@ def test_read_only_buffer_source_agg(agg): "species": ["setosa", "setosa", "setosa", "setosa", "setosa"], } ) - df._mgr.arrays[0].flags.writeable = False + df._mgr.blocks[0].values.flags.writeable = False result = df.groupby(["species"]).agg({"sepal_length": agg}) expected = df.copy().groupby(["species"]).agg({"sepal_length": agg}) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 12f99e3cf7a63..78f2917e9a057 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -209,7 +209,7 @@ def test_aggregate_api_consistency(): expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1) expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]]) - msg = r"Column\(s\) \['r', 'r2'\] do not exist" + msg = r"Label\(s\) \['r', 'r2'\] do not exist" with pytest.raises(KeyError, match=msg): grouped[["D", "C"]].agg({"r": "sum", "r2": "mean"}) @@ -224,7 +224,7 @@ def test_agg_dict_renaming_deprecation(): {"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}} ) - msg = r"Column\(s\) \['ma'\] do not exist" + msg = r"Label\(s\) \['ma'\] do not exist" with pytest.raises(KeyError, match=msg): df.groupby("A")[["B", "C"]].agg({"ma": "max"}) diff --git a/pandas/tests/groupby/test_all_methods.py b/pandas/tests/groupby/test_all_methods.py index ad35bec70f668..945c3e421a132 100644 --- a/pandas/tests/groupby/test_all_methods.py +++ b/pandas/tests/groupby/test_all_methods.py @@ -25,9 +25,12 @@ def test_multiindex_group_all_columns_when_empty(groupby_func): gb = df.groupby(["a", "b", "c"], group_keys=False) method = getattr(gb, groupby_func) args = get_groupby_method_args(groupby_func, df) - - warn = FutureWarning if groupby_func == "fillna" else None - 
warn_msg = "DataFrameGroupBy.fillna is deprecated" + if groupby_func == "corrwith": + warn = FutureWarning + warn_msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + warn_msg = "" with tm.assert_produces_warning(warn, match=warn_msg): result = method(*args).index expected = df.index @@ -42,18 +45,12 @@ def test_duplicate_columns(request, groupby_func, as_index): df = DataFrame([[1, 3, 6], [1, 4, 7], [2, 5, 8]], columns=list("abb")) args = get_groupby_method_args(groupby_func, df) gb = df.groupby("a", as_index=as_index) - warn = FutureWarning if groupby_func == "fillna" else None - warn_msg = "DataFrameGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=warn_msg): - result = getattr(gb, groupby_func)(*args) + result = getattr(gb, groupby_func)(*args) expected_df = df.set_axis(["a", "b", "c"], axis=1) expected_args = get_groupby_method_args(groupby_func, expected_df) expected_gb = expected_df.groupby("a", as_index=as_index) - warn = FutureWarning if groupby_func == "fillna" else None - warn_msg = "DataFrameGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=warn_msg): - expected = getattr(expected_gb, groupby_func)(*expected_args) + expected = getattr(expected_gb, groupby_func)(*expected_args) if groupby_func not in ("size", "ngroup", "cumcount"): expected = expected.rename(columns={"c": "b"}) tm.assert_equal(result, expected) @@ -74,8 +71,12 @@ def test_dup_labels_output_shape(groupby_func, idx): grp_by = df.groupby([0]) args = get_groupby_method_args(groupby_func, df) - warn = FutureWarning if groupby_func == "fillna" else None - warn_msg = "DataFrameGroupBy.fillna is deprecated" + if groupby_func == "corrwith": + warn = FutureWarning + warn_msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + warn_msg = "" with tm.assert_produces_warning(warn, match=warn_msg): result = getattr(grp_by, groupby_func)(*args) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index e27c782c1bdcf..75801b9e039f6 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -322,6 +322,8 @@ def test_groupby_as_index_apply(): tm.assert_index_equal(res_as_apply, exp_as_apply) tm.assert_index_equal(res_not_as_apply, exp_not_as_apply) + +def test_groupby_as_index_apply_str(): ind = Index(list("abcde")) df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind) msg = "DataFrameGroupBy.apply operated on the grouping columns" @@ -379,8 +381,8 @@ def f(piece): {"value": piece, "demeaned": piece - piece.mean(), "logged": logged} ) - dr = bdate_range("1/1/2000", periods=100) - ts = Series(np.random.default_rng(2).standard_normal(100), index=dr) + dr = bdate_range("1/1/2000", periods=10) + ts = Series(np.random.default_rng(2).standard_normal(10), index=dr) grouped = ts.groupby(lambda x: x.month, group_keys=False) result = grouped.apply(f) @@ -639,13 +641,13 @@ def reindex_helper(x): def test_apply_corner_cases(): # #535, can't use sliding iterator - N = 1000 + N = 10 labels = np.random.default_rng(2).integers(0, 100, size=N) df = DataFrame( { "key": labels, "value1": np.random.default_rng(2).standard_normal(N), - "value2": ["foo", "bar", "baz", "qux"] * (N // 4), + "value2": ["foo", "bar", "baz", "qux", "a"] * (N // 5), } ) @@ -680,6 +682,8 @@ def test_apply_numeric_coercion_when_datetime(): result = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) tm.assert_series_equal(result["Str"], expected["Str"]) + +def test_apply_numeric_coercion_when_datetime_getitem(): # 
GH 15421 df = DataFrame( {"A": [10, 20, 30], "B": ["foo", "3", "4"], "T": [pd.Timestamp("12:31:22")] * 3} @@ -695,6 +699,8 @@ def get_B(g): expected.index = df.A tm.assert_series_equal(result, expected) + +def test_apply_numeric_coercion_when_datetime_with_nat(): # GH 14423 def predictions(tool): out = Series(index=["p1", "p2", "useTime"], dtype=object) @@ -793,7 +799,7 @@ def func_with_date(batch): with tm.assert_produces_warning(DeprecationWarning, match=msg): dfg_conversion = df.groupby(by=["a"]).apply(func_with_date) dfg_conversion_expected = DataFrame( - {"b": pd.Timestamp(2015, 1, 1).as_unit("ns"), "c": 2}, index=[1] + {"b": pd.Timestamp(2015, 1, 1), "c": 2}, index=[1] ) dfg_conversion_expected.index.name = "a" @@ -843,10 +849,24 @@ def test_func(x): tm.assert_frame_equal(result, expected) -def test_groupby_apply_none_first(): +@pytest.mark.parametrize( + "in_data, out_idx, out_data", + [ + [ + {"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]}, + [[1, 1], [0, 2]], + {"groups": [1, 1], "vars": [0, 2]}, + ], + [ + {"groups": [1, 2, 2, 2], "vars": [0, 1, 2, 3]}, + [[2, 2], [1, 3]], + {"groups": [2, 2], "vars": [1, 3]}, + ], + ], +) +def test_groupby_apply_none_first(in_data, out_idx, out_data): # GH 12824. Tests if apply returns None first. - test_df1 = DataFrame({"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]}) - test_df2 = DataFrame({"groups": [1, 2, 2, 2], "vars": [0, 1, 2, 3]}) + test_df1 = DataFrame(in_data) def test_func(x): if x.shape[0] < 2: @@ -856,14 +876,9 @@ def test_func(x): msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(DeprecationWarning, match=msg): result1 = test_df1.groupby("groups").apply(test_func) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result2 = test_df2.groupby("groups").apply(test_func) - index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], names=["groups", None]) - index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], names=["groups", None]) - expected1 = DataFrame({"groups": [1, 1], "vars": [0, 2]}, index=index1) - expected2 = DataFrame({"groups": [2, 2], "vars": [1, 3]}, index=index2) + index1 = MultiIndex.from_arrays(out_idx, names=["groups", None]) + expected1 = DataFrame(out_data, index=index1) tm.assert_frame_equal(result1, expected1) - tm.assert_frame_equal(result2, expected2) def test_groupby_apply_return_empty_chunk(): @@ -883,18 +898,16 @@ def test_groupby_apply_return_empty_chunk(): tm.assert_series_equal(result, expected) -def test_apply_with_mixed_types(): +@pytest.mark.parametrize("meth", ["apply", "transform"]) +def test_apply_with_mixed_types(meth): # gh-20949 df = DataFrame({"A": "a a b".split(), "B": [1, 2, 3], "C": [4, 6, 5]}) g = df.groupby("A", group_keys=False) - result = g.transform(lambda x: x / x.sum()) + result = getattr(g, meth)(lambda x: x / x.sum()) expected = DataFrame({"B": [1 / 3.0, 2 / 3.0, 1], "C": [0.4, 0.6, 1.0]}) tm.assert_frame_equal(result, expected) - result = g.apply(lambda x: x / x.sum()) - tm.assert_frame_equal(result, expected) - def test_func_returns_object(): # GH 28652 @@ -1006,7 +1019,7 @@ def test_groupby_apply_datetime_result_dtypes(using_infer_string): result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes dtype = "string" if using_infer_string else object expected = Series( - [np.dtype("datetime64[ns]"), dtype, dtype, np.int64, dtype], + [np.dtype("datetime64[us]"), dtype, dtype, np.int64, dtype], index=["observation", "color", "mood", "intensity", "score"], ) tm.assert_series_equal(result, expected) @@ -1106,7 +1119,7 @@ def 
test_apply_function_with_indexing_return_column(): @pytest.mark.parametrize( "udf", - [(lambda x: x.copy()), (lambda x: x.copy().rename(lambda y: y + 1))], + [lambda x: x.copy(), lambda x: x.copy().rename(lambda y: y + 1)], ) @pytest.mark.parametrize("group_keys", [True, False]) def test_apply_result_type(group_keys, udf): @@ -1184,7 +1197,14 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): # Check output when another method is called before .apply() grp = df.groupby(by="a") args = get_groupby_method_args(reduction_func, df) - _ = getattr(grp, reduction_func)(*args) + if reduction_func == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): + _ = getattr(grp, reduction_func)(*args) result = grp.apply(np.sum, axis=0, include_groups=False) tm.assert_frame_equal(result, expected) @@ -1214,7 +1234,7 @@ def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp(): expected = df.iloc[[0, 2, 3]] expected = expected.reset_index() expected.index = MultiIndex.from_frame(expected[["A", "B", "idx"]]) - expected = expected.drop(columns="idx") + expected = expected.drop(columns=["idx"]) tm.assert_frame_equal(result, expected) for val in result.index.levels[1]: diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 5a43a42aa936f..010bd9ee52555 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -82,7 +82,7 @@ def get_stats(group): assert result.index.names[0] == "C" -def test_basic(using_infer_string): # TODO: split this test +def test_basic(): cats = Categorical( ["a", "a", "a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c", "d"], @@ -95,17 +95,20 @@ def test_basic(using_infer_string): # TODO: split this test result = data.groupby("b", observed=False).mean() tm.assert_frame_equal(result, expected) + +def test_basic_single_grouper(): cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) - # single grouper gb = df.groupby("A", observed=False) exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True) expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)}) result = gb.sum(numeric_only=True) tm.assert_frame_equal(result, expected) + +def test_basic_string(using_infer_string): # GH 8623 x = DataFrame( [[1, "John P. Doe"], [2, "Jane Dove"], [1, "John P. 
Doe"]], @@ -133,8 +136,9 @@ def f(x): expected["person_name"] = expected["person_name"].astype(dtype) tm.assert_frame_equal(result, expected) + +def test_basic_monotonic(): # GH 9921 - # Monotonic df = DataFrame({"a": [5, 15, 25]}) c = pd.cut(df.a, bins=[0, 10, 20, 30, 40]) @@ -165,7 +169,8 @@ def f(x): tm.assert_series_equal(df.a.groupby(c, observed=False).filter(np.all), df["a"]) tm.assert_frame_equal(df.groupby(c, observed=False).filter(np.all), df) - # Non-monotonic + +def test_basic_non_monotonic(): df = DataFrame({"a": [5, 15, 25, -5]}) c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40]) @@ -183,6 +188,8 @@ def f(x): df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df[["a"]] ) + +def test_basic_cut_grouping(): # GH 9603 df = DataFrame({"a": [1, 0, 0, 0]}) c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list("abcd"))) @@ -193,13 +200,14 @@ def f(x): expected.index.name = "a" tm.assert_series_equal(result, expected) - # more basic + +def test_more_basic(): levels = ["foo", "bar", "baz", "qux"] - codes = np.random.default_rng(2).integers(0, 4, size=100) + codes = np.random.default_rng(2).integers(0, 4, size=10) cats = Categorical.from_codes(codes, levels, ordered=True) - data = DataFrame(np.random.default_rng(2).standard_normal((100, 4))) + data = DataFrame(np.random.default_rng(2).standard_normal((10, 4))) result = data.groupby(cats, observed=False).mean() @@ -225,9 +233,9 @@ def f(x): # GH 10460 expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True) exp = CategoricalIndex(expc) - tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp) + tm.assert_index_equal(desc_result.stack().index.get_level_values(0), exp) exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4) - tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp) + tm.assert_index_equal(desc_result.stack().index.get_level_values(1), exp) def test_level_get_group(observed): @@ -352,6 +360,8 @@ def test_observed(observed): tm.assert_frame_equal(result, expected) + +def test_observed_single_column(observed): # https://github.com/pandas-dev/pandas/issues/8138 d = { "cat": Categorical( @@ -362,7 +372,6 @@ def test_observed(observed): } df = DataFrame(d) - # Grouping on a single column groups_single_key = df.groupby("cat", observed=observed) result = groups_single_key.mean() @@ -378,7 +387,17 @@ def test_observed(observed): tm.assert_frame_equal(result, expected) - # Grouping on two columns + +def test_observed_two_columns(observed): + # https://github.com/pandas-dev/pandas/issues/8138 + d = { + "cat": Categorical( + ["a", "b", "a", "b"], categories=["a", "b", "c"], ordered=True + ), + "ints": [1, 1, 2, 2], + "val": [10, 20, 30, 40], + } + df = DataFrame(d) groups_double_key = df.groupby(["cat", "ints"], observed=observed) result = groups_double_key.agg("mean") expected = DataFrame( @@ -404,6 +423,8 @@ def test_observed(observed): expected = df[(df.cat == c) & (df.ints == i)] tm.assert_frame_equal(result, expected) + +def test_observed_with_as_index(observed): # gh-8869 # with as_index d = { @@ -591,7 +612,6 @@ def test_dataframe_categorical_with_nan(observed): @pytest.mark.parametrize("ordered", [True, False]) -@pytest.mark.parametrize("observed", [True, False]) def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort): # GH 25871: Fix groupby sorting on ordered Categoricals # GH 25167: Groupby with observed=True doesn't sort @@ -627,11 +647,11 @@ def test_dataframe_categorical_ordered_observed_sort(ordered, 
observed, sort): def test_datetime(): # GH9049: ensure backward compatibility levels = pd.date_range("2014-01-01", periods=4) - codes = np.random.default_rng(2).integers(0, 4, size=100) + codes = np.random.default_rng(2).integers(0, 4, size=10) cats = Categorical.from_codes(codes, levels, ordered=True) - data = DataFrame(np.random.default_rng(2).standard_normal((100, 4))) + data = DataFrame(np.random.default_rng(2).standard_normal((10, 4))) result = data.groupby(cats, observed=False).mean() expected = data.groupby(np.asarray(cats), observed=False).mean() @@ -832,7 +852,10 @@ def test_preserve_categories(): df.groupby("A", sort=False, observed=False).first().index, nosort_index ) - # ordered=False + +def test_preserve_categories_ordered_false(): + # GH-13179 + categories = list("abc") df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=False)}) sort_index = CategoricalIndex(categories, categories, ordered=False, name="A") # GH#48749 - don't change order of categories @@ -846,7 +869,8 @@ def test_preserve_categories(): ) -def test_preserve_categorical_dtype(): +@pytest.mark.parametrize("col", ["C1", "C2"]) +def test_preserve_categorical_dtype(col): # GH13743, GH13854 df = DataFrame( { @@ -865,18 +889,15 @@ def test_preserve_categorical_dtype(): "C2": Categorical(list("bac"), categories=list("bac"), ordered=True), } ) - for col in ["C1", "C2"]: - result1 = df.groupby(by=col, as_index=False, observed=False).mean( - numeric_only=True - ) - result2 = ( - df.groupby(by=col, as_index=True, observed=False) - .mean(numeric_only=True) - .reset_index() - ) - expected = exp_full.reindex(columns=result1.columns) - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, expected) + result1 = df.groupby(by=col, as_index=False, observed=False).mean(numeric_only=True) + result2 = ( + df.groupby(by=col, as_index=True, observed=False) + .mean(numeric_only=True) + .reset_index() + ) + expected = exp_full.reindex(columns=result1.columns) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) @pytest.mark.parametrize( @@ -931,6 +952,8 @@ def test_categorical_no_compress(): ) tm.assert_series_equal(result, exp) + +def test_categorical_no_compress_string(): cats = Categorical( ["a", "a", "a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c", "d"], @@ -965,7 +988,7 @@ def test_sort(): # has a sorted x axis # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar') - df = DataFrame({"value": np.random.default_rng(2).integers(0, 10000, 100)}) + df = DataFrame({"value": np.random.default_rng(2).integers(0, 10000, 10)}) labels = [f"{i} - {i+499}" for i in range(0, 10000, 500)] cat_labels = Categorical(labels, labels) @@ -1450,7 +1473,14 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_fun df_grp = df.groupby(["cat_1", "cat_2"], observed=True) args = get_groupby_method_args(reduction_func, df) - res = getattr(df_grp, reduction_func)(*args) + if reduction_func == "corrwith": + warn = FutureWarning + warn_msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + warn_msg = "" + with tm.assert_produces_warning(warn, match=warn_msg): + res = getattr(df_grp, reduction_func)(*args) for cat in unobserved_cats: assert cat not in res.index @@ -1489,7 +1519,14 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( getattr(df_grp, reduction_func)(*args) return - res = getattr(df_grp, reduction_func)(*args) + if reduction_func == "corrwith": + warn = FutureWarning + warn_msg 
= "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + warn_msg = "" + with tm.assert_produces_warning(warn, match=warn_msg): + res = getattr(df_grp, reduction_func)(*args) expected = _results_for_groupbys_with_missing_categories[reduction_func] @@ -1881,8 +1918,14 @@ def test_category_order_reducer( ): getattr(gb, reduction_func)(*args) return - - op_result = getattr(gb, reduction_func)(*args) + if reduction_func == "corrwith": + warn = FutureWarning + warn_msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + warn_msg = "" + with tm.assert_produces_warning(warn, match=warn_msg): + op_result = getattr(gb, reduction_func)(*args) if as_index: result = op_result.index.get_level_values("a").categories else: diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index 2622895f9f8d2..47ad18c9ad2c8 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -321,19 +321,22 @@ def test_count_object(): expected = Series([3, 3], index=Index([2, 3], name="c"), name="a") tm.assert_series_equal(result, expected) + +def test_count_object_nan(): df = DataFrame({"a": ["a", np.nan, np.nan] + ["b"] * 3, "c": [2] * 3 + [3] * 3}) result = df.groupby("c").a.count() expected = Series([1, 3], index=Index([2, 3], name="c"), name="a") tm.assert_series_equal(result, expected) -def test_count_cross_type(): +@pytest.mark.parametrize("typ", ["object", "float32"]) +def test_count_cross_type(typ): # GH8169 # Set float64 dtype to avoid upcast when setting nan below vals = np.hstack( ( - np.random.default_rng(2).integers(0, 5, (100, 2)), - np.random.default_rng(2).integers(0, 2, (100, 2)), + np.random.default_rng(2).integers(0, 5, (10, 2)), + np.random.default_rng(2).integers(0, 2, (10, 2)), ) ).astype("float64") @@ -341,11 +344,10 @@ def test_count_cross_type(): df[df == 2] = np.nan expected = df.groupby(["c", "d"]).count() - for t in ["float32", "object"]: - df["a"] = df["a"].astype(t) - df["b"] = df["b"].astype(t) - result = df.groupby(["c", "d"]).count() - tm.assert_frame_equal(result, expected) + df["a"] = df["a"].astype(typ) + df["b"] = df["b"].astype(typ) + result = df.groupby(["c", "d"]).count() + tm.assert_frame_equal(result, expected) def test_lower_int_prec_count(): diff --git a/pandas/tests/groupby/test_cumulative.py b/pandas/tests/groupby/test_cumulative.py index 28dcb38d173f2..b0a0414c1feb2 100644 --- a/pandas/tests/groupby/test_cumulative.py +++ b/pandas/tests/groupby/test_cumulative.py @@ -94,21 +94,28 @@ def test_groupby_cumprod_nan_influences_other_columns(): def test_cummin(dtypes_for_minmax): dtype = dtypes_for_minmax[0] - min_val = dtypes_for_minmax[1] # GH 15048 base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] df = base_df.astype(dtype) - expected = DataFrame({"B": expected_mins}).astype(dtype) result = df.groupby("A").cummin() tm.assert_frame_equal(result, expected) result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() tm.assert_frame_equal(result, expected) - # Test w/ min value for dtype + +def test_cummin_min_value_for_dtype(dtypes_for_minmax): + dtype = dtypes_for_minmax[0] + min_val = dtypes_for_minmax[1] + + # GH 15048 + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) + expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] + expected = DataFrame({"B": expected_mins}).astype(dtype) + df = base_df.astype(dtype) df.loc[[2, 6], "B"] = min_val df.loc[[1, 5], "B"] = 
min_val + 1 + expected.loc[[2, 3, 6, 7], "B"] = min_val @@ -120,8 +127,10 @@ def test_cummin(dtypes_for_minmax): ) tm.assert_frame_equal(result, expected, check_exact=True) - # Test nan in some values + +def test_cummin_nan_in_some_values(dtypes_for_minmax): # Explicit cast to float to avoid implicit cast when setting nan + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) base_df = base_df.astype({"B": "float"}) base_df.loc[[0, 2, 4, 6], "B"] = np.nan expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]}) @@ -132,6 +141,8 @@ def test_cummin(dtypes_for_minmax): ) tm.assert_frame_equal(result, expected) + +def test_cummin_datetime(): # GH 15561 df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])}) expected = Series(pd.to_datetime("2001"), index=[0], name="b") @@ -139,6 +150,8 @@ def test_cummin(dtypes_for_minmax): result = df.groupby("a")["b"].cummin() tm.assert_series_equal(expected, result) + +def test_cummin_getattr_series(): # GH 15635 df = DataFrame({"a": [1, 2, 1], "b": [1, 2, 2]}) result = df.groupby("a").b.cummin() @@ -163,7 +176,6 @@ def test_cummin_max_all_nan_column(method, dtype): def test_cummax(dtypes_for_minmax): dtype = dtypes_for_minmax[0] - max_val = dtypes_for_minmax[2] # GH 15048 base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) @@ -177,8 +189,18 @@ def test_cummax(dtypes_for_minmax): result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() tm.assert_frame_equal(result, expected) - # Test w/ max value for dtype + +def test_cummax_max_value_for_dtype(dtypes_for_minmax): + dtype = dtypes_for_minmax[0] + max_val = dtypes_for_minmax[2] + + # GH 15048 + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) + expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3] + + df = base_df.astype(dtype) df.loc[[2, 6], "B"] = max_val + expected = DataFrame({"B": expected_maxs}).astype(dtype) expected.loc[[2, 3, 6, 7], "B"] = max_val result = df.groupby("A").cummax() tm.assert_frame_equal(result, expected) @@ -187,8 +209,11 @@ def test_cummax(dtypes_for_minmax): ) tm.assert_frame_equal(result, expected) + +def test_cummax_nan_in_some_values(dtypes_for_minmax): # Test nan in some values # Explicit cast to float to avoid implicit cast when setting nan + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) base_df = base_df.astype({"B": "float"}) base_df.loc[[0, 2, 4, 6], "B"] = np.nan expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]}) @@ -199,6 +224,8 @@ def test_cummax(dtypes_for_minmax): ) tm.assert_frame_equal(result, expected) + +def test_cummax_datetime(): # GH 15561 df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])}) expected = Series(pd.to_datetime("2001"), index=[0], name="b") @@ -206,6 +233,8 @@ def test_cummax(dtypes_for_minmax): result = df.groupby("a")["b"].cummax() tm.assert_series_equal(expected, result) + +def test_cummax_getattr_series(): # GH 15635 df = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]}) result = df.groupby("a").b.cummax() @@ -292,15 +321,12 @@ def test_nullable_int_not_cast_as_float(method, dtype, val): tm.assert_frame_equal(result, expected) -def test_cython_api2(): +def test_cython_api2(as_index): # this takes the fast apply path # cumsum (GH5614) + # GH 5755 - cumsum is a transformer and should ignore as_index df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"]) expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"]) -
result = df.groupby("A").cumsum() - tm.assert_frame_equal(result, expected) - - # GH 5755 - cumsum is a transformer and should ignore as_index - result = df.groupby("A", as_index=False).cumsum() + result = df.groupby("A", as_index=as_index).cumsum() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_filters.py b/pandas/tests/groupby/test_filters.py index a34170e9b55db..04883b3ef6b78 100644 --- a/pandas/tests/groupby/test_filters.py +++ b/pandas/tests/groupby/test_filters.py @@ -85,6 +85,9 @@ def test_filter_out_no_groups(): grouped = s.groupby(grouper) filtered = grouped.filter(lambda x: x.mean() > 0) tm.assert_series_equal(filtered, s) + + +def test_filter_out_no_groups_dataframe(): df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()}) grouper = df["A"].apply(lambda x: x % 2) grouped = df.groupby(grouper) @@ -100,6 +103,9 @@ def test_filter_out_all_groups_in_df(): expected = DataFrame({"a": [np.nan] * 3, "b": [np.nan] * 3}) tm.assert_frame_equal(expected, res) + +def test_filter_out_all_groups_in_df_dropna_true(): + # GH12768 df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]}) res = df.groupby("a") res = res.filter(lambda x: x["b"].sum() > 5, dropna=True) @@ -179,7 +185,7 @@ def test_filter_pdna_is_false(): def test_filter_against_workaround_ints(): # Series of ints - s = Series(np.random.default_rng(2).integers(0, 100, 100)) + s = Series(np.random.default_rng(2).integers(0, 100, 10)) grouper = s.apply(lambda x: np.round(x, -1)) grouped = s.groupby(grouper) f = lambda x: x.mean() > 10 @@ -191,7 +197,7 @@ def test_filter_against_workaround_ints(): def test_filter_against_workaround_floats(): # Series of floats - s = 100 * Series(np.random.default_rng(2).random(100)) + s = 100 * Series(np.random.default_rng(2).random(10)) grouper = s.apply(lambda x: np.round(x, -1)) grouped = s.groupby(grouper) f = lambda x: x.mean() > 10 @@ -203,13 +209,13 @@ def test_filter_against_workaround_floats(): def test_filter_against_workaround_dataframe(): # Set up DataFrame of ints, floats, strings. letters = np.array(list(ascii_lowercase)) - N = 100 + N = 10 random_letters = letters.take( np.random.default_rng(2).integers(0, 26, N, dtype=int) ) df = DataFrame( { - "ints": Series(np.random.default_rng(2).integers(0, 100, N)), + "ints": Series(np.random.default_rng(2).integers(0, 10, N)), "floats": N / 10 * Series(np.random.default_rng(2).random(N)), "letters": Series(random_letters), } @@ -217,26 +223,26 @@ def test_filter_against_workaround_dataframe(): # Group by ints; filter on floats. grouped = df.groupby("ints") - old_way = df[grouped.floats.transform(lambda x: x.mean() > N / 20).astype("bool")] - new_way = grouped.filter(lambda x: x["floats"].mean() > N / 20) + old_way = df[grouped.floats.transform(lambda x: x.mean() > N / 2).astype("bool")] + new_way = grouped.filter(lambda x: x["floats"].mean() > N / 2) tm.assert_frame_equal(new_way, old_way) # Group by floats (rounded); filter on strings. grouper = df.floats.apply(lambda x: np.round(x, -1)) grouped = df.groupby(grouper) - old_way = df[grouped.letters.transform(lambda x: len(x) < N / 10).astype("bool")] - new_way = grouped.filter(lambda x: len(x.letters) < N / 10) + old_way = df[grouped.letters.transform(lambda x: len(x) < N / 2).astype("bool")] + new_way = grouped.filter(lambda x: len(x.letters) < N / 2) tm.assert_frame_equal(new_way, old_way) # Group by strings; filter on ints. 
grouped = df.groupby("letters") - old_way = df[grouped.ints.transform(lambda x: x.mean() > N / 20).astype("bool")] - new_way = grouped.filter(lambda x: x["ints"].mean() > N / 20) + old_way = df[grouped.ints.transform(lambda x: x.mean() > N / 2).astype("bool")] + new_way = grouped.filter(lambda x: x["ints"].mean() > N / 2) tm.assert_frame_equal(new_way, old_way) def test_filter_using_len(): - # BUG GH4447 + # GH 4447 df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)}) grouped = df.groupby("B") actual = grouped.filter(lambda x: len(x) > 2) @@ -250,8 +256,10 @@ def test_filter_using_len(): expected = df.loc[[]] tm.assert_frame_equal(actual, expected) - # Series have always worked properly, but we'll test anyway. - s = df["B"] + +def test_filter_using_len_series(): + # GH 4447 + s = Series(list("aabbbbcc"), name="B") grouped = s.groupby(s) actual = grouped.filter(lambda x: len(x) > 2) expected = Series(4 * ["b"], index=np.arange(2, 6, dtype=np.int64), name="B") @@ -262,10 +270,14 @@ def test_filter_using_len(): tm.assert_series_equal(actual, expected) -def test_filter_maintains_ordering(): - # Simple case: index is sequential. #4621 +@pytest.mark.parametrize( + "index", [range(8), range(7, -1, -1), [0, 2, 1, 3, 4, 6, 5, 7]] +) +def test_filter_maintains_ordering(index): + # GH 4621 df = DataFrame( - {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]} + {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}, + index=index, ) s = df["pid"] grouped = df.groupby("tag") @@ -278,33 +290,6 @@ def test_filter_maintains_ordering(): expected = s.iloc[[1, 2, 4, 7]] tm.assert_series_equal(actual, expected) - # Now index is sequentially decreasing. - df.index = np.arange(len(df) - 1, -1, -1) - s = df["pid"] - grouped = df.groupby("tag") - actual = grouped.filter(lambda x: len(x) > 1) - expected = df.iloc[[1, 2, 4, 7]] - tm.assert_frame_equal(actual, expected) - - grouped = s.groupby(df["tag"]) - actual = grouped.filter(lambda x: len(x) > 1) - expected = s.iloc[[1, 2, 4, 7]] - tm.assert_series_equal(actual, expected) - - # Index is shuffled. 
- SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3] - df.index = df.index[SHUFFLED] - s = df["pid"] - grouped = df.groupby("tag") - actual = grouped.filter(lambda x: len(x) > 1) - expected = df.iloc[[1, 2, 4, 7]] - tm.assert_frame_equal(actual, expected) - - grouped = s.groupby(df["tag"]) - actual = grouped.filter(lambda x: len(x) > 1) - expected = s.iloc[[1, 2, 4, 7]] - tm.assert_series_equal(actual, expected) - def test_filter_multiple_timestamp(): # GH 10114 diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index d50fea459552a..4c1dc8953580a 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1235,7 +1235,7 @@ def test_groupby_nat_exclude(): {"nan": [np.nan, np.nan, np.nan], "nat": [pd.NaT, pd.NaT, pd.NaT]} ) assert nan_df["nan"].dtype == "float64" - assert nan_df["nat"].dtype == "datetime64[ns]" + assert nan_df["nat"].dtype == "datetime64[s]" for key in ["nan", "nat"]: grouped = nan_df.groupby(key) @@ -2954,3 +2954,34 @@ def test_groupby_dropna_with_nunique_unique(): ) tm.assert_frame_equal(result, expected) + + +def test_groupby_agg_namedagg_with_duplicate_columns(): + # GH#58446 + df = DataFrame( + { + "col1": [2, 1, 1, 0, 2, 0], + "col2": [4, 5, 36, 7, 4, 5], + "col3": [3.1, 8.0, 12, 10, 4, 1.1], + "col4": [17, 3, 16, 15, 5, 6], + "col5": [-1, 3, -1, 3, -2, -1], + } + ) + + result = df.groupby(by=["col1", "col1", "col2"], as_index=False).agg( + new_col=pd.NamedAgg(column="col1", aggfunc="min"), + new_col1=pd.NamedAgg(column="col1", aggfunc="max"), + new_col2=pd.NamedAgg(column="col2", aggfunc="count"), + ) + + expected = DataFrame( + { + "col1": [0, 0, 1, 1, 2], + "col2": [5, 7, 5, 36, 4], + "new_col": [0, 0, 1, 1, 2], + "new_col1": [0, 0, 1, 1, 2], + "new_col2": [1, 1, 1, 1, 2], + } + ) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index d3b3c945e06de..4749e845a0e59 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -543,7 +543,14 @@ def test_categorical_reducers(reduction_func, observed, sort, as_index, index_ki return gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True) - expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index() + if reduction_func == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): + expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index() expected["x"] = expected["x"].cat.remove_categories([4]) if index_kind == "multi": expected["x2"] = expected["x2"].cat.remove_categories([4]) @@ -567,7 +574,14 @@ def test_categorical_reducers(reduction_func, observed, sort, as_index, index_ki if as_index: expected = expected["size"].rename(None) - result = getattr(gb_keepna, reduction_func)(*args) + if reduction_func == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): + result = getattr(gb_keepna, reduction_func)(*args) # size will return a Series, others are DataFrame tm.assert_equal(result, expected) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 063b0ce38387f..39eadd32f300d 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -822,6 +822,35 @@ def 
test_groupby_multiindex_level_empty(self): ) tm.assert_frame_equal(result, expected) + def test_groupby_tuple_keys_handle_multiindex(self): + # https://github.com/pandas-dev/pandas/issues/21340 + df = DataFrame( + { + "num1": [0, 8, 9, 4, 3, 3, 5, 9, 3, 6], + "num2": [3, 8, 6, 4, 9, 2, 1, 7, 0, 9], + "num3": [6, 5, 7, 8, 5, 1, 1, 10, 7, 8], + "category_tuple": [ + (0, 1), + (0, 1), + (0, 1), + (0, 4), + (2, 3), + (2, 3), + (2, 3), + (2, 3), + (5,), + (6,), + ], + "category_string": list("aaabbbbcde"), + } + ) + expected = df.sort_values(by=["category_tuple", "num1"]) + result = df.groupby("category_tuple").apply( + lambda x: x.sort_values(by="num1"), include_groups=False + ) + expected = expected[result.columns] + tm.assert_frame_equal(result.reset_index(drop=True), expected) + # get_group # -------------------------------- diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index 33cdd1883e1b9..afbc64429e93c 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ -256,7 +256,14 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): method = getattr(gb, kernel) if has_arg and numeric_only is True: # Cases where b does not appear in the result - result = method(*args, **kwargs) + if kernel == "corrwith": + warn = FutureWarning + warn_msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + warn_msg = "" + with tm.assert_produces_warning(warn, match=warn_msg): + result = method(*args, **kwargs) assert "b" not in result.columns elif ( # kernels that work on any dtype and have numeric_only arg @@ -296,7 +303,14 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): elif kernel == "idxmax": msg = "'>' not supported between instances of 'type' and 'type'" with pytest.raises(exception, match=msg): - method(*args, **kwargs) + if kernel == "corrwith": + warn = FutureWarning + warn_msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + warn_msg = "" + with tm.assert_produces_warning(warn, match=warn_msg): + method(*args, **kwargs) elif not has_arg and numeric_only is not lib.no_default: with pytest.raises( TypeError, match="got an unexpected keyword argument 'numeric_only'" diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 9301f8d56d9d2..5a8192a9ffe02 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -183,6 +183,8 @@ def test_groupby_raises_string( if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" warn_msg = f"{kind}GroupBy.fillna is deprecated" + elif groupby_func == "corrwith": + warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) @@ -288,6 +290,8 @@ def test_groupby_raises_datetime( if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" warn_msg = f"{kind}GroupBy.fillna is deprecated" + elif groupby_func == "corrwith": + warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=warn_msg) @@ -485,6 +489,8 @@ def test_groupby_raises_category( if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" warn_msg = f"{kind}GroupBy.fillna is deprecated" + elif groupby_func == "corrwith": + warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) @@ -658,6 +664,8 @@ def
test_groupby_raises_category_on_category( if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" warn_msg = f"{kind}GroupBy.fillna is deprecated" + elif groupby_func == "corrwith": + warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 245fb9c7babd7..a189d6772ece4 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1104,7 +1104,14 @@ def test_transform_agg_by_name(request, reduction_func, frame_or_series): return args = get_groupby_method_args(reduction_func, obj) - result = g.transform(func, *args) + if func == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): + result = g.transform(func, *args) # this is the *definition* of a transformation tm.assert_index_equal(result.index, obj.index) @@ -1232,9 +1239,9 @@ def test_categorical_and_not_categorical_key(observed): tm.assert_frame_equal(result, expected_explicit) # Series case - result = df_with_categorical.groupby(["A", "C"], observed=observed)["B"].transform( - "sum" - ) + gb = df_with_categorical.groupby(["A", "C"], observed=observed) + gbp = gb["B"] + result = gbp.transform("sum") expected = df_without_categorical.groupby(["A", "C"])["B"].transform("sum") tm.assert_series_equal(result, expected) expected_explicit = Series([4, 2, 4], name="B") @@ -1468,8 +1475,12 @@ def test_as_index_no_change(keys, df, groupby_func): args = get_groupby_method_args(groupby_func, df) gb_as_index_true = df.groupby(keys, as_index=True) gb_as_index_false = df.groupby(keys, as_index=False) - warn = FutureWarning if groupby_func == "fillna" else None - msg = "DataFrameGroupBy.fillna is deprecated" + if groupby_func == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" with tm.assert_produces_warning(warn, match=msg): result = gb_as_index_true.transform(groupby_func, *args) with tm.assert_produces_warning(warn, match=msg): @@ -1535,3 +1546,57 @@ def test_transform_sum_one_column_with_matching_labels_and_missing_labels(): result = df.groupby(series, as_index=False).transform("sum") expected = DataFrame({"X": [-93203.0, -93203.0, np.nan]}) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["int32", "float32"]) +def test_min_one_unobserved_category_no_type_coercion(dtype): + # GH#58084 + df = DataFrame({"A": Categorical([1, 1, 2], categories=[1, 2, 3]), "B": [3, 4, 5]}) + df["B"] = df["B"].astype(dtype) + gb = df.groupby("A", observed=False) + result = gb.transform("min") + + expected = DataFrame({"B": [3, 3, 5]}, dtype=dtype) + tm.assert_frame_equal(expected, result) + + +def test_min_all_empty_data_no_type_coercion(): + # GH#58084 + df = DataFrame( + { + "X": Categorical( + [], + categories=[1, "randomcat", 100], + ), + "Y": [], + } + ) + df["Y"] = df["Y"].astype("int32") + + gb = df.groupby("X", observed=False) + result = gb.transform("min") + + expected = DataFrame({"Y": []}, dtype="int32") + tm.assert_frame_equal(expected, result) + + +def test_min_one_dim_no_type_coercion(): + # GH#58084 + df = DataFrame({"Y": [9435, -5465765, 5055, 0, 954960]}) + df["Y"] = df["Y"].astype("int32") + categories = Categorical([1, 2, 2, 5, 1], categories=[1, 2, 3, 4, 5]) 
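+    # categories 3 and 4 are unobserved; with observed=False, transform("min") must keep the int32 dtype rather than coercing to float (GH#58084)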
+ + gb = df.groupby(categories, observed=False) + result = gb.transform("min") + + expected = DataFrame({"Y": [9435, -5465765, -5465765, 0, 9435]}, dtype="int32") + tm.assert_frame_equal(expected, result) + + +def test_nan_in_cumsum_group_label(): + # GH#58811 + df = DataFrame({"A": [1, None], "B": [2, 3]}, dtype="Int16") + gb = df.groupby("A")["B"] + result = gb.cumsum() + expected = Series([2, None], dtype="Int16", name="B") + tm.assert_series_equal(expected, result) diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py index 338509dd239e6..6036eddce7a01 100644 --- a/pandas/tests/indexes/base_class/test_constructors.py +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -59,22 +59,15 @@ def test_index_string_inference(self): ser = Index(["a", 1]) tm.assert_index_equal(ser, expected) - def test_inference_on_pandas_objects(self): + @pytest.mark.parametrize("klass", [Series, Index]) + def test_inference_on_pandas_objects(self, klass): # GH#56012 - idx = Index([pd.Timestamp("2019-12-31")], dtype=object) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - result = Index(idx) - assert result.dtype != np.object_ - - ser = Series([pd.Timestamp("2019-12-31")], dtype=object) - - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - result = Index(ser) - assert result.dtype != np.object_ + obj = klass([pd.Timestamp("2019-12-31")], dtype=object) + result = Index(obj) + assert result.dtype == np.object_ def test_constructor_not_read_only(self): # GH#57130 ser = Series([1, 2], dtype=object) - with pd.option_context("mode.copy_on_write", True): - idx = Index(ser) - assert idx._values.flags.writeable + idx = Index(ser) + assert idx._values.flags.writeable diff --git a/pandas/tests/indexes/datetimes/methods/test_normalize.py b/pandas/tests/indexes/datetimes/methods/test_normalize.py index 74711f67e6446..0ce28d60b53b9 100644 --- a/pandas/tests/indexes/datetimes/methods/test_normalize.py +++ b/pandas/tests/indexes/datetimes/methods/test_normalize.py @@ -2,6 +2,7 @@ import numpy as np import pytest +from pandas.compat import WASM import pandas.util._test_decorators as td from pandas import ( @@ -70,6 +71,9 @@ def test_normalize_tz(self): assert not rng.is_normalized @td.skip_if_windows + @pytest.mark.skipif( + WASM, reason="tzset is available only on Unix-like systems, not WASM" + ) @pytest.mark.parametrize( "timezone", [ diff --git a/pandas/tests/indexes/datetimes/methods/test_resolution.py b/pandas/tests/indexes/datetimes/methods/test_resolution.py index 8399fafbbaff2..42c3ab0617b7c 100644 --- a/pandas/tests/indexes/datetimes/methods/test_resolution.py +++ b/pandas/tests/indexes/datetimes/methods/test_resolution.py @@ -1,7 +1,10 @@ from dateutil.tz import tzlocal import pytest -from pandas.compat import IS64 +from pandas.compat import ( + IS64, + WASM, +) from pandas import date_range @@ -20,9 +23,10 @@ ("us", "microsecond"), ], ) +@pytest.mark.skipif(WASM, reason="OverflowError received on WASM") def test_dti_resolution(request, tz_naive_fixture, freq, expected): tz = tz_naive_fixture - if freq == "YE" and not IS64 and isinstance(tz, tzlocal): + if freq == "YE" and ((not IS64) or WASM) and isinstance(tz, tzlocal): request.applymarker( pytest.mark.xfail(reason="OverflowError inside tzlocal past 2038") ) diff --git a/pandas/tests/indexes/datetimes/methods/test_to_period.py b/pandas/tests/indexes/datetimes/methods/test_to_period.py index 5b2cc55d6dc56..8e279162b7012 100644 --- 
a/pandas/tests/indexes/datetimes/methods/test_to_period.py +++ b/pandas/tests/indexes/datetimes/methods/test_to_period.py @@ -90,24 +90,14 @@ def test_dti_to_period_2monthish(self, freq_offset, freq_period): tm.assert_index_equal(pi, period_range("2020-01", "2020-05", freq=freq_period)) @pytest.mark.parametrize( - "freq, freq_depr", - [ - ("2ME", "2M"), - ("2QE", "2Q"), - ("2QE-SEP", "2Q-SEP"), - ("1YE", "1Y"), - ("2YE-MAR", "2Y-MAR"), - ], + "freq", ["2ME", "1me", "2QE", "2QE-SEP", "1YE", "ye", "2YE-MAR"] ) - def test_to_period_frequency_M_Q_Y_deprecated(self, freq, freq_depr): - # GH#9586 - msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." + def test_to_period_frequency_M_Q_Y_raises(self, freq): + msg = f"Invalid frequency: {freq}" - rng = date_range("01-Jan-2012", periods=8, freq=freq) - prng = rng.to_period() - with tm.assert_produces_warning(FutureWarning, match=msg): - assert prng.freq == freq_depr + rng = date_range("01-Jan-2012", periods=8, freq="ME") + with pytest.raises(ValueError, match=msg): + rng.to_period(freq) def test_to_period_infer(self): # https://github.com/pandas-dev/pandas/issues/33358 @@ -208,10 +198,16 @@ def test_to_period_nofreq(self): assert idx.freqstr is None tm.assert_index_equal(idx.to_period(), expected) - @pytest.mark.parametrize("freq", ["2BMS", "1SME-15"]) + @pytest.mark.parametrize("freq", ["2BME", "SME-15", "2BMS"]) def test_to_period_offsets_not_supported(self, freq): # GH#56243 - msg = f"{freq[1:]} is not supported as period frequency" + msg = "|".join( + [ + f"Invalid frequency: {freq}", + f"{freq} is not supported as period frequency", + ] + ) + ts = date_range("1/1/2012", periods=4, freq=freq) with pytest.raises(ValueError, match=msg): ts.to_period() diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index 4be45e834ce31..43a7cdf63d9b9 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -16,7 +16,6 @@ import pytz from pandas._libs.tslibs import ( - OutOfBoundsDatetime, astype_overflowsafe, timezones, ) @@ -519,7 +518,7 @@ def test_construction_dti_with_mixed_timezones(self): Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), Timestamp("2011-01-02 10:00", tz="US/Eastern"), ], - dtype="M8[ns, US/Eastern]", + dtype="M8[s, US/Eastern]", name="idx", ) tm.assert_index_equal(dti, expected) @@ -541,31 +540,25 @@ def test_construction_outofbounds(self): datetime(5000, 1, 1), datetime(6000, 1, 1), ] - exp = Index(dates, dtype=object) - # coerces to object - tm.assert_index_equal(Index(dates), exp) + exp = Index(dates, dtype="M8[us]") + res = Index(dates) + tm.assert_index_equal(res, exp) - msg = "^Out of bounds nanosecond timestamp: 3000-01-01 00:00:00, at position 0$" - with pytest.raises(OutOfBoundsDatetime, match=msg): - # can't create DatetimeIndex - DatetimeIndex(dates) + DatetimeIndex(dates) @pytest.mark.parametrize("data", [["1400-01-01"], [datetime(1400, 1, 1)]]) def test_dti_date_out_of_range(self, data): # GH#1475 - msg = ( - "^Out of bounds nanosecond timestamp: " - "1400-01-01( 00:00:00)?, at position 0$" - ) - with pytest.raises(OutOfBoundsDatetime, match=msg): - DatetimeIndex(data) + DatetimeIndex(data) def test_construction_with_ndarray(self): # GH 5152 dates = [datetime(2013, 10, 7), datetime(2013, 10, 8), datetime(2013, 10, 9)] data = DatetimeIndex(dates, freq=offsets.BDay()).values result = DatetimeIndex(data, freq=offsets.BDay()) - 
expected = DatetimeIndex(["2013-10-07", "2013-10-08", "2013-10-09"], freq="B") + expected = DatetimeIndex( + ["2013-10-07", "2013-10-08", "2013-10-09"], dtype="M8[us]", freq="B" + ) tm.assert_index_equal(result, expected) def test_integer_values_and_tz_interpreted_as_utc(self): @@ -603,7 +596,7 @@ def test_constructor_coverage(self): expected = DatetimeIndex(strings.astype("O")) tm.assert_index_equal(result, expected) - from_ints = DatetimeIndex(expected.asi8) + from_ints = DatetimeIndex(expected.as_unit("ns").asi8).as_unit("s") tm.assert_index_equal(from_ints, expected) # string with NaT @@ -612,7 +605,7 @@ def test_constructor_coverage(self): expected = DatetimeIndex(strings.astype("O")) tm.assert_index_equal(result, expected) - from_ints = DatetimeIndex(expected.asi8) + from_ints = DatetimeIndex(expected.as_unit("ns").asi8).as_unit("s") tm.assert_index_equal(from_ints, expected) # non-conforming @@ -781,8 +774,10 @@ def test_constructor_timestamp_near_dst(self): Timestamp("2016-10-30 03:00:00+0300", tz="Europe/Helsinki"), Timestamp("2016-10-30 03:00:00+0200", tz="Europe/Helsinki"), ] - result = DatetimeIndex(ts) - expected = DatetimeIndex([ts[0].to_pydatetime(), ts[1].to_pydatetime()]) + result = DatetimeIndex(ts).as_unit("ns") + expected = DatetimeIndex( + [ts[0].to_pydatetime(), ts[1].to_pydatetime()] + ).as_unit("ns") tm.assert_index_equal(result, expected) @pytest.mark.parametrize("klass", [Index, DatetimeIndex]) @@ -825,7 +820,7 @@ def test_construction_from_replaced_timestamps_with_dst(self): "2005-06-01 00:00:00", ], tz="Australia/Melbourne", - ) + ).as_unit("ns") tm.assert_index_equal(result, expected) def test_construction_with_tz_and_tz_aware_dti(self): @@ -837,8 +832,8 @@ def test_construction_with_tz_and_tz_aware_dti(self): def test_construction_with_nat_and_tzlocal(self): tz = dateutil.tz.tzlocal() - result = DatetimeIndex(["2018", "NaT"], tz=tz) - expected = DatetimeIndex([Timestamp("2018", tz=tz), pd.NaT]) + result = DatetimeIndex(["2018", "NaT"], tz=tz).as_unit("ns") + expected = DatetimeIndex([Timestamp("2018", tz=tz), pd.NaT]).as_unit("ns") tm.assert_index_equal(result, expected) def test_constructor_with_ambiguous_keyword_arg(self): @@ -881,7 +876,7 @@ def test_constructor_with_nonexistent_keyword_arg(self, warsaw): Timestamp("2015-03-29 03:00:00+02:00", tz=timezone), Timestamp("2015-03-29 04:00:00+02:00", tz=timezone), ] - ) + ).as_unit("ns") tm.assert_index_equal(result, expected) @@ -893,7 +888,7 @@ def test_constructor_with_nonexistent_keyword_arg(self, warsaw): Timestamp("2015-03-29 01:00:00+01:00", tz=timezone), Timestamp("2015-03-29 03:00:00+02:00", tz=timezone), ] - ) + ).as_unit("ns") tm.assert_index_equal(result, expected) @@ -934,13 +929,16 @@ def test_dti_tz_constructors(self, tzstr): arr = ["11/10/2005 08:00:00", "11/10/2005 09:00:00"] idx1 = to_datetime(arr).tz_localize(tzstr) - idx2 = date_range(start="2005-11-10 08:00:00", freq="h", periods=2, tz=tzstr) + idx2 = date_range( + start="2005-11-10 08:00:00", freq="h", periods=2, tz=tzstr, unit="s" + ) idx2 = idx2._with_freq(None) # the others all have freq=None - idx3 = DatetimeIndex(arr, tz=tzstr) - idx4 = DatetimeIndex(np.array(arr), tz=tzstr) + idx3 = DatetimeIndex(arr, tz=tzstr).as_unit("s") + idx4 = DatetimeIndex(np.array(arr), tz=tzstr).as_unit("s") - for other in [idx2, idx3, idx4]: - tm.assert_index_equal(idx1, other) + tm.assert_index_equal(idx1, idx2) + tm.assert_index_equal(idx1, idx3) + tm.assert_index_equal(idx1, idx4) def test_dti_construction_idempotent(self, unit): rng = date_range( @@ 
-1187,9 +1185,9 @@ def test_dti_constructor_object_dtype_dayfirst_yearfirst_with_tz(self): yfirst = Timestamp(2005, 10, 16, tz="US/Pacific") result1 = DatetimeIndex([val], tz="US/Pacific", dayfirst=True) - expected1 = DatetimeIndex([dfirst]) + expected1 = DatetimeIndex([dfirst]).as_unit("s") tm.assert_index_equal(result1, expected1) result2 = DatetimeIndex([val], tz="US/Pacific", yearfirst=True) - expected2 = DatetimeIndex([yfirst]) + expected2 = DatetimeIndex([yfirst]).as_unit("s") tm.assert_index_equal(result2, expected2) diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 99d05dd0f26e4..8bf51bcd38862 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -144,24 +144,12 @@ def test_date_range_fractional_period(self): with pytest.raises(TypeError, match=msg): date_range("1/1/2000", periods=10.5) - @pytest.mark.parametrize( - "freq,freq_depr", - [ - ("2ME", "2M"), - ("2SME", "2SM"), - ("2BQE", "2BQ"), - ("2BYE", "2BY"), - ], - ) - def test_date_range_frequency_M_SM_BQ_BY_deprecated(self, freq, freq_depr): - # GH#52064 - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." - - expected = date_range("1/1/2000", periods=4, freq=freq) - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - result = date_range("1/1/2000", periods=4, freq=freq_depr) - tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("freq", ["2M", "1m", "2SM", "2BQ", "1bq", "2BY"]) + def test_date_range_frequency_M_SM_BQ_BY_raises(self, freq): + msg = f"Invalid frequency: {freq}" + + with pytest.raises(ValueError, match=msg): + date_range("1/1/2000", periods=4, freq=freq) def test_date_range_tuple_freq_raises(self): # GH#34703 @@ -777,34 +765,13 @@ def test_frequency_H_T_S_L_U_N_raises(self, freq): date_range("1/1/2000", periods=2, freq=freq) @pytest.mark.parametrize( - "freq,freq_depr", - [ - ("YE", "Y"), - ("YE-MAY", "Y-MAY"), - ], + "freq_depr", ["m", "bm", "CBM", "SM", "BQ", "q-feb", "y-may", "Y-MAY"] ) - def test_frequencies_Y_renamed(self, freq, freq_depr): - # GH#9586, GH#54275 - freq_msg = re.split("[0-9]*", freq, maxsplit=1)[1] - freq_depr_msg = re.split("[0-9]*", freq_depr, maxsplit=1)[1] - msg = f"'{freq_depr_msg}' is deprecated and will be removed " - f"in a future version, please use '{freq_msg}' instead." - - expected = date_range("1/1/2000", periods=2, freq=freq) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = date_range("1/1/2000", periods=2, freq=freq_depr) - tm.assert_index_equal(result, expected) + def test_frequency_raises(self, freq_depr): + msg = f"Invalid frequency: {freq_depr}" - def test_to_offset_with_lowercase_deprecated_freq(self) -> None: - # https://github.com/pandas-dev/pandas/issues/56847 - msg = ( - "'m' is deprecated and will be removed in a future version, please use " - "'ME' instead." 
- ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = date_range("2010-01-01", periods=2, freq="m") - expected = DatetimeIndex(["2010-01-31", "2010-02-28"], freq="ME") - tm.assert_index_equal(result, expected) + with pytest.raises(ValueError, match=msg): + date_range("1/1/2000", periods=2, freq=freq_depr) def test_date_range_bday(self): sdate = datetime(1999, 12, 25) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 84a616f05cd63..cc2b802de2a16 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -157,29 +157,12 @@ def test_CBH_deprecated(self): tm.assert_index_equal(result, expected) - @pytest.mark.parametrize( - "freq, expected_values, freq_depr", - [ - ("2BYE-JUN", ["2016-06-30"], "2BY-JUN"), - ("2BME", ["2016-02-29", "2016-04-29", "2016-06-30"], "2BM"), - ("2BQE", ["2016-03-31"], "2BQ"), - ("1BQE-MAR", ["2016-03-31", "2016-06-30"], "1BQ-MAR"), - ], - ) - def test_BM_BQ_BY_deprecated(self, freq, expected_values, freq_depr): - # GH#52064 - msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." - - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = date_range(start="2016-02-21", end="2016-08-21", freq=freq_depr) - result = DatetimeIndex( - data=expected_values, - dtype="datetime64[ns]", - freq=freq, - ) + @pytest.mark.parametrize("freq", ["2BM", "1bm", "2BQ", "1BQ-MAR", "2BY-JUN", "1by"]) + def test_BM_BQ_BY_raises(self, freq): + msg = f"Invalid frequency: {freq}" - tm.assert_index_equal(result, expected) + with pytest.raises(ValueError, match=msg): + date_range(start="2016-02-21", end="2016-08-21", freq=freq) @pytest.mark.parametrize("freq", ["2BA-MAR", "1BAS-MAY", "2AS-AUG"]) def test_BA_BAS_raises(self, freq): diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py index f766894a993a0..eb472b099fb1f 100644 --- a/pandas/tests/indexes/datetimes/test_scalar_compat.py +++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -11,6 +11,8 @@ import locale import unicodedata +from hypothesis import given +import hypothesis.strategies as st import numpy as np import pytest @@ -328,3 +330,122 @@ def test_dti_is_month_start_custom(self): msg = "Custom business days is not supported by is_month_start" with pytest.raises(ValueError, match=msg): dti.is_month_start + + @pytest.mark.parametrize( + "timestamp, freq, periods, expected_values", + [ + ("2017-12-01", "MS", 3, np.array([False, True, False])), + ("2017-12-01", "QS", 3, np.array([True, False, False])), + ("2017-12-01", "YS", 3, np.array([True, True, True])), + ], + ) + def test_dti_dr_is_year_start(self, timestamp, freq, periods, expected_values): + # GH57377 + result = date_range(timestamp, freq=freq, periods=periods).is_year_start + tm.assert_numpy_array_equal(result, expected_values) + + @pytest.mark.parametrize( + "timestamp, freq, periods, expected_values", + [ + ("2017-12-01", "ME", 3, np.array([True, False, False])), + ("2017-12-01", "QE", 3, np.array([True, False, False])), + ("2017-12-01", "YE", 3, np.array([True, True, True])), + ], + ) + def test_dti_dr_is_year_end(self, timestamp, freq, periods, expected_values): + # GH57377 + result = date_range(timestamp, freq=freq, periods=periods).is_year_end + tm.assert_numpy_array_equal(result, expected_values) + + @pytest.mark.parametrize( + "timestamp, freq, periods, expected_values", 
+ [ + ("2017-12-01", "MS", 3, np.array([False, True, False])), + ("2017-12-01", "QS", 3, np.array([True, True, True])), + ("2017-12-01", "YS", 3, np.array([True, True, True])), + ], + ) + def test_dti_dr_is_quarter_start(self, timestamp, freq, periods, expected_values): + # GH57377 + result = date_range(timestamp, freq=freq, periods=periods).is_quarter_start + tm.assert_numpy_array_equal(result, expected_values) + + @pytest.mark.parametrize( + "timestamp, freq, periods, expected_values", + [ + ("2017-12-01", "ME", 3, np.array([True, False, False])), + ("2017-12-01", "QE", 3, np.array([True, True, True])), + ("2017-12-01", "YE", 3, np.array([True, True, True])), + ], + ) + def test_dti_dr_is_quarter_end(self, timestamp, freq, periods, expected_values): + # GH57377 + result = date_range(timestamp, freq=freq, periods=periods).is_quarter_end + tm.assert_numpy_array_equal(result, expected_values) + + @pytest.mark.parametrize( + "timestamp, freq, periods, expected_values", + [ + ("2017-12-01", "MS", 3, np.array([True, True, True])), + ("2017-12-01", "QS", 3, np.array([True, True, True])), + ("2017-12-01", "YS", 3, np.array([True, True, True])), + ], + ) + def test_dti_dr_is_month_start(self, timestamp, freq, periods, expected_values): + # GH57377 + result = date_range(timestamp, freq=freq, periods=periods).is_month_start + tm.assert_numpy_array_equal(result, expected_values) + + @pytest.mark.parametrize( + "timestamp, freq, periods, expected_values", + [ + ("2017-12-01", "ME", 3, np.array([True, True, True])), + ("2017-12-01", "QE", 3, np.array([True, True, True])), + ("2017-12-01", "YE", 3, np.array([True, True, True])), + ], + ) + def test_dti_dr_is_month_end(self, timestamp, freq, periods, expected_values): + # GH57377 + result = date_range(timestamp, freq=freq, periods=periods).is_month_end + tm.assert_numpy_array_equal(result, expected_values) + + def test_dti_is_year_quarter_start_doubledigit_freq(self): + # GH#58523 + dr = date_range("2017-01-01", periods=2, freq="10YS") + assert all(dr.is_year_start) + + dr = date_range("2017-01-01", periods=2, freq="10QS") + assert all(dr.is_quarter_start) + + def test_dti_is_year_start_freq_custom_business_day_with_digit(self): + # GH#58664 + dr = date_range("2020-01-01", periods=2, freq="2C") + msg = "Custom business days is not supported by is_year_start" + with pytest.raises(ValueError, match=msg): + dr.is_year_start + + @pytest.mark.parametrize("freq", ["3BMS", offsets.BusinessMonthBegin(3)]) + def test_dti_is_year_quarter_start_freq_business_month_begin(self, freq): + # GH#58729 + dr = date_range("2020-01-01", periods=5, freq=freq) + result = [x.is_year_start for x in dr] + assert result == [True, False, False, False, True] + + dr = date_range("2020-01-01", periods=4, freq=freq) + result = [x.is_quarter_start for x in dr] + assert all(result) + assert all(dr.is_quarter_start) + + +@given( + dt=st.datetimes(min_value=datetime(1960, 1, 1), max_value=datetime(1980, 1, 1)), + n=st.integers(min_value=1, max_value=10), + freq=st.sampled_from(["MS", "QS", "YS"]), +) +@pytest.mark.slow +def test_against_scalar_parametric(freq, dt, n): + # https://github.com/pandas-dev/pandas/issues/49606 + freq = f"{n}{freq}" + d = date_range(dt, periods=3, freq=freq) + result = list(d.is_year_start) + expected = [x.is_year_start for x in d] + assert result == expected diff --git a/pandas/tests/indexes/numeric/test_numeric.py b/pandas/tests/indexes/numeric/test_numeric.py index 676d33d2b0f81..8dde5febe810d 100644 --- a/pandas/tests/indexes/numeric/test_numeric.py +++
b/pandas/tests/indexes/numeric/test_numeric.py @@ -397,7 +397,7 @@ def test_constructor_corner(self, dtype): # preventing casting arr = np.array([1, "2", 3, "4"], dtype=object) - msg = "Trying to coerce float values to integers" + msg = "Trying to coerce object values to integers" with pytest.raises(ValueError, match=msg): index_cls(arr, dtype=dtype) diff --git a/pandas/tests/indexes/period/methods/test_asfreq.py b/pandas/tests/indexes/period/methods/test_asfreq.py index ea305a9766103..8fca53c28a036 100644 --- a/pandas/tests/indexes/period/methods/test_asfreq.py +++ b/pandas/tests/indexes/period/methods/test_asfreq.py @@ -142,21 +142,24 @@ def test_asfreq_with_different_n(self): tm.assert_series_equal(result, excepted) @pytest.mark.parametrize( - "freq, is_str", + "freq", [ - ("2BMS", True), - ("2YS-MAR", True), - ("2bh", True), - (offsets.MonthBegin(2), False), - (offsets.BusinessMonthEnd(2), False), + "2BMS", + "2YS-MAR", + "2bh", + offsets.MonthBegin(2), + offsets.BusinessMonthEnd(2), ], ) - def test_pi_asfreq_not_supported_frequency(self, freq, is_str): + def test_pi_asfreq_not_supported_frequency(self, freq): # GH#55785, GH#56945 - if is_str: - msg = f"{freq[1:]} is not supported as period frequency" - else: - msg = re.escape(f"{freq} is not supported as period frequency") + msg = "|".join( + [ + f"Invalid frequency: {freq}", + re.escape(f"{freq} is not supported as period frequency"), + "bh is not supported as period frequency", + ] + ) pi = PeriodIndex(["2020-01-01", "2021-01-01"], freq="M") with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index 6aba9f17326ba..aca765e7167b2 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -33,7 +33,7 @@ class TestPeriodIndexDisallowedFreqs: ) def test_period_index_offsets_frequency_error_message(self, freq, freq_depr): # GH#52064 - msg = f"for Period, please use '{freq[1:]}' instead of '{freq_depr[1:]}'" + msg = f"Invalid frequency: {freq_depr}" with pytest.raises(ValueError, match=msg): PeriodIndex(["2020-01-01", "2020-01-02"], freq=freq_depr) @@ -41,20 +41,23 @@ def test_period_index_offsets_frequency_error_message(self, freq, freq_depr): with pytest.raises(ValueError, match=msg): period_range(start="2020-01-01", end="2020-01-02", freq=freq_depr) - @pytest.mark.parametrize("freq_depr", ["2SME", "2sme", "2CBME", "2BYE", "2Bye"]) - def test_period_index_frequency_invalid_freq(self, freq_depr): + @pytest.mark.parametrize( + "freq", + ["2SME", "2sme", "2BYE", "2Bye", "2CBME"], + ) + def test_period_index_frequency_invalid_freq(self, freq): # GH#9586 - msg = f"Invalid frequency: {freq_depr[1:]}" + msg = f"Invalid frequency: {freq}" with pytest.raises(ValueError, match=msg): - period_range("2020-01", "2020-05", freq=freq_depr) + period_range("2020-01", "2020-05", freq=freq) with pytest.raises(ValueError, match=msg): - PeriodIndex(["2020-01", "2020-05"], freq=freq_depr) + PeriodIndex(["2020-01", "2020-05"], freq=freq) @pytest.mark.parametrize("freq", ["2BQE-SEP", "2BYE-MAR", "2BME"]) def test_period_index_from_datetime_index_invalid_freq(self, freq): # GH#56899 - msg = f"Invalid frequency: {freq[1:]}" + msg = f"Invalid frequency: {freq}" rng = date_range("01-Jan-2012", periods=8, freq=freq) with pytest.raises(ValueError, match=msg): @@ -542,9 +545,7 @@ def test_mixed_freq_raises(self): with tm.assert_produces_warning(FutureWarning, match=msg): end_intv = Period("2005-05-01", "B") 
- msg = "'w' is deprecated and will be removed in a future version." - with tm.assert_produces_warning(FutureWarning, match=msg): - vals = [end_intv, Period("2006-12-31", "w")] + vals = [end_intv, Period("2006-12-31", "W")] msg = r"Input has different freq=W-SUN from PeriodIndex\(freq=B\)" depr_msg = r"PeriodDtype\[B\] is deprecated" with pytest.raises(IncompatibleFrequency, match=msg): diff --git a/pandas/tests/indexes/period/test_period_range.py b/pandas/tests/indexes/period/test_period_range.py index 67f4d7421df23..4e58dc1f324b2 100644 --- a/pandas/tests/indexes/period/test_period_range.py +++ b/pandas/tests/indexes/period/test_period_range.py @@ -181,10 +181,8 @@ def test_construction_from_period(self): def test_mismatched_start_end_freq_raises(self): depr_msg = "Period with BDay freq is deprecated" - msg = "'w' is deprecated and will be removed in a future version." - with tm.assert_produces_warning(FutureWarning, match=msg): - end_w = Period("2006-12-31", "1w") + end_w = Period("2006-12-31", "1W") with tm.assert_produces_warning(FutureWarning, match=depr_msg): start_b = Period("02-Apr-2005", "B") end_b = Period("2005-05-01", "B") @@ -214,14 +212,13 @@ def test_uppercase_freq_deprecated_from_time_series(self, freq_depr): with tm.assert_produces_warning(FutureWarning, match=msg): period_range("2020-01-01 00:00:00 00:00", periods=2, freq=freq_depr) - @pytest.mark.parametrize("freq_depr", ["2m", "2q-sep", "2y", "2w"]) - def test_lowercase_freq_deprecated_from_time_series(self, freq_depr): + @pytest.mark.parametrize("freq", ["2m", "2q-sep", "2y"]) + def test_lowercase_freq_from_time_series_raises(self, freq): # GH#52536, GH#54939 - msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " - f"future version. Please use '{freq_depr.upper()[1:]}' instead." + msg = f"Invalid frequency: {freq}" - with tm.assert_produces_warning(FutureWarning, match=msg): - period_range(freq=freq_depr, start="1/1/2001", end="12/1/2009") + with pytest.raises(ValueError, match=msg): + period_range(freq=freq, start="1/1/2001", end="12/1/2009") @pytest.mark.parametrize("freq", ["2A", "2a", "2A-AUG", "2A-aug"]) def test_A_raises_from_time_series(self, freq): @@ -229,3 +226,12 @@ def test_A_raises_from_time_series(self, freq): with pytest.raises(ValueError, match=msg): period_range(freq=freq, start="1/1/2001", end="12/1/2009") + + @pytest.mark.parametrize("freq", ["2w"]) + def test_lowercase_freq_from_time_series_deprecated(self, freq): + # GH#52536, GH#54939 + msg = f"'{freq[1:]}' is deprecated and will be removed in a " + f"future version. Please use '{freq.upper()[1:]}' instead." 
+ + with tm.assert_produces_warning(FutureWarning, match=msg): + period_range(freq=freq, start="1/1/2001", end="12/1/2009") diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 2e94961b673f8..16908fbb4fecc 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -104,16 +104,9 @@ def test_constructor_copy(self, using_infer_string): ) def test_constructor_from_index_dtlike(self, cast_as_obj, index): if cast_as_obj: - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - result = Index(index.astype(object)) - else: - result = Index(index) - - tm.assert_index_equal(result, index) - - if isinstance(index, DatetimeIndex): - assert result.tz == index.tz - if cast_as_obj: + result = Index(index.astype(object)) + assert result.dtype == np.dtype(object) + if isinstance(index, DatetimeIndex): # GH#23524 check that Index(dti, dtype=object) does not # incorrectly raise ValueError, and that nanoseconds are not # dropped @@ -121,6 +114,10 @@ def test_constructor_from_index_dtlike(self, cast_as_obj, index): result = Index(index, dtype=object) assert result.dtype == np.object_ assert list(result) == list(index) + else: + result = Index(index) + + tm.assert_index_equal(result, index) @pytest.mark.parametrize( "index,has_tz", @@ -186,7 +183,7 @@ def test_constructor_int_dtype_nan(self): "klass,dtype,na_val", [ (Index, np.float64, np.nan), - (DatetimeIndex, "datetime64[ns]", pd.NaT), + (DatetimeIndex, "datetime64[s]", pd.NaT), ], ) def test_index_ctor_infer_nan_nat(self, klass, dtype, na_val): @@ -1561,7 +1558,7 @@ def test_ensure_index_uint64(self): def test_get_combined_index(self): result = _get_combined_index([]) - expected = Index([]) + expected = RangeIndex(0) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index b6e1c3698c258..43445433e2a04 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -270,9 +270,7 @@ def test_searchsorted_monotonic(self, index_flat, request): # all values are the same, expected_right should be length expected_right = len(index) - # test _searchsorted_monotonic in all cases - # test searchsorted only for increasing - if index.is_monotonic_increasing: + if index.is_monotonic_increasing or index.is_monotonic_decreasing: ssm_left = index._searchsorted_monotonic(value, side="left") assert expected_left == ssm_left @@ -284,13 +282,6 @@ def test_searchsorted_monotonic(self, index_flat, request): ss_right = index.searchsorted(value, side="right") assert expected_right == ss_right - - elif index.is_monotonic_decreasing: - ssm_left = index._searchsorted_monotonic(value, side="left") - assert expected_left == ssm_left - - ssm_right = index._searchsorted_monotonic(value, side="right") - assert expected_right == ssm_right else: # non-monotonic should raise. 
msg = "index must be monotonic increasing or decreasing" diff --git a/pandas/tests/indexes/test_index_new.py b/pandas/tests/indexes/test_index_new.py index b544ebac43ece..4a31ae88a757a 100644 --- a/pandas/tests/indexes/test_index_new.py +++ b/pandas/tests/indexes/test_index_new.py @@ -61,16 +61,16 @@ def test_infer_nat(self, val): values = [NaT, val] idx = Index(values) - assert idx.dtype == "datetime64[ns]" and idx.isna().all() + assert idx.dtype == "datetime64[s]" and idx.isna().all() idx = Index(values[::-1]) - assert idx.dtype == "datetime64[ns]" and idx.isna().all() + assert idx.dtype == "datetime64[s]" and idx.isna().all() idx = Index(np.array(values, dtype=object)) - assert idx.dtype == "datetime64[ns]" and idx.isna().all() + assert idx.dtype == "datetime64[s]" and idx.isna().all() idx = Index(np.array(values, dtype=object)[::-1]) - assert idx.dtype == "datetime64[ns]" and idx.isna().all() + assert idx.dtype == "datetime64[s]" and idx.isna().all() @pytest.mark.parametrize("na_value", [None, np.nan]) @pytest.mark.parametrize("vtype", [list, tuple, iter]) @@ -138,6 +138,9 @@ def test_constructor_infer_nat_dt_like( ) expected = klass([NaT, NaT]) + if dtype[0] == "d": + # we infer all-NaT as second resolution + expected = expected.astype("M8[ns]") assert expected.dtype == dtype data = [ctor] data.insert(pos, nulls_fixture) diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index b28c3cba7d310..efae0b4dd84cc 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -284,6 +284,7 @@ def test_detect_chained_assignment_changing_dtype(self): with tm.raises_chained_assignment_error(): df.loc[2]["C"] = "foo" tm.assert_frame_equal(df, df_original) + # TODO: Use tm.raises_chained_assignment_error() when PDEP-6 is enforced with tm.raises_chained_assignment_error( extra_warnings=(FutureWarning,), extra_match=(None,) ): diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index d4bc0341e732e..84cd0d3b08b7b 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -598,7 +598,7 @@ def test_fillna_complex128(self, index_or_series, fill_val, fill_dtype): @pytest.mark.parametrize( "fill_val,fill_dtype", [ - (pd.Timestamp("2012-01-01"), "datetime64[ns]"), + (pd.Timestamp("2012-01-01"), "datetime64[s]"), (pd.Timestamp("2012-01-01", tz="US/Eastern"), object), (1, object), ("x", object), @@ -615,7 +615,7 @@ def test_fillna_datetime(self, index_or_series, fill_val, fill_dtype): pd.Timestamp("2011-01-04"), ] ) - assert obj.dtype == "datetime64[ns]" + assert obj.dtype == "datetime64[s]" exp = klass( [ @@ -630,10 +630,10 @@ def test_fillna_datetime(self, index_or_series, fill_val, fill_dtype): @pytest.mark.parametrize( "fill_val,fill_dtype", [ - (pd.Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[ns, US/Eastern]"), + (pd.Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[s, US/Eastern]"), (pd.Timestamp("2012-01-01"), object), # pre-2.0 with a mismatched tz we would get object result - (pd.Timestamp("2012-01-01", tz="Asia/Tokyo"), "datetime64[ns, US/Eastern]"), + (pd.Timestamp("2012-01-01", tz="Asia/Tokyo"), "datetime64[s, US/Eastern]"), (1, object), ("x", object), ], @@ -650,7 +650,7 @@ def test_fillna_datetime64tz(self, index_or_series, fill_val, fill_dtype): pd.Timestamp("2011-01-04", tz=tz), ] ) - assert obj.dtype == "datetime64[ns, US/Eastern]" + assert obj.dtype == "datetime64[s, 
US/Eastern]" if getattr(fill_val, "tz", None) is None: fv = fill_val @@ -830,6 +830,7 @@ def replacer(self, how, from_key, to_key): def test_replace_series(self, how, to_key, from_key, replacer): index = pd.Index([3, 4], name="xxx") obj = pd.Series(self.rep[from_key], index=index, name="yyy") + obj = obj.astype(from_key) assert obj.dtype == from_key if from_key.startswith("datetime") and to_key.startswith("datetime"): @@ -850,7 +851,6 @@ def test_replace_series(self, how, to_key, from_key, replacer): else: exp = pd.Series(self.rep[to_key], index=index, name="yyy") - assert exp.dtype == to_key result = obj.replace(replacer) tm.assert_series_equal(result, exp, check_dtype=False) @@ -867,7 +867,7 @@ def test_replace_series_datetime_tz( self, how, to_key, from_key, replacer, using_infer_string ): index = pd.Index([3, 4], name="xyz") - obj = pd.Series(self.rep[from_key], index=index, name="yyy") + obj = pd.Series(self.rep[from_key], index=index, name="yyy").dt.as_unit("ns") assert obj.dtype == from_key exp = pd.Series(self.rep[to_key], index=index, name="yyy") @@ -891,7 +891,7 @@ def test_replace_series_datetime_tz( ) def test_replace_series_datetime_datetime(self, how, to_key, from_key, replacer): index = pd.Index([3, 4], name="xyz") - obj = pd.Series(self.rep[from_key], index=index, name="yyy") + obj = pd.Series(self.rep[from_key], index=index, name="yyy").dt.as_unit("ns") assert obj.dtype == from_key exp = pd.Series(self.rep[to_key], index=index, name="yyy") @@ -900,8 +900,8 @@ def test_replace_series_datetime_datetime(self, how, to_key, from_key, replacer) ): # with mismatched tzs, we retain the original dtype as of 2.0 exp = exp.astype(obj.dtype) - else: - assert exp.dtype == to_key + elif to_key == from_key: + exp = exp.dt.as_unit("ns") result = obj.replace(replacer) tm.assert_series_equal(result, exp, check_dtype=False) diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py index 8597ee1198ff0..f9b9e8a6c7c28 100644 --- a/pandas/tests/indexing/test_floats.py +++ b/pandas/tests/indexing/test_floats.py @@ -488,24 +488,12 @@ def test_floating_misc(self, indexer_sl): for fancy_idx in [[5, 0], np.array([5, 0])]: tm.assert_series_equal(indexer_sl(s)[fancy_idx], expected) - warn = FutureWarning if indexer_sl is tm.setitem else None - msg = r"The behavior of obj\[i:j\] with a float-dtype index" - # all should return the same as we are slicing 'the same' - with tm.assert_produces_warning(warn, match=msg): - result1 = indexer_sl(s)[2:5] result2 = indexer_sl(s)[2.0:5.0] result3 = indexer_sl(s)[2.0:5] result4 = indexer_sl(s)[2.1:5] - tm.assert_series_equal(result1, result2) - tm.assert_series_equal(result1, result3) - tm.assert_series_equal(result1, result4) - - expected = Series([1, 2], index=[2.5, 5.0]) - with tm.assert_produces_warning(warn, match=msg): - result = indexer_sl(s)[2:5] - - tm.assert_series_equal(result, expected) + tm.assert_series_equal(result2, result3) + tm.assert_series_equal(result2, result4) # list selection result1 = indexer_sl(s)[[0.0, 5, 10]] diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 172aa9878caec..8b90a6c32849d 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -114,7 +114,7 @@ def test_iloc_setitem_ea_inplace(self, frame_or_series, index_or_series_or_array if frame_or_series is Series: values = obj.values else: - values = obj._mgr.arrays[0] + values = obj._mgr.blocks[0].values if frame_or_series is Series: obj.iloc[:2] = 
index_or_series_or_array(arr[2:]) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 01dab14c7e528..16f3e0fd0c229 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -711,7 +711,7 @@ def test_loc_modify_datetime(self): {"date": [1485264372711, 1485265925110, 1540215845888, 1540282121025]} ) - df["date_dt"] = to_datetime(df["date"], unit="ms", cache=True) + df["date_dt"] = to_datetime(df["date"], unit="ms", cache=True).dt.as_unit("ms") df.loc[:, "date_dt_cp"] = df.loc[:, "date_dt"] df.loc[[2, 3], "date_dt_cp"] = df.loc[[2, 3], "date_dt"] @@ -865,6 +865,7 @@ def test_loc_setitem_frame_multiples(self): "val": Series([0, 1, 0, 1, 2], dtype=np.int64), } ) + expected["date"] = expected["date"].astype("M8[ns]") rhs = df.loc[0:2] rhs.index = df.index[2:5] df.loc[2:4] = rhs @@ -1814,7 +1815,7 @@ def test_loc_getitem_datetime_string_with_datetimeindex(self): result = df.loc[["2010-01-01", "2010-01-05"], ["a", "b"]] expected = DataFrame( {"a": [0, 4], "b": [0, 4]}, - index=DatetimeIndex(["2010-01-01", "2010-01-05"]), + index=DatetimeIndex(["2010-01-01", "2010-01-05"]).as_unit("ns"), ) tm.assert_frame_equal(result, expected) @@ -2082,7 +2083,7 @@ def test_setitem_with_expansion(self): expected = Series([v[0].tz_convert("UTC"), df.loc[1, "time"]], name="time") tm.assert_series_equal(df2.time, expected) - v = df.loc[df.new_col == "new", "time"] + Timedelta("1s") + v = df.loc[df.new_col == "new", "time"] + Timedelta("1s").as_unit("s") df.loc[df.new_col == "new", "time"] = v tm.assert_series_equal(df.loc[df.new_col == "new", "time"], v) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index b0a041ed5b69c..4d232d5ed1312 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -580,7 +580,7 @@ def test_partial_set_invalid(self): ], ), ( - date_range(start="2000", periods=20, freq="D"), + date_range(start="2000", periods=20, freq="D", unit="s"), ["2000-01-04", "2000-01-08", "2000-01-12"], [ Timestamp("2000-01-04"), diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 60e05c2c65124..64eca6ac643ca 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -603,7 +603,8 @@ def test_empty_dataframe(): ), ( pd.Series( - [datetime(2022, 1, 1), datetime(2022, 1, 2), datetime(2022, 1, 3)] + [datetime(2022, 1, 1), datetime(2022, 1, 2), datetime(2022, 1, 3)], + dtype="M8[ns]", ), (DtypeKind.DATETIME, 64, "tsn:", "="), (DtypeKind.INT, 64, ArrowCTypes.INT64, "="), diff --git a/pandas/tests/internals/test_api.py b/pandas/tests/internals/test_api.py index 7ab8988521fdf..591157bbe87fe 100644 --- a/pandas/tests/internals/test_api.py +++ b/pandas/tests/internals/test_api.py @@ -41,42 +41,18 @@ def test_namespace(): assert set(result) == set(expected + modules) -@pytest.mark.parametrize( - "name", - [ - "Block", - "ExtensionBlock", - "DatetimeTZBlock", - ], -) -def test_deprecations(name): - # GH#55139 - msg = f"{name} is deprecated.* Use public APIs instead" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - getattr(internals, name) - - def test_make_block_2d_with_dti(): # GH#41168 dti = pd.date_range("2012", periods=3, tz="UTC") - blk = api.make_block(dti, placement=[0]) + + msg = "make_block is deprecated" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + blk = api.make_block(dti, placement=[0]) assert blk.shape == (1, 3) assert blk.values.shape == 
(1, 3) -def test_create_block_manager_from_blocks_deprecated(): - # GH#33892 - # If they must, downstream packages should get this from internals.api, - # not internals. - msg = ( - "create_block_manager_from_blocks is deprecated and will be " - "removed in a future version. Use public APIs instead" - ) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - internals.create_block_manager_from_blocks - - def test_create_dataframe_from_blocks(float_frame): block = float_frame._mgr.blocks[0] index = float_frame.index.copy() diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 749e2c4a86b55..fca1ed39c0f9c 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1368,8 +1368,10 @@ def test_validate_ndim(): placement = BlockPlacement(slice(2)) msg = r"Wrong number of dimensions. values.ndim != ndim \[1 != 2\]" + depr_msg = "make_block is deprecated" with pytest.raises(ValueError, match=msg): - make_block(values, placement, ndim=2) + with tm.assert_produces_warning(DeprecationWarning, match=depr_msg): + make_block(values, placement, ndim=2) def test_block_shape(): @@ -1384,8 +1386,12 @@ def test_make_block_no_pandas_array(block_maker): # https://github.com/pandas-dev/pandas/pull/24866 arr = pd.arrays.NumpyExtensionArray(np.array([1, 2])) + depr_msg = "make_block is deprecated" + warn = DeprecationWarning if block_maker is make_block else None + # NumpyExtensionArray, no dtype - result = block_maker(arr, BlockPlacement(slice(len(arr))), ndim=arr.ndim) + with tm.assert_produces_warning(warn, match=depr_msg): + result = block_maker(arr, BlockPlacement(slice(len(arr))), ndim=arr.ndim) assert result.dtype.kind in ["i", "u"] if block_maker is make_block: @@ -1393,14 +1399,16 @@ def test_make_block_no_pandas_array(block_maker): assert result.is_extension is False # NumpyExtensionArray, NumpyEADtype - result = block_maker(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim) + with tm.assert_produces_warning(warn, match=depr_msg): + result = block_maker(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim) assert result.dtype.kind in ["i", "u"] assert result.is_extension is False - # new_block no longer taked dtype keyword + # new_block no longer accepts dtype keyword # ndarray, NumpyEADtype - result = block_maker( - arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim - ) + with tm.assert_produces_warning(warn, match=depr_msg): + result = block_maker( + arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim + ) assert result.dtype.kind in ["i", "u"] assert result.is_extension is False diff --git a/pandas/tests/io/data/stata/stata-compat-103.dta b/pandas/tests/io/data/stata/stata-compat-103.dta new file mode 100644 index 0000000000000..adfeb6c672333 Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-103.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-104.dta b/pandas/tests/io/data/stata/stata-compat-104.dta new file mode 100644 index 0000000000000..9bc3659afd31c Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-104.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-110.dta b/pandas/tests/io/data/stata/stata-compat-110.dta new file mode 100644 index 0000000000000..68e591aba829a Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-110.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-be-103.dta b/pandas/tests/io/data/stata/stata-compat-be-103.dta new file mode 100644 index 
0000000000000..0e2ef231f91c0 Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-be-103.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-be-104.dta b/pandas/tests/io/data/stata/stata-compat-be-104.dta new file mode 100644 index 0000000000000..98185d8ce27dc Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-be-104.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-be-110.dta b/pandas/tests/io/data/stata/stata-compat-be-110.dta new file mode 100644 index 0000000000000..0936be478028c Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-be-110.dta differ diff --git a/pandas/tests/io/data/stata/stata12_118.dta b/pandas/tests/io/data/stata/stata12_118.dta new file mode 100644 index 0000000000000..87c6d1f063150 Binary files /dev/null and b/pandas/tests/io/data/stata/stata12_118.dta differ diff --git a/pandas/tests/io/data/stata/stata12_119.dta b/pandas/tests/io/data/stata/stata12_119.dta new file mode 100644 index 0000000000000..fa63f0135738e Binary files /dev/null and b/pandas/tests/io/data/stata/stata12_119.dta differ diff --git a/pandas/tests/io/data/stata/stata12_be_117.dta b/pandas/tests/io/data/stata/stata12_be_117.dta new file mode 100644 index 0000000000000..7f84d15fb76d0 Binary files /dev/null and b/pandas/tests/io/data/stata/stata12_be_117.dta differ diff --git a/pandas/tests/io/data/stata/stata12_be_118.dta b/pandas/tests/io/data/stata/stata12_be_118.dta new file mode 100644 index 0000000000000..9ed6f39b0f9b5 Binary files /dev/null and b/pandas/tests/io/data/stata/stata12_be_118.dta differ diff --git a/pandas/tests/io/data/stata/stata12_be_119.dta b/pandas/tests/io/data/stata/stata12_be_119.dta new file mode 100644 index 0000000000000..3c9736d0f3af3 Binary files /dev/null and b/pandas/tests/io/data/stata/stata12_be_119.dta differ diff --git a/pandas/tests/io/data/stata/stata14_119.dta b/pandas/tests/io/data/stata/stata14_119.dta new file mode 100644 index 0000000000000..e64353213b1c9 Binary files /dev/null and b/pandas/tests/io/data/stata/stata14_119.dta differ diff --git a/pandas/tests/io/data/stata/stata14_be_118.dta b/pandas/tests/io/data/stata/stata14_be_118.dta new file mode 100644 index 0000000000000..584ec0984c49e Binary files /dev/null and b/pandas/tests/io/data/stata/stata14_be_118.dta differ diff --git a/pandas/tests/io/data/stata/stata14_be_119.dta b/pandas/tests/io/data/stata/stata14_be_119.dta new file mode 100644 index 0000000000000..09d08f7e992ea Binary files /dev/null and b/pandas/tests/io/data/stata/stata14_be_119.dta differ diff --git a/pandas/tests/io/data/stata/stata16_119.dta b/pandas/tests/io/data/stata/stata16_119.dta new file mode 100644 index 0000000000000..d03c489d4342d Binary files /dev/null and b/pandas/tests/io/data/stata/stata16_119.dta differ diff --git a/pandas/tests/io/data/stata/stata16_be_118.dta b/pandas/tests/io/data/stata/stata16_be_118.dta new file mode 100644 index 0000000000000..bae769c038820 Binary files /dev/null and b/pandas/tests/io/data/stata/stata16_be_118.dta differ diff --git a/pandas/tests/io/data/stata/stata16_be_119.dta b/pandas/tests/io/data/stata/stata16_be_119.dta new file mode 100644 index 0000000000000..e928a9713715d Binary files /dev/null and b/pandas/tests/io/data/stata/stata16_be_119.dta differ diff --git a/pandas/tests/io/data/stata/stata4_103.dta b/pandas/tests/io/data/stata/stata4_103.dta new file mode 100644 index 0000000000000..3c63935e63df9 Binary files /dev/null and b/pandas/tests/io/data/stata/stata4_103.dta differ diff --git 
a/pandas/tests/io/data/stata/stata4_104.dta b/pandas/tests/io/data/stata/stata4_104.dta new file mode 100644 index 0000000000000..c2517355ebff1 Binary files /dev/null and b/pandas/tests/io/data/stata/stata4_104.dta differ diff --git a/pandas/tests/io/data/stata/stata4_110.dta b/pandas/tests/io/data/stata/stata4_110.dta new file mode 100644 index 0000000000000..3ea01040448b0 Binary files /dev/null and b/pandas/tests/io/data/stata/stata4_110.dta differ diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index f0a72ba6163fa..6d6c3ad6b77a7 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -141,10 +141,13 @@ def df_ref(datapath): def get_exp_unit(read_ext: str, engine: str | None) -> str: - return "ns" + unit = "us" + if (read_ext == ".ods") ^ (engine == "calamine"): + unit = "s" + return unit -def adjust_expected(expected: DataFrame, read_ext: str, engine: str) -> None: +def adjust_expected(expected: DataFrame, read_ext: str, engine: str | None) -> None: expected.index.name = None unit = get_exp_unit(read_ext, engine) # error: "Index" has no attribute "as_unit" @@ -1117,7 +1120,6 @@ def test_read_excel_multiindex_blank_after_name( mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]], names=["c1", "c2"]) unit = get_exp_unit(read_ext, engine) - expected = DataFrame( [ [1, 2.5, pd.Timestamp("2015-01-01"), True], @@ -1675,6 +1677,7 @@ def test_read_datetime_multiindex(self, request, engine, read_ext): actual = pd.read_excel(excel, header=[0, 1], index_col=0, engine=engine) unit = get_exp_unit(read_ext, engine) + dti = pd.DatetimeIndex(["2020-02-29", "2020-03-01"], dtype=f"M8[{unit}]") expected_column_index = MultiIndex.from_arrays( [dti[:1], dti[1:]], diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 508fc47d0920b..744fe20e4995d 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -37,7 +37,9 @@ def get_exp_unit(path: str) -> str: - return "ns" + if path.endswith(".ods"): + return "s" + return "us" @pytest.fixture @@ -293,25 +295,15 @@ def test_read_excel_parse_dates(self, tmp_excel): tm.assert_frame_equal(df2, res) res = pd.read_excel(tmp_excel, parse_dates=["date_strings"], index_col=0) - tm.assert_frame_equal(df, res) + expected = df[:] + expected["date_strings"] = expected["date_strings"].astype("M8[s]") + tm.assert_frame_equal(res, expected) - date_parser = lambda x: datetime.strptime(x, "%m/%d/%Y") - with tm.assert_produces_warning( - FutureWarning, - match="use 'date_format' instead", - raise_on_extra_warnings=False, - ): - res = pd.read_excel( - tmp_excel, - parse_dates=["date_strings"], - date_parser=date_parser, - index_col=0, - ) - tm.assert_frame_equal(df, res) res = pd.read_excel( tmp_excel, parse_dates=["date_strings"], date_format="%m/%d/%Y", index_col=0 ) - tm.assert_frame_equal(df, res) + expected["date_strings"] = expected["date_strings"].astype("M8[s]") + tm.assert_frame_equal(expected, res) def test_multiindex_interval_datetimes(self, tmp_excel): # GH 30986 @@ -560,6 +552,7 @@ def test_sheets(self, frame, tmp_excel): columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=5, freq="B"), ) + index = pd.DatetimeIndex(np.asarray(tsframe.index), freq=None) tsframe.index = index @@ -708,7 +701,6 @@ def test_excel_date_datetime_format(self, ext, tmp_excel, tmp_path): # # Excel output format strings unit = get_exp_unit(tmp_excel) - df = DataFrame( [ [date(2014, 1, 31), date(1999, 9, 24)], @@ 
-745,6 +737,9 @@ def test_excel_date_datetime_format(self, ext, tmp_excel, tmp_path): with ExcelFile(filename2) as reader2: rs2 = pd.read_excel(reader2, sheet_name="test1", index_col=0) + # TODO: why do we get different units? + rs2 = rs2.astype(f"M8[{unit}]") + tm.assert_frame_equal(rs1, rs2) # Since the reader returns a datetime object for dates, diff --git a/pandas/tests/io/formats/style/test_matplotlib.py b/pandas/tests/io/formats/style/test_matplotlib.py index 70ddd65c02d14..296fb20d855c4 100644 --- a/pandas/tests/io/formats/style/test_matplotlib.py +++ b/pandas/tests/io/formats/style/test_matplotlib.py @@ -7,11 +7,9 @@ Series, ) -pytest.importorskip("matplotlib") +mpl = pytest.importorskip("matplotlib") pytest.importorskip("jinja2") -import matplotlib as mpl - from pandas.io.formats.style import Styler pytestmark = pytest.mark.usefixtures("mpl_cleanup") diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index c4065ea01988f..b53957a7e77d1 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -133,7 +133,13 @@ def test_frame_non_unique_index_raises(self, orient): [[Timestamp("20130101"), 3.5], [Timestamp("20130102"), 4.5]], ], ) - def test_frame_non_unique_columns(self, orient, data): + def test_frame_non_unique_columns(self, orient, data, request): + if isinstance(data[0][0], Timestamp) and orient == "split": + mark = pytest.mark.xfail( + reason="GH#55827 non-nanosecond dt64 fails to round-trip" + ) + request.applymarker(mark) + df = DataFrame(data, index=[1, 2], columns=["x", "x"]) expected_warning = None @@ -141,7 +147,7 @@ def test_frame_non_unique_columns(self, orient, data): "The default 'epoch' date format is deprecated and will be removed " "in a future version, please use 'iso' date format instead." 
) - if df.iloc[:, 0].dtype == "datetime64[ns]": + if df.iloc[:, 0].dtype == "datetime64[s]": expected_warning = FutureWarning with tm.assert_produces_warning(expected_warning, match=msg): @@ -150,7 +156,7 @@ def test_frame_non_unique_columns(self, orient, data): ) if orient == "values": expected = DataFrame(data) - if expected.iloc[:, 0].dtype == "datetime64[ns]": + if expected.iloc[:, 0].dtype == "datetime64[s]": # orient == "values" by default will write Timestamp objects out # in milliseconds; these are internally stored in nanosecond, # so divide to get where we need @@ -856,6 +862,10 @@ def test_date_index_and_values(self, date_format, as_object, date_typ): data.append("a") ser = Series(data, index=data) + if not as_object: + ser = ser.astype("M8[ns]") + if isinstance(ser.index, DatetimeIndex): + ser.index = ser.index.as_unit("ns") expected_warning = None if date_format == "epoch": @@ -897,6 +907,7 @@ def test_convert_dates_infer(self, infer_word): expected = DataFrame( [[1, Timestamp("2002-11-08")], [2, pd.NaT]], columns=["id", infer_word] ) + expected[infer_word] = expected[infer_word].astype("M8[ns]") result = read_json(StringIO(ujson_dumps(data)))[["id", infer_word]] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 485680d9de48c..b665cfba8bdc0 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -22,14 +22,10 @@ from pandas import ( DataFrame, Index, - Timestamp, compat, ) import pandas._testing as tm -from pandas.io.parsers import TextFileReader -from pandas.io.parsers.c_parser_wrapper import CParserWrapper - pytestmark = pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) @@ -38,62 +34,13 @@ skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -def test_override_set_noconvert_columns(): - # see gh-17351 - # - # Usecols needs to be sorted in _set_noconvert_columns based - # on the test_usecols_with_parse_dates test from test_usecols.py - class MyTextFileReader(TextFileReader): - def __init__(self) -> None: - self._currow = 0 - self.squeeze = False - - class MyCParserWrapper(CParserWrapper): - def _set_noconvert_columns(self): - if self.usecols_dtype == "integer": - # self.usecols is a set, which is documented as unordered - # but in practice, a CPython set of integers is sorted. - # In other implementations this assumption does not hold. 
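Context for the dtype changes above: the updated expectations pin datetime results to an explicit resolution instead of assuming nanoseconds. A minimal sketch of the explicit-unit casts the tests rely on — the specific inferred units ("M8[s]" here) are assumptions taken from the updated expectations, not verified API output:

import pandas as pd

# Datetime results are no longer assumed to be nanosecond-resolution; the
# updated tests cast expectations to an explicit unit before comparing.
parsed = pd.to_datetime(pd.Series(["2012-01-01", None]))

expected = parsed.astype("M8[s]")       # pin second resolution for comparison
roundtrip = expected.dt.as_unit("ns")   # convert back when ns is required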
- # The following code simulates a different order, which - # before GH 17351 would cause the wrong columns to be - # converted via the parse_dates parameter - self.usecols = list(self.usecols) - self.usecols.reverse() - return CParserWrapper._set_noconvert_columns(self) - - data = """a,b,c,d,e -0,1,2014-01-01,09:00,4 -0,1,2014-01-02,10:00,4""" - - parse_dates = [[1, 2]] - cols = { - "a": [0, 0], - "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], - } - expected = DataFrame(cols, columns=["c_d", "a"]) - - parser = MyTextFileReader() - parser.options = { - "usecols": [0, 2, 3], - "parse_dates": parse_dates, - "delimiter": ",", - } - parser.engine = "c" - parser._engine = MyCParserWrapper(StringIO(data), **parser.options) - - result = parser.read() - tm.assert_frame_equal(result, expected) - - def test_read_csv_local(all_parsers, csv1): prefix = "file:///" if compat.is_platform_windows() else "file://" parser = all_parsers fname = prefix + str(os.path.abspath(csv1)) result = parser.read_csv(fname, index_col=0, parse_dates=True) - # TODO: make unit check more specific - if parser.engine == "pyarrow": - result.index = result.index.as_unit("ns") + expected = DataFrame( [ [0.980269, 3.685731, -0.364216805298, -1.159738], @@ -115,6 +62,7 @@ def test_read_csv_local(all_parsers, csv1): datetime(2000, 1, 10), datetime(2000, 1, 11), ], + dtype="M8[s]", name="index", ), ) @@ -195,9 +143,6 @@ def test_read_csv_low_memory_no_rows_with_index(all_parsers): def test_read_csv_dataframe(all_parsers, csv1): parser = all_parsers result = parser.read_csv(csv1, index_col=0, parse_dates=True) - # TODO: make unit check more specific - if parser.engine == "pyarrow": - result.index = result.index.as_unit("ns") expected = DataFrame( [ [0.980269, 3.685731, -0.364216805298, -1.159738], @@ -219,6 +164,7 @@ def test_read_csv_dataframe(all_parsers, csv1): datetime(2000, 1, 10), datetime(2000, 1, 11), ], + dtype="M8[s]", name="index", ), ) @@ -473,65 +419,43 @@ def test_read_empty_with_usecols(all_parsers, data, kwargs, expected): @pytest.mark.parametrize( - "kwargs,expected", + "kwargs,expected_data", [ # gh-8661, gh-8679: this should ignore six lines, including # lines with trailing whitespace and blank lines. ( { "header": None, - "delim_whitespace": True, + "sep": r"\s+", "skiprows": [0, 1, 2, 3, 5, 6], "skip_blank_lines": True, }, - DataFrame([[1.0, 2.0, 4.0], [5.1, np.nan, 10.0]]), + [[1.0, 2.0, 4.0], [5.1, np.nan, 10.0]], ), # gh-8983: test skipping set of rows after a row with trailing spaces. 
( { - "delim_whitespace": True, + "sep": r"\s+", "skiprows": [1, 2, 3, 5, 6], "skip_blank_lines": True, }, - DataFrame({"A": [1.0, 5.1], "B": [2.0, np.nan], "C": [4.0, 10]}), + {"A": [1.0, 5.1], "B": [2.0, np.nan], "C": [4.0, 10]}, ), ], ) -def test_trailing_spaces(all_parsers, kwargs, expected): +def test_trailing_spaces(all_parsers, kwargs, expected_data): data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa: E501 parser = all_parsers - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - if parser.engine == "pyarrow": - msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine" - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) + with pytest.raises(ValueError, match="the 'pyarrow' engine does not support"): + parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) return - - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) + expected = DataFrame(expected_data) + result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) tm.assert_frame_equal(result, expected) -def test_raise_on_sep_with_delim_whitespace(all_parsers): - # see gh-6607 - data = "a b c\n1 2 3" - parser = all_parsers - - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - with pytest.raises(ValueError, match="you can only specify one"): - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True) - - def test_read_filepath_or_buffer(all_parsers): # see gh-43366 parser = all_parsers @@ -540,8 +464,7 @@ def test_read_filepath_or_buffer(all_parsers): parser.read_csv(filepath_or_buffer=b"input") -@pytest.mark.parametrize("delim_whitespace", [True, False]) -def test_single_char_leading_whitespace(all_parsers, delim_whitespace): +def test_single_char_leading_whitespace(all_parsers): # see gh-9710 parser = all_parsers data = """\ @@ -551,28 +474,16 @@ def test_single_char_leading_whitespace(all_parsers, delim_whitespace): a b\n""" - expected = DataFrame({"MyColumn": list("abab")}) - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - if parser.engine == "pyarrow": msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine" with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_csv( - StringIO(data), - skipinitialspace=True, - delim_whitespace=delim_whitespace, - ) + parser.read_csv( + StringIO(data), + skipinitialspace=True, + ) return - - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace - ) + expected = DataFrame({"MyColumn": list("abab")}) + result = parser.read_csv(StringIO(data), skipinitialspace=True, sep=r"\s+") tm.assert_frame_equal(result, expected) @@ -815,49 +726,6 @@ def test_read_csv_names_not_accepting_sets(all_parsers): parser.read_csv(StringIO(data), names=set("QAZ")) -def test_read_table_delim_whitespace_default_sep(all_parsers): - # GH: 35958 - f = StringIO("a b c\n1 -2 -3\n4 5 6") - parser = all_parsers 
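The removals around this hunk swap the deprecated `delim_whitespace=True` keyword for an explicit regex separator. A minimal sketch of the migration, with illustrative data:

from io import StringIO
import pandas as pd

data = "a b c\n1 -2 -3\n4 5 6"

# Before (deprecated, now removed):
#     pd.read_csv(StringIO(data), delim_whitespace=True)
# After: an explicit whitespace regex produces the same parse.
df = pd.read_csv(StringIO(data), sep=r"\s+")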
- - depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated" - - if parser.engine == "pyarrow": - msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine" - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_table(f, delim_whitespace=True) - return - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_table(f, delim_whitespace=True) - expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]}) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("delimiter", [",", "\t"]) -def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter): - # GH: 35958 - f = StringIO("a b c\n1 -2 -3\n4 5 6") - parser = all_parsers - msg = ( - "Specified a delimiter with both sep and " - "delim_whitespace=True; you can only specify one." - ) - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - with pytest.raises(ValueError, match=msg): - parser.read_csv(f, delim_whitespace=True, sep=delimiter) - - with pytest.raises(ValueError, match=msg): - parser.read_csv(f, delim_whitespace=True, delimiter=delimiter) - - def test_read_csv_delimiter_and_sep_no_default(all_parsers): # GH#39823 f = StringIO("a,b\n1,2") @@ -883,26 +751,6 @@ def test_read_csv_line_break_as_separator(kwargs, all_parsers): parser.read_csv(StringIO(data), **kwargs) -@pytest.mark.parametrize("delimiter", [",", "\t"]) -def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): - # GH: 35958 - f = StringIO("a b c\n1 -2 -3\n4 5 6") - parser = all_parsers - msg = ( - "Specified a delimiter with both sep and " - "delim_whitespace=True; you can only specify one." 
- ) - depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated" - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - with pytest.raises(ValueError, match=msg): - parser.read_table(f, delim_whitespace=True, sep=delimiter) - - with pytest.raises(ValueError, match=msg): - parser.read_table(f, delim_whitespace=True, delimiter=delimiter) - - @skip_pyarrow def test_dict_keys_as_names(all_parsers): # GH: 36928 diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py index c93c80a7bb084..ba31a9bc15fb5 100644 --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -15,6 +15,7 @@ import numpy as np import pytest +from pandas.compat import WASM from pandas.errors import ( EmptyDataError, ParserError, @@ -80,6 +81,7 @@ def test_path_path_lib(all_parsers): tm.assert_frame_equal(df, result) +@pytest.mark.skipif(WASM, reason="limited file system access on WASM") def test_nonexistent_path(all_parsers): # gh-2428: pls no segfault # gh-14086: raise more helpful FileNotFoundError @@ -93,6 +95,7 @@ def test_nonexistent_path(all_parsers): assert path == e.value.filename +@pytest.mark.skipif(WASM, reason="limited file system access on WASM") @td.skip_if_windows # os.chmod does not work in windows def test_no_permission(all_parsers): # GH 23784 diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py index 2fcc80f58ae30..4cfc12cdc46aa 100644 --- a/pandas/tests/io/parser/common/test_index.py +++ b/pandas/tests/io/parser/common/test_index.py @@ -260,7 +260,8 @@ def test_read_csv_no_index_name(all_parsers, csv_dir_path): datetime(2000, 1, 5), datetime(2000, 1, 6), datetime(2000, 1, 7), - ] + ], + dtype="M8[s]", ), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index bd47e045417ce..ed2e729430b01 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -250,19 +250,17 @@ def test_null_byte_char(request, all_parsers): @pytest.mark.filterwarnings("always::ResourceWarning") -def test_open_file(request, all_parsers): +def test_open_file(all_parsers): # GH 39024 parser = all_parsers msg = "Could not determine delimiter" err = csv.Error if parser.engine == "c": - msg = "the 'c' engine does not support sep=None with delim_whitespace=False" - err = ValueError + msg = "object of type 'NoneType' has no len" + err = TypeError elif parser.engine == "pyarrow": - msg = ( - "the 'pyarrow' engine does not support sep=None with delim_whitespace=False" - ) + msg = "'utf-8' codec can't decode byte 0xe4" err = ValueError with tm.ensure_clean() as path: diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 98a460f221592..39718ca2ec134 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -18,6 +18,7 @@ import numpy as np import pytest +from pandas.compat import WASM from pandas.compat.numpy import np_version_gte1p24 from pandas.errors import ( ParserError, @@ -52,11 +53,7 @@ def test_delim_whitespace_custom_terminator(c_parser_only): data = "a b c~1 2 3~4 5 6~7 8 9" parser = c_parser_only - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - with tm.assert_produces_warning( - FutureWarning, 
match=depr_msg, check_stacklevel=False - ): - df = parser.read_csv(StringIO(data), lineterminator="~", delim_whitespace=True) + df = parser.read_csv(StringIO(data), lineterminator="~", sep=r"\s+") expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]) tm.assert_frame_equal(df, expected) @@ -94,15 +91,16 @@ def test_dtype_and_names_error(c_parser_only): """ # fallback casting, but not castable warning = RuntimeWarning if np_version_gte1p24 else None - with pytest.raises(ValueError, match="cannot safely convert"): - with tm.assert_produces_warning(warning, check_stacklevel=False): - parser.read_csv( - StringIO(data), - sep=r"\s+", - header=None, - names=["a", "b"], - dtype={"a": np.int32}, - ) + if not WASM: # no fp exception support in wasm + with pytest.raises(ValueError, match="cannot safely convert"): + with tm.assert_produces_warning(warning, check_stacklevel=False): + parser.read_csv( + StringIO(data), + sep=r"\s+", + header=None, + names=["a", "b"], + dtype={"a": np.int32}, + ) @pytest.mark.parametrize( @@ -550,6 +548,7 @@ def test_chunk_whitespace_on_boundary(c_parser_only): tm.assert_frame_equal(result, expected) +@pytest.mark.skipif(WASM, reason="limited file system access on WASM") def test_file_handles_mmap(c_parser_only, csv1): # gh-14418 # diff --git a/pandas/tests/io/parser/test_comment.py b/pandas/tests/io/parser/test_comment.py index ca8df520b171e..ba27b170aecdc 100644 --- a/pandas/tests/io/parser/test_comment.py +++ b/pandas/tests/io/parser/test_comment.py @@ -31,10 +31,8 @@ def test_comment(all_parsers, na_values): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "read_kwargs", [{}, {"lineterminator": "*"}, {"delim_whitespace": True}] -) -def test_line_comment(all_parsers, read_kwargs, request): +@pytest.mark.parametrize("read_kwargs", [{}, {"lineterminator": "*"}, {"sep": r"\s+"}]) +def test_line_comment(all_parsers, read_kwargs): parser = all_parsers data = """# empty A,B,C @@ -42,12 +40,8 @@ def test_line_comment(all_parsers, read_kwargs, request): #ignore this line 5.,NaN,10.0 """ - warn = None - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - - if read_kwargs.get("delim_whitespace"): + if read_kwargs.get("sep"): data = data.replace(",", " ") - warn = FutureWarning elif read_kwargs.get("lineterminator"): data = data.replace("\n", read_kwargs.get("lineterminator")) @@ -60,23 +54,15 @@ def test_line_comment(all_parsers, read_kwargs, request): else: msg = "The 'comment' option is not supported with the 'pyarrow' engine" with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning( - warn, match=depr_msg, check_stacklevel=False - ): - parser.read_csv(StringIO(data), **read_kwargs) + parser.read_csv(StringIO(data), **read_kwargs) return elif parser.engine == "python" and read_kwargs.get("lineterminator"): msg = r"Custom line terminators not supported in python parser \(yet\)" with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning( - warn, match=depr_msg, check_stacklevel=False - ): - parser.read_csv(StringIO(data), **read_kwargs) + parser.read_csv(StringIO(data), **read_kwargs) return - with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False): - result = parser.read_csv(StringIO(data), **read_kwargs) - + result = parser.read_csv(StringIO(data), **read_kwargs) expected = DataFrame( [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] ) diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 
85ce55b3bcf83..b7e3a13ec28b8 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -682,7 +682,7 @@ def test_header_missing_rows(all_parsers): parser.read_csv(StringIO(data), header=[0, 1, 2]) -# ValueError: The 'delim_whitespace' option is not supported with the 'pyarrow' engine +# ValueError: the 'pyarrow' engine does not support regex separators @xfail_pyarrow def test_header_multiple_whitespaces(all_parsers): # GH#54931 @@ -695,7 +695,7 @@ def test_header_multiple_whitespaces(all_parsers): tm.assert_frame_equal(result, expected) -# ValueError: The 'delim_whitespace' option is not supported with the 'pyarrow' engine +# ValueError: the 'pyarrow' engine does not support regex separators @xfail_pyarrow def test_header_delim_whitespace(all_parsers): # GH#54918 @@ -704,12 +704,7 @@ def test_header_delim_whitespace(all_parsers): 1,2 3,4 """ - - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv(StringIO(data), delim_whitespace=True) + result = parser.read_csv(StringIO(data), sep=r"\s+") expected = DataFrame({"a,b": ["1,2", "3,4"]}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_multi_thread.py b/pandas/tests/io/parser/test_multi_thread.py index 7fac67df44ca2..348c19ac0f0c6 100644 --- a/pandas/tests/io/parser/test_multi_thread.py +++ b/pandas/tests/io/parser/test_multi_thread.py @@ -13,6 +13,7 @@ import pandas as pd from pandas import DataFrame import pandas._testing as tm +from pandas.util.version import Version xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") @@ -24,10 +25,16 @@ ] -@xfail_pyarrow # ValueError: Found non-unique column index -def test_multi_thread_string_io_read_csv(all_parsers): +@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") +def test_multi_thread_string_io_read_csv(all_parsers, request): # see gh-11786 parser = all_parsers + if parser.engine == "pyarrow": + pa = pytest.importorskip("pyarrow") + if Version(pa.__version__) < Version("16.0"): + request.applymarker( + pytest.mark.xfail(reason="# ValueError: Found non-unique column index") + ) max_row_range = 100 num_files = 10 @@ -145,7 +152,8 @@ def test_multi_thread_path_multipart_read_csv(all_parsers): with tm.ensure_clean(file_name) as path: df.to_csv(path) - final_dataframe = _generate_multi_thread_dataframe( - parser, path, num_rows, num_tasks - ) - tm.assert_frame_equal(df, final_dataframe) + result = _generate_multi_thread_dataframe(parser, path, num_rows, num_tasks) + + expected = df[:] + expected["date"] = expected["date"].astype("M8[s]") + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 8968948df5fa9..e9c6c0f5e32d7 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -4,20 +4,16 @@ """ from datetime import ( - date, datetime, timedelta, timezone, ) from io import StringIO -from dateutil.parser import parse as du_parse import numpy as np import pytest import pytz -from pandas._libs.tslibs import parsing - import pandas as pd from pandas import ( DataFrame, @@ -41,414 +37,6 @@ skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -@xfail_pyarrow -def test_read_csv_with_custom_date_parser(all_parsers): - # GH36111 - def __custom_date_parser(time): - time = time.astype(np.float64) - time = time.astype(int) # 
convert float seconds to int type - return pd.to_timedelta(time, unit="s") - - testdata = StringIO( - """time e n h - 41047.00 -98573.7297 871458.0640 389.0089 - 41048.00 -98573.7299 871458.0640 389.0089 - 41049.00 -98573.7300 871458.0642 389.0088 - 41050.00 -98573.7299 871458.0643 389.0088 - 41051.00 -98573.7302 871458.0640 389.0086 - """ - ) - result = all_parsers.read_csv_check_warnings( - FutureWarning, - "Please use 'date_format' instead", - testdata, - delim_whitespace=True, - parse_dates=True, - date_parser=__custom_date_parser, - index_col="time", - ) - time = [41047, 41048, 41049, 41050, 41051] - time = pd.TimedeltaIndex([pd.to_timedelta(i, unit="s") for i in time], name="time") - expected = DataFrame( - { - "e": [-98573.7297, -98573.7299, -98573.7300, -98573.7299, -98573.7302], - "n": [871458.0640, 871458.0640, 871458.0642, 871458.0643, 871458.0640], - "h": [389.0089, 389.0089, 389.0088, 389.0088, 389.0086], - }, - index=time, - ) - - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_read_csv_with_custom_date_parser_parse_dates_false(all_parsers): - # GH44366 - def __custom_date_parser(time): - time = time.astype(np.float64) - time = time.astype(int) # convert float seconds to int type - return pd.to_timedelta(time, unit="s") - - testdata = StringIO( - """time e - 41047.00 -93.77 - 41048.00 -95.79 - 41049.00 -98.73 - 41050.00 -93.99 - 41051.00 -97.72 - """ - ) - result = all_parsers.read_csv_check_warnings( - FutureWarning, - "Please use 'date_format' instead", - testdata, - delim_whitespace=True, - parse_dates=False, - date_parser=__custom_date_parser, - index_col="time", - ) - time = Series([41047.00, 41048.00, 41049.00, 41050.00, 41051.00], name="time") - expected = DataFrame( - {"e": [-93.77, -95.79, -98.73, -93.99, -97.72]}, - index=time, - ) - - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_separator_date_conflict(all_parsers): - # Regression test for gh-4678 - # - # Make sure thousands separator and - # date parsing do not conflict. - parser = all_parsers - data = "06-02-2013;13:00;1-000.215" - expected = DataFrame( - [[datetime(2013, 6, 2, 13, 0, 0), 1000.215]], columns=["Date", 2] - ) - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - df = parser.read_csv( - StringIO(data), - sep=";", - thousands="-", - parse_dates={"Date": [0, 1]}, - header=None, - ) - tm.assert_frame_equal(df, expected) - - -@pytest.mark.parametrize("keep_date_col", [True, False]) -def test_multiple_date_col_custom(all_parsers, keep_date_col, request): - data = """\ -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - parser = all_parsers - - if keep_date_col and parser.engine == "pyarrow": - # For this to pass, we need to disable auto-inference on the date columns - # in parse_dates. We have no way of doing this though - mark = pytest.mark.xfail( - reason="pyarrow doesn't support disabling auto-inference on column numbers." 
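The deleted tests above exercised the removed `date_parser` keyword with a custom seconds-to-timedelta parser. A minimal sketch of the replacement pattern (made-up data; the conversion step mirrors what the deleted custom parser did): read the column as plain numbers, then convert after parsing.

from io import StringIO
import pandas as pd

data = "time,e\n41047.00,-93.77\n41048.00,-95.79"

# date_parser is gone; convert the raw column after reading instead.
df = pd.read_csv(StringIO(data))
df.index = pd.to_timedelta(df.pop("time").astype(int), unit="s")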
- ) - request.applymarker(mark) - - def date_parser(*date_cols): - """ - Test date parser. - - Parameters - ---------- - date_cols : args - The list of data columns to parse. - - Returns - ------- - parsed : Series - """ - return parsing.try_parse_dates( - parsing.concat_date_cols(date_cols), parser=du_parse - ) - - kwds = { - "header": None, - "date_parser": date_parser, - "parse_dates": {"actual": [1, 2], "nominal": [1, 3]}, - "keep_date_col": keep_date_col, - "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"], - } - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - **kwds, - raise_on_extra_warnings=False, - ) - - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - datetime(1999, 1, 27, 18, 56), - "KORD", - "19990127", - " 19:00:00", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 20, 0), - datetime(1999, 1, 27, 19, 56), - "KORD", - "19990127", - " 20:00:00", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 20, 56), - "KORD", - "19990127", - " 21:00:00", - " 20:56:00", - -0.59, - 2.21, - 5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 21, 18), - "KORD", - "19990127", - " 21:00:00", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - datetime(1999, 1, 27, 21, 56), - "KORD", - "19990127", - " 22:00:00", - " 21:56:00", - -0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - datetime(1999, 1, 27, 22, 56), - "KORD", - "19990127", - " 23:00:00", - " 22:56:00", - -0.59, - 1.71, - 4.6, - 0.0, - 280.0, - ], - ], - columns=[ - "actual", - "nominal", - "X0", - "X1", - "X2", - "X3", - "X4", - "X5", - "X6", - "X7", - "X8", - ], - ) - - if not keep_date_col: - expected = expected.drop(["X1", "X2", "X3"], axis=1) - - # Python can sometimes be flaky about how - # the aggregated columns are entered, so - # this standardizes the order. - result = result[expected.columns] - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("container", [list, tuple, Index, Series]) -@pytest.mark.parametrize("dim", [1, 2]) -def test_concat_date_col_fail(container, dim): - msg = "not all elements from date_cols are numpy arrays" - value = "19990127" - - date_cols = tuple(container([value]) for _ in range(dim)) - - with pytest.raises(ValueError, match=msg): - parsing.concat_date_cols(date_cols) - - -@pytest.mark.parametrize("keep_date_col", [True, False]) -def test_multiple_date_col(all_parsers, keep_date_col, request): - data = """\ -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - parser = all_parsers - - if keep_date_col and parser.engine == "pyarrow": - # For this to pass, we need to disable auto-inference on the date columns - # in parse_dates. We have no way of doing this though - mark = pytest.mark.xfail( - reason="pyarrow doesn't support disabling auto-inference on column numbers." 
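The nested-sequence form of `parse_dates` (e.g. `{"actual": [1, 2]}`) that these deleted tests covered is deprecated; the equivalent is to combine the string columns after reading. A minimal sketch with made-up data (the `format` string is an assumption matching this sample):

from io import StringIO
import pandas as pd

data = "station,date,time,val\nKORD,19990127,19:00:00,0.81"

df = pd.read_csv(StringIO(data), dtype={"date": str})
df["actual"] = pd.to_datetime(
    df["date"] + " " + df["time"], format="%Y%m%d %H:%M:%S"
)
df = df.drop(columns=["date", "time"])  # the keep_date_col=False equivalent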
- ) - request.applymarker(mark) - - depr_msg = "The 'keep_date_col' keyword in pd.read_csv is deprecated" - - kwds = { - "header": None, - "parse_dates": [[1, 2], [1, 3]], - "keep_date_col": keep_date_col, - "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"], - } - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv(StringIO(data), **kwds) - - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - datetime(1999, 1, 27, 18, 56), - "KORD", - "19990127", - " 19:00:00", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 20, 0), - datetime(1999, 1, 27, 19, 56), - "KORD", - "19990127", - " 20:00:00", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 20, 56), - "KORD", - "19990127", - " 21:00:00", - " 20:56:00", - -0.59, - 2.21, - 5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 21, 18), - "KORD", - "19990127", - " 21:00:00", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - datetime(1999, 1, 27, 21, 56), - "KORD", - "19990127", - " 22:00:00", - " 21:56:00", - -0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - datetime(1999, 1, 27, 22, 56), - "KORD", - "19990127", - " 23:00:00", - " 22:56:00", - -0.59, - 1.71, - 4.6, - 0.0, - 280.0, - ], - ], - columns=[ - "X1_X2", - "X1_X3", - "X0", - "X1", - "X2", - "X3", - "X4", - "X5", - "X6", - "X7", - "X8", - ], - ) - - if not keep_date_col: - expected = expected.drop(["X1", "X2", "X3"], axis=1) - - tm.assert_frame_equal(result, expected) - - def test_date_col_as_index_col(all_parsers): data = """\ KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 @@ -474,6 +62,7 @@ def test_date_col_as_index_col(all_parsers): datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 22, 0), ], + dtype="M8[s]", name="X1", ) expected = DataFrame( @@ -495,299 +84,6 @@ def test_date_col_as_index_col(all_parsers): tm.assert_frame_equal(result, expected) -def test_multiple_date_cols_int_cast(all_parsers): - data = ( - "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900" - ) - parse_dates = {"actual": [1, 2], "nominal": [1, 3]} - parser = all_parsers - - kwds = { - "header": None, - "parse_dates": parse_dates, - "date_parser": pd.to_datetime, - } - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - **kwds, - raise_on_extra_warnings=False, - ) - - expected = DataFrame( - [ - [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), "KORD", 0.81], - [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), "KORD", 0.01], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 20, 56), - "KORD", - -0.59, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 21, 18), - "KORD", - -0.99, - ], - [ - datetime(1999, 1, 27, 22, 0), - datetime(1999, 1, 27, 21, 56), - "KORD", - -0.59, - ], - [ - datetime(1999, 1, 27, 23, 0), - datetime(1999, 1, 27, 22, 56), - "KORD", - -0.59, - ], - ], - columns=["actual", "nominal", 0, 4], - ) - - # Python can sometimes be flaky about how - # the aggregated columns are entered, so - # this standardizes the order. 
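Several of the rewritten tests in this file use the runtime-xfail idiom seen above (`request.applymarker`) to flag a single engine rather than skipping the whole test. A self-contained sketch of the pattern, with a hypothetical fixture and reason:

import pytest

@pytest.fixture(params=["c", "python", "pyarrow"])
def engine(request):
    return request.param

def test_engine_specific(engine, request):
    # Mark a known failure for one parametrization at runtime.
    if engine == "pyarrow":
        request.applymarker(pytest.mark.xfail(reason="illustrative xfail"))
    assert engine != "pyarrow"  # fails only under pyarrow, which is xfailed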
- result = result[expected.columns] - tm.assert_frame_equal(result, expected) - - -def test_multiple_date_col_timestamp_parse(all_parsers): - parser = all_parsers - data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 -05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25""" - - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - parse_dates=[[0, 1]], - header=None, - date_parser=Timestamp, - raise_on_extra_warnings=False, - ) - expected = DataFrame( - [ - [ - Timestamp("05/31/2012, 15:30:00.029"), - 1306.25, - 1, - "E", - 0, - np.nan, - 1306.25, - ], - [ - Timestamp("05/31/2012, 15:30:00.029"), - 1306.25, - 8, - "E", - 0, - np.nan, - 1306.25, - ], - ], - columns=["0_1", 2, 3, 4, 5, 6, 7], - ) - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_multiple_date_cols_with_header(all_parsers): - parser = all_parsers - data = """\ -ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}) - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - "KORD", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 20, 0), - "KORD", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD", - " 20:56:00", - -0.59, - 2.21, - 5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - "KORD", - " 21:56:00", - -0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - "KORD", - " 22:56:00", - -0.59, - 1.71, - 4.6, - 0.0, - 280.0, - ], - ], - columns=[ - "nominal", - "ID", - "ActualTime", - "TDew", - "TAir", - "Windspeed", - "Precip", - "WindDir", - ], - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "data,parse_dates,msg", - [ - ( - """\ -date_NominalTime,date,NominalTime -KORD1,19990127, 19:00:00 -KORD2,19990127, 20:00:00""", - [[1, 2]], - ("New date column already in dict date_NominalTime"), - ), - ( - """\ -ID,date,nominalTime -KORD,19990127, 19:00:00 -KORD,19990127, 20:00:00""", - {"ID": [1, 2]}, - "Date column ID already in dict", - ), - ], -) -def test_multiple_date_col_name_collision(all_parsers, data, parse_dates, msg): - parser = all_parsers - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_csv(StringIO(data), parse_dates=parse_dates) - - -def test_date_parser_int_bug(all_parsers): - # see gh-3071 - parser = all_parsers - data = ( - "posix_timestamp,elapsed,sys,user,queries,query_time,rows," - "accountid,userid,contactid,level,silo,method\n" - 
"1343103150,0.062353,0,4,6,0.01690,3," - "12345,1,-1,3,invoice_InvoiceResource,search\n" - ) - - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - index_col=0, - parse_dates=[0], - # Note: we must pass tz and then drop the tz attribute - # (if we don't CI will flake out depending on the runner's local time) - date_parser=lambda x: datetime.fromtimestamp(int(x), tz=timezone.utc).replace( - tzinfo=None - ), - raise_on_extra_warnings=False, - ) - expected = DataFrame( - [ - [ - 0.062353, - 0, - 4, - 6, - 0.01690, - 3, - 12345, - 1, - -1, - 3, - "invoice_InvoiceResource", - "search", - ] - ], - columns=[ - "elapsed", - "sys", - "user", - "queries", - "query_time", - "rows", - "accountid", - "userid", - "contactid", - "level", - "silo", - "method", - ], - index=Index([Timestamp("2012-07-24 04:12:30")], name="posix_timestamp"), - ) - tm.assert_frame_equal(result, expected) - - @xfail_pyarrow def test_nat_parse(all_parsers): # see gh-3062 @@ -795,7 +91,7 @@ def test_nat_parse(all_parsers): df = DataFrame( { "A": np.arange(10, dtype="float64"), - "B": Timestamp("20010101").as_unit("ns"), + "B": Timestamp("20010101"), } ) df.iloc[3:6, :] = np.nan @@ -807,26 +103,6 @@ def test_nat_parse(all_parsers): tm.assert_frame_equal(result, df) -@skip_pyarrow -def test_csv_custom_parser(all_parsers): - data = """A,B,C -20090101,a,1,2 -20090102,b,3,4 -20090103,c,4,5 -""" - parser = all_parsers - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - date_parser=lambda x: datetime.strptime(x, "%Y%m%d"), - ) - expected = parser.read_csv(StringIO(data), parse_dates=True) - tm.assert_frame_equal(result, expected) - result = parser.read_csv(StringIO(data), date_format="%Y%m%d") - tm.assert_frame_equal(result, expected) - - @skip_pyarrow def test_parse_dates_implicit_first_col(all_parsers): data = """A,B,C @@ -851,7 +127,7 @@ def test_parse_dates_string(all_parsers): parser = all_parsers result = parser.read_csv(StringIO(data), index_col="date", parse_dates=["date"]) # freq doesn't round-trip - index = date_range("1/1/2009", periods=3, name="date")._with_freq(None) + index = date_range("1/1/2009", periods=3, name="date", unit="s")._with_freq(None) expected = DataFrame( {"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]}, index=index @@ -859,37 +135,6 @@ def test_parse_dates_string(all_parsers): tm.assert_frame_equal(result, expected) -# Bug in https://github.com/dateutil/dateutil/issues/217 -# has been addressed, but we just don't pass in the `yearfirst` -@pytest.mark.xfail(reason="yearfirst is not surfaced in read_*") -@pytest.mark.parametrize("parse_dates", [[["date", "time"]], [[0, 1]]]) -def test_yy_format_with_year_first(all_parsers, parse_dates): - data = """date,time,B,C -090131,0010,1,2 -090228,1020,3,4 -090331,0830,5,6 -""" - parser = all_parsers - result = parser.read_csv_check_warnings( - UserWarning, - "Could not infer format", - StringIO(data), - index_col=0, - parse_dates=parse_dates, - ) - index = DatetimeIndex( - [ - datetime(2009, 1, 31, 0, 10, 0), - datetime(2009, 2, 28, 10, 20, 0), - datetime(2009, 3, 31, 8, 30, 0), - ], - dtype=object, - name="date_time", - ) - expected = DataFrame({"B": [1, 3, 5], "C": [2, 4, 6]}, index=index) - tm.assert_frame_equal(result, expected) - - @xfail_pyarrow @pytest.mark.parametrize("parse_dates", [[0, 2], ["a", "c"]]) def test_parse_dates_column_list(all_parsers, parse_dates): @@ -899,6 +144,8 @@ def test_parse_dates_column_list(all_parsers, 
parse_dates): expected = DataFrame( {"a": [datetime(2010, 1, 1)], "b": [1], "c": [datetime(2010, 2, 15)]} ) + expected["a"] = expected["a"].astype("M8[s]") + expected["c"] = expected["c"].astype("M8[s]") expected = expected.set_index(["a", "b"]) result = parser.read_csv( @@ -922,9 +169,10 @@ def test_multi_index_parse_dates(all_parsers, index_col): 20090103,three,c,4,5 """ parser = all_parsers + dti = date_range("2009-01-01", periods=3, freq="D", unit="s") index = MultiIndex.from_product( [ - (datetime(2009, 1, 1), datetime(2009, 1, 2), datetime(2009, 1, 3)), + dti, ("one", "two", "three"), ], names=["index1", "index2"], @@ -959,62 +207,12 @@ def test_multi_index_parse_dates(all_parsers, index_col): tm.assert_frame_equal(result, expected) -@xfail_pyarrow -@pytest.mark.parametrize("kwargs", [{"dayfirst": True}, {"day_first": True}]) -def test_parse_dates_custom_euro_format(all_parsers, kwargs): - parser = all_parsers - data = """foo,bar,baz -31/01/2010,1,2 -01/02/2010,1,NA -02/02/2010,1,2 -""" - if "dayfirst" in kwargs: - df = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - names=["time", "Q", "NTU"], - date_parser=lambda d: du_parse(d, **kwargs), - header=0, - index_col=0, - parse_dates=True, - na_values=["NA"], - ) - exp_index = Index( - [datetime(2010, 1, 31), datetime(2010, 2, 1), datetime(2010, 2, 2)], - name="time", - ) - expected = DataFrame( - {"Q": [1, 1, 1], "NTU": [2, np.nan, 2]}, - index=exp_index, - columns=["Q", "NTU"], - ) - tm.assert_frame_equal(df, expected) - else: - msg = "got an unexpected keyword argument 'day_first'" - with pytest.raises(TypeError, match=msg): - parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - names=["time", "Q", "NTU"], - date_parser=lambda d: du_parse(d, **kwargs), - skiprows=[0], - index_col=0, - parse_dates=True, - na_values=["NA"], - ) - - def test_parse_tz_aware(all_parsers): # See gh-1693 parser = all_parsers data = "Date,x\n2012-06-13T01:39:00Z,0.5" result = parser.read_csv(StringIO(data), index_col=0, parse_dates=True) - # TODO: make unit check more specific - if parser.engine == "pyarrow": - result.index = result.index.as_unit("ns") expected = DataFrame( {"x": [0.5]}, index=Index([Timestamp("2012-06-13 01:39:00+00:00")], name="Date") ) @@ -1026,282 +224,11 @@ def test_parse_tz_aware(all_parsers): assert result.index.tz is expected_tz -@xfail_pyarrow -@pytest.mark.parametrize( - "parse_dates,index_col", - [({"nominal": [1, 2]}, "nominal"), ({"nominal": [1, 2]}, 0), ([[1, 2]], 0)], -) -def test_multiple_date_cols_index(all_parsers, parse_dates, index_col): - parser = all_parsers - data = """ -ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir -KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - "KORD1", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 20, 0), - "KORD2", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD3", - " 20:56:00", - -0.59, - 2.21, - 
5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD4", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - "KORD5", - " 21:56:00", - -0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - "KORD6", - " 22:56:00", - -0.59, - 1.71, - 4.6, - 0.0, - 280.0, - ], - ], - columns=[ - "nominal", - "ID", - "ActualTime", - "TDew", - "TAir", - "Windspeed", - "Precip", - "WindDir", - ], - ) - expected = expected.set_index("nominal") - - if not isinstance(parse_dates, dict): - expected.index.name = "date_NominalTime" - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), parse_dates=parse_dates, index_col=index_col - ) - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_multiple_date_cols_chunked(all_parsers): - parser = all_parsers - data = """\ -ID,date,nominalTime,actualTime,A,B,C,D,E -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - "KORD", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 20, 0), - "KORD", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD", - " 20:56:00", - -0.59, - 2.21, - 5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - "KORD", - " 21:56:00", - -0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - "KORD", - " 22:56:00", - -0.59, - 1.71, - 4.6, - 0.0, - 280.0, - ], - ], - columns=["nominal", "ID", "actualTime", "A", "B", "C", "D", "E"], - ) - expected = expected.set_index("nominal") - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - with parser.read_csv( - StringIO(data), - parse_dates={"nominal": [1, 2]}, - index_col="nominal", - chunksize=2, - ) as reader: - chunks = list(reader) - - tm.assert_frame_equal(chunks[0], expected[:2]) - tm.assert_frame_equal(chunks[1], expected[2:4]) - tm.assert_frame_equal(chunks[2], expected[4:]) - - -def test_multiple_date_col_named_index_compat(all_parsers): - parser = all_parsers - data = """\ -ID,date,nominalTime,actualTime,A,B,C,D,E -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) 
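The deleted chunked-reading test combined two date columns via the nested `parse_dates` dict; the same result can be produced per chunk with plain columns. A minimal sketch under that assumption, with made-up data:

from io import StringIO
import pandas as pd

data = "date,time,a\n19990127,19:00:00,0.81\n19990127,20:00:00,0.01"

chunks = []
with pd.read_csv(StringIO(data), dtype={"date": str}, chunksize=1) as reader:
    for chunk in reader:
        # Combine the columns after parsing, once per chunk.
        chunk["nominal"] = pd.to_datetime(
            chunk["date"] + " " + chunk["time"], format="%Y%m%d %H:%M:%S"
        )
        chunks.append(chunk.set_index("nominal"))
result = pd.concat(chunks)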
- with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - with_indices = parser.read_csv( - StringIO(data), parse_dates={"nominal": [1, 2]}, index_col="nominal" - ) - - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - with_names = parser.read_csv( - StringIO(data), - index_col="nominal", - parse_dates={"nominal": ["date", "nominalTime"]}, - ) - tm.assert_frame_equal(with_indices, with_names) - - -def test_multiple_date_col_multiple_index_compat(all_parsers): - parser = all_parsers - data = """\ -ID,date,nominalTime,actualTime,A,B,C,D,E -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), index_col=["nominal", "ID"], parse_dates={"nominal": [1, 2]} - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - expected = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}) - - expected = expected.set_index(["nominal", "ID"]) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("kwargs", [{}, {"index_col": "C"}]) def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs): # see gh-5636 parser = all_parsers - msg = ( - "Only booleans, lists, and dictionaries " - "are accepted for the 'parse_dates' parameter" - ) + msg = "Only booleans and lists " "are accepted for the 'parse_dates' parameter" data = """A,B,C 1,2,2003-11-1""" @@ -1312,15 +239,12 @@ def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs): @pytest.mark.parametrize("parse_dates", [(1,), np.array([4, 5]), {1, 3}]) def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates): parser = all_parsers - msg = ( - "Only booleans, lists, and dictionaries " - "are accepted for the 'parse_dates' parameter" - ) + msg = "Only booleans and lists " "are accepted for the 'parse_dates' parameter" data = """A,B,C 1,2,2003-11-1""" with pytest.raises(TypeError, match=msg): - parser.read_csv(StringIO(data), parse_dates=(1,)) + parser.read_csv(StringIO(data), parse_dates=parse_dates) @pytest.mark.parametrize("value", ["nan", ""]) @@ -1370,7 +294,6 @@ def test_bad_date_parse_with_warning(all_parsers, cache, value): ) -@xfail_pyarrow def test_parse_dates_empty_string(all_parsers): # see gh-2263 parser = all_parsers @@ -1380,48 +303,10 @@ def test_parse_dates_empty_string(all_parsers): expected = DataFrame( [[datetime(2012, 1, 1), 1], [pd.NaT, 2]], columns=["Date", "test"] ) + expected["Date"] = expected["Date"].astype("M8[s]") tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "reader", ["read_csv_check_warnings", "read_table_check_warnings"] -) -def test_parse_dates_infer_datetime_format_warning(all_parsers, reader): - # GH 49024, 51017 - parser = all_parsers - data = "Date,test\n2012-01-01,1\n,2" - - getattr(parser, reader)( - FutureWarning, - "The argument 'infer_datetime_format' is deprecated", - 
StringIO(data), - parse_dates=["Date"], - infer_datetime_format=True, - sep=",", - raise_on_extra_warnings=False, - ) - - -@pytest.mark.parametrize( - "reader", ["read_csv_check_warnings", "read_table_check_warnings"] -) -def test_parse_dates_date_parser_and_date_format(all_parsers, reader): - # GH 50601 - parser = all_parsers - data = "Date,test\n2012-01-01,1\n,2" - msg = "Cannot use both 'date_parser' and 'date_format'" - with pytest.raises(TypeError, match=msg): - getattr(parser, reader)( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - parse_dates=["Date"], - date_parser=pd.to_datetime, - date_format="ISO8601", - sep=",", - ) - - @xfail_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", @@ -1429,18 +314,22 @@ def test_parse_dates_date_parser_and_date_format(all_parsers, reader): ( "a\n04.15.2016", {"parse_dates": ["a"]}, - DataFrame([datetime(2016, 4, 15)], columns=["a"]), + DataFrame([datetime(2016, 4, 15)], columns=["a"], dtype="M8[s]"), ), ( "a\n04.15.2016", {"parse_dates": True, "index_col": 0}, - DataFrame(index=DatetimeIndex(["2016-04-15"], name="a"), columns=[]), + DataFrame( + index=DatetimeIndex(["2016-04-15"], dtype="M8[s]", name="a"), columns=[] + ), ), ( "a,b\n04.15.2016,09.16.2013", {"parse_dates": ["a", "b"]}, DataFrame( - [[datetime(2016, 4, 15), datetime(2013, 9, 16)]], columns=["a", "b"] + [[datetime(2016, 4, 15), datetime(2013, 9, 16)]], + dtype="M8[s]", + columns=["a", "b"], ), ), ( @@ -1448,7 +337,13 @@ def test_parse_dates_date_parser_and_date_format(all_parsers, reader): {"parse_dates": True, "index_col": [0, 1]}, DataFrame( index=MultiIndex.from_tuples( - [(datetime(2016, 4, 15), datetime(2013, 9, 16))], names=["a", "b"] + [ + ( + Timestamp(2016, 4, 15).as_unit("s"), + Timestamp(2013, 9, 16).as_unit("s"), + ) + ], + names=["a", "b"], ), columns=[], ), @@ -1463,279 +358,6 @@ def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow -def test_parse_date_time_multi_level_column_name(all_parsers): - data = """\ -D,T,A,B -date, time,a,b -2001-01-05, 09:00:00, 0.0, 10. -2001-01-06, 00:00:00, 1.0, 11. -""" - parser = all_parsers - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - header=[0, 1], - parse_dates={"date_time": [0, 1]}, - date_parser=pd.to_datetime, - ) - - expected_data = [ - [datetime(2001, 1, 5, 9, 0, 0), 0.0, 10.0], - [datetime(2001, 1, 6, 0, 0, 0), 1.0, 11.0], - ] - expected = DataFrame(expected_data, columns=["date_time", ("A", "a"), ("B", "b")]) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "data,kwargs,expected", - [ - ( - """\ -date,time,a,b -2001-01-05, 10:00:00, 0.0, 10. -2001-01-05, 00:00:00, 1., 11. 
-""", - {"header": 0, "parse_dates": {"date_time": [0, 1]}}, - DataFrame( - [ - [datetime(2001, 1, 5, 10, 0, 0), 0.0, 10], - [datetime(2001, 1, 5, 0, 0, 0), 1.0, 11.0], - ], - columns=["date_time", "a", "b"], - ), - ), - ( - ( - "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900" - ), - {"header": None, "parse_dates": {"actual": [1, 2], "nominal": [1, 3]}}, - DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - datetime(1999, 1, 27, 18, 56), - "KORD", - 0.81, - ], - [ - datetime(1999, 1, 27, 20, 0), - datetime(1999, 1, 27, 19, 56), - "KORD", - 0.01, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 20, 56), - "KORD", - -0.59, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 21, 18), - "KORD", - -0.99, - ], - [ - datetime(1999, 1, 27, 22, 0), - datetime(1999, 1, 27, 21, 56), - "KORD", - -0.59, - ], - [ - datetime(1999, 1, 27, 23, 0), - datetime(1999, 1, 27, 22, 56), - "KORD", - -0.59, - ], - ], - columns=["actual", "nominal", 0, 4], - ), - ), - ], -) -def test_parse_date_time(all_parsers, data, kwargs, expected): - parser = all_parsers - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - date_parser=pd.to_datetime, - **kwargs, - raise_on_extra_warnings=False, - ) - - # Python can sometimes be flaky about how - # the aggregated columns are entered, so - # this standardizes the order. - result = result[expected.columns] - tm.assert_frame_equal(result, expected) - - -def test_parse_date_fields(all_parsers): - parser = all_parsers - data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - header=0, - parse_dates={"ymd": [0, 1, 2]}, - date_parser=lambda x: x, - raise_on_extra_warnings=False, - ) - - expected = DataFrame( - [[datetime(2001, 1, 10), 10.0], [datetime(2001, 2, 1), 11.0]], - columns=["ymd", "a"], - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - ("key", "value", "warn"), - [ - ( - "date_parser", - lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S"), - FutureWarning, - ), - ("date_format", "%Y %m %d %H %M %S", None), - ], -) -def test_parse_date_all_fields(all_parsers, key, value, warn): - parser = all_parsers - data = """\ -year,month,day,hour,minute,second,a,b -2001,01,05,10,00,0,0.0,10. -2001,01,5,10,0,00,1.,11. -""" - result = parser.read_csv_check_warnings( - warn, - "use 'date_format' instead", - StringIO(data), - header=0, - parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, - **{key: value}, - raise_on_extra_warnings=False, - ) - expected = DataFrame( - [ - [datetime(2001, 1, 5, 10, 0, 0), 0.0, 10.0], - [datetime(2001, 1, 5, 10, 0, 0), 1.0, 11.0], - ], - columns=["ymdHMS", "a", "b"], - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - ("key", "value", "warn"), - [ - ( - "date_parser", - lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S.%f"), - FutureWarning, - ), - ("date_format", "%Y %m %d %H %M %S.%f", None), - ], -) -def test_datetime_fractional_seconds(all_parsers, key, value, warn): - parser = all_parsers - data = """\ -year,month,day,hour,minute,second,a,b -2001,01,05,10,00,0.123456,0.0,10. -2001,01,5,10,0,0.500000,1.,11. 
-""" - result = parser.read_csv_check_warnings( - warn, - "use 'date_format' instead", - StringIO(data), - header=0, - parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, - **{key: value}, - raise_on_extra_warnings=False, - ) - expected = DataFrame( - [ - [datetime(2001, 1, 5, 10, 0, 0, microsecond=123456), 0.0, 10.0], - [datetime(2001, 1, 5, 10, 0, 0, microsecond=500000), 1.0, 11.0], - ], - columns=["ymdHMS", "a", "b"], - ) - tm.assert_frame_equal(result, expected) - - -def test_generic(all_parsers): - parser = all_parsers - data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." - - def parse_function(yy, mm): - return [date(year=int(y), month=int(m), day=1) for y, m in zip(yy, mm)] - - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - header=0, - parse_dates={"ym": [0, 1]}, - date_parser=parse_function, - raise_on_extra_warnings=False, - ) - expected = DataFrame( - [[date(2001, 1, 1), 10, 10.0], [date(2001, 2, 1), 1, 11.0]], - columns=["ym", "day", "a"], - ) - expected["ym"] = expected["ym"].astype("datetime64[ns]") - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_date_parser_resolution_if_not_ns(all_parsers): - # see gh-10245 - parser = all_parsers - data = """\ -date,time,prn,rxstatus -2013-11-03,19:00:00,126,00E80000 -2013-11-03,19:00:00,23,00E80000 -2013-11-03,19:00:00,13,00E80000 -""" - - def date_parser(dt, time): - try: - arr = dt + "T" + time - except TypeError: - # dt & time are date/time objects - arr = [datetime.combine(d, t) for d, t in zip(dt, time)] - return np.array(arr, dtype="datetime64[s]") - - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - date_parser=date_parser, - parse_dates={"datetime": ["date", "time"]}, - index_col=["datetime", "prn"], - ) - - datetimes = np.array(["2013-11-03T19:00:00"] * 3, dtype="datetime64[s]") - expected = DataFrame( - data={"rxstatus": ["00E80000"] * 3}, - index=MultiIndex.from_arrays( - [datetimes, [126, 23, 13]], - names=["datetime", "prn"], - ), - ) - tm.assert_frame_equal(result, expected) - - def test_parse_date_column_with_empty_string(all_parsers): # see gh-6428 parser = all_parsers @@ -1789,6 +411,7 @@ def test_parse_timezone(all_parsers): end="2018-01-04 09:05:00", freq="1min", tz=timezone(timedelta(minutes=540)), + unit="s", )._with_freq(None) expected_data = {"dt": dti, "val": [23350, 23400, 23400, 23400, 23400]} @@ -1827,7 +450,7 @@ def test_parse_delimited_date_swap_no_warning( all_parsers, date_string, dayfirst, expected, request ): parser = all_parsers - expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") + expected = DataFrame({0: [expected]}, dtype="datetime64[s]") if parser.engine == "pyarrow": if not dayfirst: # "CSV parse error: Empty CSV file or block" @@ -1860,7 +483,7 @@ def test_parse_delimited_date_swap_with_warning( all_parsers, date_string, dayfirst, expected ): parser = all_parsers - expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") + expected = DataFrame({0: [expected]}, dtype="datetime64[s]") warning_msg = ( "Parsing dates in .* format when dayfirst=.* was specified. " "Pass `dayfirst=.*` or specify a format to silence this warning." 
@@ -1895,11 +518,6 @@ def test_parse_multiple_delimited_dates_with_swap_warnings():
     [
         (None, ["val"], ["date", "time"], "date, time"),
         (None, ["val"], [0, "time"], "time"),
-        (None, ["val"], [["date", "time"]], "date, time"),
-        (None, ["val"], [[0, "time"]], "time"),
-        (None, ["val"], {"date": [0, "time"]}, "time"),
-        (None, ["val"], {"date": ["date", "time"]}, "date, time"),
-        (None, ["val"], [["date", "time"], "date"], "date, time"),
         (["date1", "time1", "temperature"], None, ["date", "time"], "date, time"),
         (
             ["date1", "time1", "temperature"],
@@ -1917,20 +535,10 @@ def test_missing_parse_dates_column_raises(
     content = StringIO("date,time,val\n2020-01-31,04:20:32,32\n")
     msg = f"Missing column provided to 'parse_dates': '{missing_cols}'"
 
-    depr_msg = (
-        "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
-    )
-    warn = FutureWarning
-    if isinstance(parse_dates, list) and all(
-        isinstance(x, (int, str)) for x in parse_dates
-    ):
-        warn = None
-
     with pytest.raises(ValueError, match=msg):
-        with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False):
-            parser.read_csv(
-                content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates
-            )
+        parser.read_csv(
+            content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates
+        )
 
 
 @xfail_pyarrow  # mismatched shape
@@ -1960,40 +568,7 @@ def test_date_parser_multiindex_columns(all_parsers):
 1,2
 2019-12-31,6"""
     result = parser.read_csv(StringIO(data), parse_dates=[("a", "1")], header=[0, 1])
-    expected = DataFrame(
-        {("a", "1"): Timestamp("2019-12-31").as_unit("ns"), ("b", "2"): [6]}
-    )
-    tm.assert_frame_equal(result, expected)
-
-
-@xfail_pyarrow  # TypeError: an integer is required
-@pytest.mark.parametrize(
-    "parse_spec, col_name",
-    [
-        ([[("a", "1"), ("b", "2")]], ("a_b", "1_2")),
-        ({("foo", "1"): [("a", "1"), ("b", "2")]}, ("foo", "1")),
-    ],
-)
-def test_date_parser_multiindex_columns_combine_cols(all_parsers, parse_spec, col_name):
-    parser = all_parsers
-    data = """a,b,c
-1,2,3
-2019-12,-31,6"""
-
-    depr_msg = (
-        "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
-    )
-    with tm.assert_produces_warning(
-        FutureWarning, match=depr_msg, check_stacklevel=False
-    ):
-        result = parser.read_csv(
-            StringIO(data),
-            parse_dates=parse_spec,
-            header=[0, 1],
-        )
-    expected = DataFrame(
-        {col_name: Timestamp("2019-12-31").as_unit("ns"), ("c", "3"): [6]}
-    )
+    expected = DataFrame({("a", "1"): Timestamp("2019-12-31"), ("b", "2"): [6]})
     tm.assert_frame_equal(result, expected)
 
 
@@ -2027,26 +602,7 @@ def test_date_parser_usecols_thousands(all_parsers):
         thousands="-",
     )
     expected = DataFrame({"B": [3, 4], "C": [Timestamp("20-09-2001 01:00:00")] * 2})
-    tm.assert_frame_equal(result, expected)
-
-
-@xfail_pyarrow  # mismatched shape
-def test_parse_dates_and_keep_original_column(all_parsers):
-    # GH#13378
-    parser = all_parsers
-    data = """A
-20150908
-20150909
-"""
-    depr_msg = "The 'keep_date_col' keyword in pd.read_csv is deprecated"
-    with tm.assert_produces_warning(
-        FutureWarning, match=depr_msg, check_stacklevel=False
-    ):
-        result = parser.read_csv(
-            StringIO(data), parse_dates={"date": ["A"]}, keep_date_col=True
-        )
-    expected_data = [Timestamp("2015-09-08"), Timestamp("2015-09-09")]
-    expected = DataFrame({"date": expected_data, "A": expected_data})
+    expected["C"] = expected["C"].astype("M8[s]")
     tm.assert_frame_equal(result, expected)
 
 
@@ -2056,7 +612,7 @@ def test_dayfirst_warnings():
     # CASE 1: valid input
     input = "date\n31/12/2014\n10/03/2011"
     expected = DatetimeIndex(
-        ["2014-12-31", "2011-03-10"], dtype="datetime64[ns]", freq=None, name="date"
+        ["2014-12-31", "2011-03-10"], dtype="datetime64[s]", freq=None, name="date"
     )
     warning_msg = (
         "Parsing dates in .* format when dayfirst=.* was specified. "
        "Pass `dayfirst=.*` or specify a format to silence this warning."
@@ -2117,7 +673,7 @@ def test_dayfirst_warnings_no_leading_zero(date_string, dayfirst):
     # GH47880
     initial_value = f"date\n{date_string}"
     expected = DatetimeIndex(
-        ["2014-01-31"], dtype="datetime64[ns]", freq=None, name="date"
+        ["2014-01-31"], dtype="datetime64[s]", freq=None, name="date"
     )
     warning_msg = (
         "Parsing dates in .* format when dayfirst=.* was specified. "
         "Pass `dayfirst=.*` or specify a format to silence this warning."
@@ -2147,14 +703,7 @@ def test_infer_first_column_as_index(all_parsers):
 
 
 @xfail_pyarrow  # pyarrow engine doesn't support passing a dict for na_values
-@pytest.mark.parametrize(
-    ("key", "value", "warn"),
-    [
-        ("date_parser", lambda x: pd.to_datetime(x, format="%Y-%m-%d"), FutureWarning),
-        ("date_format", "%Y-%m-%d", None),
-    ],
-)
-def test_replace_nans_before_parsing_dates(all_parsers, key, value, warn):
+def test_replace_nans_before_parsing_dates(all_parsers):
     # GH#26203
     parser = all_parsers
     data = """Test
@@ -2164,13 +713,11 @@
 #
 2017-09-09
 """
-    result = parser.read_csv_check_warnings(
-        warn,
-        "use 'date_format' instead",
+    result = parser.read_csv(
         StringIO(data),
         na_values={"Test": ["#", "0"]},
         parse_dates=["Test"],
-        **{key: value},
+        date_format="%Y-%m-%d",
     )
     expected = DataFrame(
         {
@@ -2181,7 +728,8 @@
             pd.NaT,
             Timestamp("2017-09-09"),
         ]
-        }
+        },
+        dtype="M8[s]",
     )
     tm.assert_frame_equal(result, expected)
 
@@ -2196,6 +744,7 @@ def test_parse_dates_and_string_dtype(all_parsers):
     result = parser.read_csv(StringIO(data), dtype="string", parse_dates=["b"])
     expected = DataFrame({"a": ["1"], "b": [Timestamp("2019-12-31")]})
     expected["a"] = expected["a"].astype("string")
+    expected["b"] = expected["b"].astype("M8[s]")
     tm.assert_frame_equal(result, expected)
 
 
@@ -2215,7 +764,7 @@ def test_parse_dot_separated_dates(all_parsers):
     else:
         expected_index = DatetimeIndex(
             ["2003-03-27 14:55:00", "2003-08-03 15:20:00"],
-            dtype="datetime64[ns]",
+            dtype="datetime64[ms]",
             name="a",
         )
         warn = UserWarning
@@ -2248,34 +797,8 @@ def test_parse_dates_dict_format(all_parsers):
         {
             "a": [Timestamp("2019-12-31"), Timestamp("2020-12-31")],
             "b": [Timestamp("2019-12-31"), Timestamp("2020-12-31")],
-        }
-    )
-    tm.assert_frame_equal(result, expected)
-
-
-@pytest.mark.parametrize(
-    "key, parse_dates", [("a_b", [[0, 1]]), ("foo", {"foo": [0, 1]})]
-)
-def test_parse_dates_dict_format_two_columns(all_parsers, key, parse_dates):
-    # GH#51240
-    parser = all_parsers
-    data = """a,b
-31-,12-2019
-31-,12-2020"""
-
-    depr_msg = (
-        "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
-    )
-    with tm.assert_produces_warning(
-        FutureWarning, match=depr_msg, check_stacklevel=False
-    ):
-        result = parser.read_csv(
-            StringIO(data), date_format={key: "%d- %m-%Y"}, parse_dates=parse_dates
-        )
-    expected = DataFrame(
-        {
-            key: [Timestamp("2019-12-31"), Timestamp("2020-12-31")],
-        }
+        },
+        dtype="M8[s]",
     )
     tm.assert_frame_equal(result, expected)
 
@@ -2308,9 +831,6 @@ def test_parse_dates_arrow_engine(all_parsers):
 2000-01-01 00:00:01,1"""
 
     result = parser.read_csv(StringIO(data), parse_dates=["a"])
-    # TODO: make unit check more specific
-    if parser.engine == "pyarrow":
-        result["a"] = result["a"].dt.as_unit("ns")
     expected = DataFrame(
         {
             "a": [
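
Note: the deletions above remove the tests for nested sequences and dicts in 'parse_dates' (and for 'keep_date_col' and 'date_parser'), whose deprecation cycle these files previously exercised. A hedged migration sketch for code that relied on combining two date/time columns, using only documented read_csv/to_datetime behavior; the column names mirror the deleted tests:

    from io import StringIO

    import pandas as pd

    data = "ID,date,nominalTime\nKORD,19990127, 19:00:00\nKORD,19990127, 20:00:00\n"
    df = pd.read_csv(StringIO(data))
    # Instead of parse_dates={"nominal": [1, 2]}, combine after reading:
    df["nominal"] = pd.to_datetime(
        df["date"].astype(str) + df["nominalTime"], format="%Y%m%d %H:%M:%S"
    )
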
diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index b62fcc04c375c..45d630c545565 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -4,7 +4,6 @@ engine is set to 'python-fwf' internally. """ -from datetime import datetime from io import ( BytesIO, StringIO, @@ -284,17 +283,6 @@ def test_fwf_regression(): 2009164205000 9.5810 9.0896 8.4009 7.4652 6.0322 5.8189 5.4379 2009164210000 9.6034 9.0897 8.3822 7.4905 6.0908 5.7904 5.4039 """ - - with tm.assert_produces_warning(FutureWarning, match="use 'date_format' instead"): - result = read_fwf( - StringIO(data), - index_col=0, - header=None, - names=names, - widths=widths, - parse_dates=True, - date_parser=lambda s: datetime.strptime(s, "%Y%j%H%M%S"), - ) expected = DataFrame( [ [9.5403, 9.4105, 8.6571, 7.8372, 6.0612, 5.8843, 5.5192], @@ -310,11 +298,11 @@ def test_fwf_regression(): "2009-06-13 20:40:00", "2009-06-13 20:50:00", "2009-06-13 21:00:00", - ] + ], + dtype="M8[us]", ), columns=["SST", "T010", "T020", "T030", "T060", "T080", "T100"], ) - tm.assert_frame_equal(result, expected) result = read_fwf( StringIO(data), index_col=0, @@ -324,6 +312,7 @@ def test_fwf_regression(): parse_dates=True, date_format="%Y%j%H%M%S", ) + expected.index = expected.index.astype("M8[s]") tm.assert_frame_equal(result, expected) @@ -593,10 +582,7 @@ def test_skiprows_inference(): """.strip() skiprows = 2 - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True) - + expected = read_csv(StringIO(data), skiprows=skiprows, sep=r"\s+") result = read_fwf(StringIO(data), skiprows=skiprows) tm.assert_frame_equal(result, expected) @@ -611,10 +597,7 @@ def test_skiprows_by_index_inference(): """.strip() skiprows = [0, 2] - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True) - + expected = read_csv(StringIO(data), skiprows=skiprows, sep=r"\s+") result = read_fwf(StringIO(data), skiprows=skiprows) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index 3cd2351f84c7a..99642ee4befc6 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -42,7 +42,9 @@ def test_skip_rows_bug(all_parsers, skiprows): StringIO(text), skiprows=skiprows, header=None, index_col=0, parse_dates=True ) index = Index( - [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + dtype="M8[s]", + name=0, ) expected = DataFrame( @@ -85,7 +87,9 @@ def test_skip_rows_blank(all_parsers): StringIO(text), skiprows=6, header=None, index_col=0, parse_dates=True ) index = Index( - [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + dtype="M8[s]", + name=0, ) expected = DataFrame( @@ -187,7 +191,7 @@ def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: The 'delim_whitespace' option is not supported +@xfail_pyarrow # ValueError: the 'pyarrow' engine does not support regex separators 
@pytest.mark.parametrize( "lineterminator", ["\n", "\r\n", "\r"], # "LF" # "CRLF" # "CR" @@ -218,16 +222,12 @@ def test_skiprows_lineterminator(all_parsers, lineterminator, request): data = data.replace("\n", lineterminator) - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), - skiprows=1, - delim_whitespace=True, - names=["date", "time", "var", "flag", "oflag"], - ) + result = parser.read_csv( + StringIO(data), + skiprows=1, + sep=r"\s+", + names=["date", "time", "var", "flag", "oflag"], + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 44a55cf3be240..07f84466e3ac2 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -44,12 +44,7 @@ def test_c_engine(self): data = "a b c\n1 2 3" msg = "does not support" - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - # specify C engine with unsupported options (raise) - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - read_csv(StringIO(data), engine="c", sep=None, delim_whitespace=False) with pytest.raises(ValueError, match=msg): read_csv(StringIO(data), engine="c", sep=r"\s") with pytest.raises(ValueError, match=msg): @@ -58,8 +53,6 @@ def test_c_engine(self): read_csv(StringIO(data), engine="c", skipfooter=1) # specify C-unsupported options without python-unsupported options - with tm.assert_produces_warning((parsers.ParserWarning, FutureWarning)): - read_csv(StringIO(data), sep=None, delim_whitespace=False) with tm.assert_produces_warning(parsers.ParserWarning): read_csv(StringIO(data), sep=r"\s") with tm.assert_produces_warning(parsers.ParserWarning): @@ -154,14 +147,8 @@ def test_pyarrow_engine(self): elif default == "on_bad_lines": kwargs[default] = "warn" - warn = None - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - if "delim_whitespace" in kwargs: - warn = FutureWarning - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(warn, match=depr_msg): - read_csv(StringIO(data), engine="pyarrow", **kwargs) + read_csv(StringIO(data), engine="pyarrow", **kwargs) def test_on_bad_lines_callable_python_or_pyarrow(self, all_parsers): # GH 5686 diff --git a/pandas/tests/io/parser/usecols/test_parse_dates.py b/pandas/tests/io/parser/usecols/test_parse_dates.py index ab98857e0c178..cc54f2487aa60 100644 --- a/pandas/tests/io/parser/usecols/test_parse_dates.py +++ b/pandas/tests/io/parser/usecols/test_parse_dates.py @@ -26,42 +26,6 @@ ) -@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) -def test_usecols_with_parse_dates(all_parsers, usecols): - # see gh-9755 - data = """a,b,c,d,e -0,1,2014-01-01,09:00,4 -0,1,2014-01-02,10:00,4""" - parser = all_parsers - parse_dates = [[1, 2]] - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - - cols = { - "a": [0, 0], - "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], - } - expected = DataFrame(cols, columns=["c_d", "a"]) - if parser.engine == "pyarrow": - with pytest.raises(ValueError, match=_msg_pyarrow_requires_names): - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_csv( - StringIO(data), usecols=usecols, 
parse_dates=parse_dates - ) - return - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), usecols=usecols, parse_dates=parse_dates - ) - tm.assert_frame_equal(result, expected) - - @skip_pyarrow # pyarrow.lib.ArrowKeyError: Column 'fdate' in include_columns def test_usecols_with_parse_dates2(all_parsers): # see gh-13604 @@ -106,7 +70,7 @@ def test_usecols_with_parse_dates3(all_parsers): parse_dates = [0] cols = { - "a": Timestamp("2016-09-21").as_unit("ns"), + "a": Timestamp("2016-09-21"), "b": [1], "c": [1], "d": [2], @@ -121,75 +85,3 @@ def test_usecols_with_parse_dates3(all_parsers): result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) tm.assert_frame_equal(result, expected) - - -def test_usecols_with_parse_dates4(all_parsers): - data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8" - usecols = list("abcdefghij") - parse_dates = [[0, 1]] - parser = all_parsers - - cols = { - "a_b": "2016/09/21 1", - "c": [1], - "d": [2], - "e": [3], - "f": [4], - "g": [5], - "h": [6], - "i": [7], - "j": [8], - } - expected = DataFrame(cols, columns=["a_b"] + list("cdefghij")) - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), - usecols=usecols, - parse_dates=parse_dates, - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) -@pytest.mark.parametrize( - "names", - [ - list("abcde"), # Names span all columns in original data. - list("acd"), # Names span only the selected columns. - ], -) -def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names, request): - # see gh-9755 - s = """0,1,2014-01-01,09:00,4 -0,1,2014-01-02,10:00,4""" - parse_dates = [[1, 2]] - parser = all_parsers - - if parser.engine == "pyarrow" and not (len(names) == 3 and usecols[0] == 0): - mark = pytest.mark.xfail( - reason="Length mismatch in some cases, UserWarning in other" - ) - request.applymarker(mark) - - cols = { - "a": [0, 0], - "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], - } - expected = DataFrame(cols, columns=["c_d", "a"]) - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols - ) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index d55066d2d70bb..82b42beb38ae0 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -158,7 +158,8 @@ def test_usecols_single_string(all_parsers): parser.read_csv(StringIO(data), usecols="foo") -@skip_pyarrow # CSV parse error in one case, AttributeError in another +# ArrowKeyError: Column 'a' in include_columns does not exist in CSV file +@skip_pyarrow @pytest.mark.parametrize( "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"] ) @@ -254,29 +255,12 @@ def test_usecols_regex_sep(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow # Column 'a' in include_columns does not exist in CSV file def 
test_usecols_with_whitespace(all_parsers): parser = all_parsers data = "a b c\n4 apple bat 5.7\n8 orange cow 10" - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - - if parser.engine == "pyarrow": - msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine" - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_csv( - StringIO(data), delim_whitespace=True, usecols=("a", "b") - ) - return - - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), delim_whitespace=True, usecols=("a", "b") - ) + result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b")) expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index e33ddaf3b81f0..ba108370a4a92 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -317,3 +317,14 @@ def test_read_infer_string(tmp_path, setup_path): columns=Index(["a"], dtype="string[pyarrow_numpy]"), ) tm.assert_frame_equal(result, expected) + + +def test_hdfstore_read_datetime64_unit_s(tmp_path, setup_path): + # GH 59004 + df_s = DataFrame(["2001-01-01", "2002-02-02"], dtype="datetime64[s]") + path = tmp_path / setup_path + with HDFStore(path, mode="w") as store: + store.put("df_s", df_s) + with HDFStore(path, mode="r") as store: + df_fromstore = store.get("df_s") + tm.assert_frame_equal(df_s, df_fromstore) diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index 51ee289c8e27a..3ad05cec3bca3 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -236,8 +236,10 @@ def test_table_values_dtypes_roundtrip(setup_path): df1["float322"] = 1.0 df1["float322"] = df1["float322"].astype("float32") df1["bool"] = df1["float32"] > 0 - df1["time1"] = Timestamp("20130101") - df1["time2"] = Timestamp("20130102") + df1["time_s_1"] = Timestamp("20130101") + df1["time_s_2"] = Timestamp("20130101 00:00:00") + df1["time_ms"] = Timestamp("20130101 00:00:00.000") + df1["time_ns"] = Timestamp("20130102 00:00:00.000000000") store.append("df_mixed_dtypes1", df1) result = store.select("df_mixed_dtypes1").dtypes.value_counts() @@ -252,7 +254,9 @@ def test_table_values_dtypes_roundtrip(setup_path): "int8": 1, "int64": 1, "object": 1, - "datetime64[ns]": 2, + "datetime64[s]": 2, + "datetime64[ms]": 1, + "datetime64[ns]": 1, }, name="count", ) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 471f7b8958ee4..3ce30e313cc30 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -613,10 +613,14 @@ def test_store_index_name(setup_path): @pytest.mark.parametrize("table_format", ["table", "fixed"]) def test_store_index_name_numpy_str(tmp_path, table_format, setup_path, unit, tz): # GH #13492 - idx = DatetimeIndex( - [dt.date(2000, 1, 1), dt.date(2000, 1, 2)], - name="cols\u05d2", - ).tz_localize(tz) + idx = ( + DatetimeIndex( + [dt.date(2000, 1, 1), dt.date(2000, 1, 2)], + name="cols\u05d2", + ) + .tz_localize(tz) + .as_unit(unit) + ) idx1 = ( DatetimeIndex( [dt.date(2010, 1, 1), dt.date(2010, 1, 2)], diff --git a/pandas/tests/io/sas/test_sas7bdat.py 
b/pandas/tests/io/sas/test_sas7bdat.py index 889ef61740a2c..fc5df6d9babcb 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -7,7 +7,10 @@ import numpy as np import pytest -from pandas.compat import IS64 +from pandas.compat._constants import ( + IS64, + WASM, +) from pandas.errors import EmptyDataError import pandas as pd @@ -168,6 +171,7 @@ def test_airline(datapath): tm.assert_frame_equal(df, df0) +@pytest.mark.skipif(WASM, reason="Pyodide/WASM has 32-bitness") def test_date_time(datapath): # Support of different SAS date/datetime formats (PR #15871) fname = datapath("io", "sas", "data", "datetime.sas7bdat") @@ -253,6 +257,7 @@ def test_corrupt_read(datapath): pd.read_sas(fname) +@pytest.mark.xfail(WASM, reason="failing with currently set tolerances on WASM") def test_max_sas_date(datapath): # GH 20927 # NB. max datetime in SAS dataset is 31DEC9999:23:59:59.999 @@ -292,6 +297,7 @@ def test_max_sas_date(datapath): tm.assert_frame_equal(df, expected) +@pytest.mark.xfail(WASM, reason="failing with currently set tolerances on WASM") def test_max_sas_date_iterator(datapath): # GH 20927 # when called as an iterator, only those chunks with a date > pd.Timestamp.max @@ -337,6 +343,7 @@ def test_max_sas_date_iterator(datapath): tm.assert_frame_equal(results[1], expected[1]) +@pytest.mark.skipif(WASM, reason="Pyodide/WASM has 32-bitness") def test_null_date(datapath): fname = datapath("io", "sas", "data", "dates_null.sas7bdat") df = pd.read_sas(fname, encoding="utf-8") diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index ad729d2346a3b..e4b4d3a82669d 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -19,7 +19,10 @@ import numpy as np import pytest -from pandas.compat import is_platform_windows +from pandas.compat import ( + WASM, + is_platform_windows, +) import pandas as pd import pandas._testing as tm @@ -163,6 +166,7 @@ def test_iterator(self): tm.assert_frame_equal(first, expected.iloc[[0]]) tm.assert_frame_equal(pd.concat(it), expected.iloc[1:]) + @pytest.mark.skipif(WASM, reason="limited file system access on WASM") @pytest.mark.parametrize( "reader, module, error_class, fn_ext", [ @@ -228,6 +232,7 @@ def test_write_missing_parent_directory(self, method, module, error_class, fn_ex ): method(dummy_frame, path) + @pytest.mark.skipif(WASM, reason="limited file system access on WASM") @pytest.mark.parametrize( "reader, module, error_class, fn_ext", [ @@ -382,6 +387,7 @@ def mmap_file(datapath): class TestMMapWrapper: + @pytest.mark.skipif(WASM, reason="limited file system access on WASM") def test_constructor_bad_file(self, mmap_file): non_file = StringIO("I am not a file") non_file.fileno = lambda: -1 @@ -404,6 +410,7 @@ def test_constructor_bad_file(self, mmap_file): with pytest.raises(ValueError, match=msg): icom._maybe_memory_map(target, True) + @pytest.mark.skipif(WASM, reason="limited file system access on WASM") def test_next(self, mmap_file): with open(mmap_file, encoding="utf-8") as target: lines = target.readlines() @@ -587,6 +594,7 @@ def test_bad_encdoing_errors(): icom.get_handle(path, "w", errors="bad") +@pytest.mark.skipif(WASM, reason="limited file system access on WASM") def test_errno_attribute(): # GH 13872 with pytest.raises(FileNotFoundError, match="\\[Errno 2\\]") as err: diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index f6fb032b9d51a..c609ae999d47d 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ 
-72,7 +72,9 @@ def test_read_csv(cleared_fs, df1): w.write(text) df2 = read_csv("memory://test/test.csv", parse_dates=["dt"]) - tm.assert_frame_equal(df1, df2) + expected = df1.copy() + expected["dt"] = expected["dt"].astype("M8[s]") + tm.assert_frame_equal(df2, expected) def test_reasonable_error(monkeypatch, cleared_fs): @@ -95,7 +97,9 @@ def test_to_csv(cleared_fs, df1): df2 = read_csv("memory://test/test.csv", parse_dates=["dt"], index_col=0) - tm.assert_frame_equal(df1, df2) + expected = df1.copy() + expected["dt"] = expected["dt"].astype("M8[s]") + tm.assert_frame_equal(df2, expected) def test_to_excel(cleared_fs, df1): @@ -106,7 +110,9 @@ def test_to_excel(cleared_fs, df1): df2 = read_excel(path, parse_dates=["dt"], index_col=0) - tm.assert_frame_equal(df1, df2) + expected = df1.copy() + expected["dt"] = expected["dt"].astype("M8[s]") + tm.assert_frame_equal(df2, expected) @pytest.mark.parametrize("binary_mode", [False, True]) @@ -128,7 +134,9 @@ def test_to_csv_fsspec_object(cleared_fs, binary_mode, df1): ) assert not fsspec_object.closed - tm.assert_frame_equal(df1, df2) + expected = df1.copy() + expected["dt"] = expected["dt"].astype("M8[s]") + tm.assert_frame_equal(df2, expected) def test_csv_options(fsspectest): diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 4b2be41d0c9f9..17b89c9f31616 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -107,7 +107,11 @@ def from_uri(path): df1.to_markdown(path) df2 = df1 - tm.assert_frame_equal(df1, df2) + expected = df1[:] + if format in ["csv", "excel"]: + expected["dt"] = expected["dt"].dt.as_unit("s") + + tm.assert_frame_equal(df2, expected) def assert_equal_zip_safe(result: bytes, expected: bytes, compression: str): diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index f16f3a2a5c775..dfc9b4156ecab 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1044,25 +1044,15 @@ def test_header_inferred_from_rows_with_only_th(self, flavor_read_html): def test_parse_dates_list(self, flavor_read_html): df = DataFrame({"date": date_range("1/1/2001", periods=10)}) - expected = df.to_html() - res = flavor_read_html(StringIO(expected), parse_dates=[1], index_col=0) - tm.assert_frame_equal(df, res[0]) - res = flavor_read_html(StringIO(expected), parse_dates=["date"], index_col=0) - tm.assert_frame_equal(df, res[0]) - - def test_parse_dates_combine(self, flavor_read_html): - raw_dates = Series(date_range("1/1/2001", periods=10)) - df = DataFrame( - { - "date": raw_dates.map(lambda x: str(x.date())), - "time": raw_dates.map(lambda x: str(x.time())), - } - ) - res = flavor_read_html( - StringIO(df.to_html()), parse_dates={"datetime": [1, 2]}, index_col=1 - ) - newdf = DataFrame({"datetime": raw_dates}) - tm.assert_frame_equal(newdf, res[0]) + + expected = df[:] + expected["date"] = expected["date"].dt.as_unit("s") + + str_df = df.to_html() + res = flavor_read_html(StringIO(str_df), parse_dates=[1], index_col=0) + tm.assert_frame_equal(expected, res[0]) + res = flavor_read_html(StringIO(str_df), parse_dates=["date"], index_col=0) + tm.assert_frame_equal(expected, res[0]) def test_wikipedia_states_table(self, datapath, flavor_read_html): data = datapath("io", "data", "html", "wikipedia_states.html") diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index de6d46492e916..c7d9300c0a638 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -321,6 +321,8 @@ def test_orc_dtype_backend_pyarrow(): ], } ) + # FIXME: 
without casting to ns we do not round-trip correctly + df["datetime_with_nat"] = df["datetime_with_nat"].astype("M8[ns]") bytes_data = df.copy().to_orc() result = read_orc(BytesIO(bytes_data), dtype_backend="pyarrow") diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 55be48eb572fd..af492b967bc1d 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -655,6 +655,7 @@ def test_read_empty_array(self, pa, dtype): "value": pd.array([], dtype=dtype), } ) + pytest.importorskip("pyarrow", "11.0.0") # GH 45694 expected = None if dtype == "float": @@ -669,8 +670,10 @@ def test_read_empty_array(self, pa, dtype): class TestParquetPyArrow(Base): + @pytest.mark.xfail(reason="datetime_with_nat unit doesn't round-trip") def test_basic(self, pa, df_full): df = df_full + pytest.importorskip("pyarrow", "11.0.0") # additional supported types for pyarrow dti = pd.date_range("20130101", periods=3, tz="Europe/Brussels") @@ -704,6 +707,14 @@ def test_to_bytes_without_path_or_buf_provided(self, pa, df_full): expected = df_full.copy() expected.loc[1, "string_with_nan"] = None + if pa_version_under11p0: + expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( + "M8[ns]" + ) + else: + expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( + "M8[ms]" + ) tm.assert_frame_equal(res, expected) def test_duplicate_columns(self, pa): @@ -938,6 +949,8 @@ def test_timestamp_nanoseconds(self, pa): check_round_trip(df, pa, write_kwargs={"version": ver}) def test_timezone_aware_index(self, request, pa, timezone_aware_date_list): + pytest.importorskip("pyarrow", "11.0.0") + if timezone_aware_date_list.tzinfo != datetime.timezone.utc: request.applymarker( pytest.mark.xfail( @@ -957,7 +970,11 @@ def test_timezone_aware_index(self, request, pa, timezone_aware_date_list): # they both implement datetime.tzinfo # they both wrap datetime.timedelta() # this use-case sets the resolution to 1 minute - check_round_trip(df, pa, check_dtype=False) + + expected = df[:] + if pa_version_under11p0: + expected.index = expected.index.as_unit("ns") + check_round_trip(df, pa, check_dtype=False, expected=expected) def test_filter_row_groups(self, pa): # https://github.com/pandas-dev/pandas/issues/26551 @@ -968,6 +985,7 @@ def test_filter_row_groups(self, pa): result = read_parquet(path, pa, filters=[("a", "==", 0)]) assert len(result) == 1 + @pytest.mark.filterwarnings("ignore:make_block is deprecated:DeprecationWarning") def test_read_dtype_backend_pyarrow_config(self, pa, df_full): import pyarrow @@ -984,13 +1002,14 @@ def test_read_dtype_backend_pyarrow_config(self, pa, df_full): if pa_version_under13p0: # pyarrow infers datetimes as us instead of ns expected["datetime"] = expected["datetime"].astype("timestamp[us][pyarrow]") - expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( - "timestamp[us][pyarrow]" - ) expected["datetime_tz"] = expected["datetime_tz"].astype( pd.ArrowDtype(pyarrow.timestamp(unit="us", tz="Europe/Brussels")) ) + expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( + "timestamp[ms][pyarrow]" + ) + check_round_trip( df, engine=pa, @@ -1014,6 +1033,7 @@ def test_read_dtype_backend_pyarrow_config_index(self, pa): expected=expected, ) + @pytest.mark.xfail(reason="pa.pandas_compat passes 'datetime64' to .astype") def test_columns_dtypes_not_invalid(self, pa): df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) @@ -1103,9 +1123,11 @@ def test_infer_string_large_string_type(self, 
tmp_path, pa): # df.to_parquet(tmp_path / "test.parquet") # result = read_parquet(tmp_path / "test.parquet") # assert result["strings"].dtype == "string" + # FIXME: don't leave commented-out class TestParquetFastParquet(Base): + @pytest.mark.xfail(reason="datetime_with_nat gets incorrect values") def test_basic(self, fp, df_full): df = df_full @@ -1250,6 +1272,25 @@ def test_error_on_using_partition_cols_and_partition_on( partition_cols=partition_cols, ) + def test_empty_dataframe(self, fp): + # GH #27339 + df = pd.DataFrame() + expected = df.copy() + check_round_trip(df, fp, expected=expected) + + @pytest.mark.xfail( + reason="fastparquet passed mismatched values/dtype to DatetimeArray " + "constructor, see https://github.com/dask/fastparquet/issues/891" + ) + def test_timezone_aware_index(self, fp, timezone_aware_date_list): + idx = 5 * [timezone_aware_date_list] + + df = pd.DataFrame(index=idx, data={"index_as_col": idx}) + + expected = df.copy() + expected.index.name = "index" + check_round_trip(df, fp, expected=expected) + def test_close_file_handle_on_read_error(self): with tm.ensure_clean("test.parquet") as path: pathlib.Path(path).write_bytes(b"breakit") diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index af77972d9fd26..df821fb740af8 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -19,10 +19,7 @@ import pytest from pandas._libs import lib -from pandas.compat import ( - pa_version_under13p0, - pa_version_under14p1, -) +from pandas.compat import pa_version_under14p1 from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td @@ -368,7 +365,7 @@ def create_and_load_postgres_datetz(conn): Timestamp("2000-01-01 08:00:00", tz="UTC"), Timestamp("2000-06-01 07:00:00", tz="UTC"), ] - return Series(expected_data, name="DateColWithTz") + return Series(expected_data, name="DateColWithTz").astype("M8[us, UTC]") def check_iris_frame(frame: DataFrame): @@ -1824,7 +1821,7 @@ def test_api_custom_dateparsing_error( pytest.mark.xfail(reason="failing combination of arguments") ) - expected = types_data_frame.astype({"DateCol": "datetime64[ns]"}) + expected = types_data_frame.astype({"DateCol": "datetime64[s]"}) result = read_sql( text, @@ -1847,10 +1844,12 @@ def test_api_custom_dateparsing_error( } ) - if not pa_version_under13p0: - # TODO: is this astype safe? 
- expected["DateCol"] = expected["DateCol"].astype("datetime64[us]") - + if conn_name == "postgresql_adbc_types" and pa_version_under14p1: + expected["DateCol"] = expected["DateCol"].astype("datetime64[ns]") + elif "postgres" in conn_name or "mysql" in conn_name: + expected["DateCol"] = expected["DateCol"].astype("datetime64[us]") + else: + expected["DateCol"] = expected["DateCol"].astype("datetime64[s]") tm.assert_frame_equal(result, expected) @@ -2300,9 +2299,16 @@ def test_api_escaped_table_name(conn, request): def test_api_read_sql_duplicate_columns(conn, request): # GH#53117 if "adbc" in conn: - request.node.add_marker( - pytest.mark.xfail(reason="pyarrow->pandas throws ValueError", strict=True) - ) + pa = pytest.importorskip("pyarrow") + if not ( + Version(pa.__version__) >= Version("16.0") + and conn in ["sqlite_adbc_conn", "postgresql_adbc_conn"] + ): + request.node.add_marker( + pytest.mark.xfail( + reason="pyarrow->pandas throws ValueError", strict=True + ) + ) conn = request.getfixturevalue(conn) if sql.has_table("test_table", conn): with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: @@ -2828,7 +2834,9 @@ def test_datetime_with_timezone_table(conn, request): conn = request.getfixturevalue(conn) expected = create_and_load_postgres_datetz(conn) result = sql.read_sql_table("datetz", conn) - tm.assert_frame_equal(result, expected.to_frame()) + + exp_frame = expected.to_frame() + tm.assert_frame_equal(result, exp_frame) @pytest.mark.parametrize("conn", sqlalchemy_connectable) @@ -2840,7 +2848,7 @@ def test_datetime_with_timezone_roundtrip(conn, request): # For dbs that support timestamps with timezones, should get back UTC # otherwise naive data should be returned expected = DataFrame( - {"A": date_range("2013-01-01 09:00:00", periods=3, tz="US/Pacific")} + {"A": date_range("2013-01-01 09:00:00", periods=3, tz="US/Pacific", unit="us")} ) assert expected.to_sql(name="test_datetime_tz", con=conn, index=False) == 3 @@ -2858,7 +2866,7 @@ def test_datetime_with_timezone_roundtrip(conn, request): if "sqlite" in conn_name: # read_sql_query does not return datetime type like read_sql_table assert isinstance(result.loc[0, "A"], str) - result["A"] = to_datetime(result["A"]) + result["A"] = to_datetime(result["A"]).dt.as_unit("us") tm.assert_frame_equal(result, expected) @@ -2869,7 +2877,9 @@ def test_out_of_bounds_datetime(conn, request): data = DataFrame({"date": datetime(9999, 1, 1)}, index=[0]) assert data.to_sql(name="test_datetime_obb", con=conn, index=False) == 1 result = sql.read_sql_table("test_datetime_obb", conn) - expected = DataFrame([pd.NaT], columns=["date"]) + expected = DataFrame( + np.array([datetime(9999, 1, 1)], dtype="M8[us]"), columns=["date"] + ) tm.assert_frame_equal(result, expected) @@ -2878,7 +2888,7 @@ def test_naive_datetimeindex_roundtrip(conn, request): # GH 23510 # Ensure that a naive DatetimeIndex isn't converted to UTC conn = request.getfixturevalue(conn) - dates = date_range("2018-01-01", periods=5, freq="6h")._with_freq(None) + dates = date_range("2018-01-01", periods=5, freq="6h", unit="us")._with_freq(None) expected = DataFrame({"nums": range(5)}, index=dates) assert expected.to_sql(name="foo_table", con=conn, index_label="info_date") == 5 result = sql.read_sql_table("foo_table", conn, index_col="info_date") @@ -2930,7 +2940,10 @@ def test_datetime(conn, request): # with read_table -> type information from schema used result = sql.read_sql_table("test_datetime", conn) result = result.drop("index", axis=1) - tm.assert_frame_equal(result, df) + + 
expected = df[:] + expected["A"] = expected["A"].astype("M8[us]") + tm.assert_frame_equal(result, expected) # with read_sql -> no type information -> sqlite has no native result = sql.read_sql_query("SELECT * FROM test_datetime", conn) @@ -2938,9 +2951,7 @@ def test_datetime(conn, request): if "sqlite" in conn_name: assert isinstance(result.loc[0, "A"], str) result["A"] = to_datetime(result["A"]) - tm.assert_frame_equal(result, df) - else: - tm.assert_frame_equal(result, df) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("conn", sqlalchemy_connectable) @@ -2955,16 +2966,17 @@ def test_datetime_NaT(conn, request): # with read_table -> type information from schema used result = sql.read_sql_table("test_datetime", conn) - tm.assert_frame_equal(result, df) + expected = df[:] + expected["A"] = expected["A"].astype("M8[us]") + tm.assert_frame_equal(result, expected) # with read_sql -> no type information -> sqlite has no native result = sql.read_sql_query("SELECT * FROM test_datetime", conn) if "sqlite" in conn_name: assert isinstance(result.loc[0, "A"], str) result["A"] = to_datetime(result["A"], errors="coerce") - tm.assert_frame_equal(result, df) - else: - tm.assert_frame_equal(result, df) + + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("conn", sqlalchemy_connectable) @@ -3956,6 +3968,7 @@ def test_self_join_date_columns(postgresql_psycopg2_engine): expected = DataFrame( [[1, Timestamp("2021", tz="UTC")] * 2], columns=["id", "created_dt"] * 2 ) + expected["created_dt"] = expected["created_dt"].astype("M8[us, UTC]") tm.assert_frame_equal(result, expected) # Cleanup diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index d7fb3c0049965..2534df6a82f89 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -181,9 +181,7 @@ def test_read_dta2(self, datapath): expected["monthly_date"] = expected["monthly_date"].astype("M8[s]") expected["quarterly_date"] = expected["quarterly_date"].astype("M8[s]") expected["half_yearly_date"] = expected["half_yearly_date"].astype("M8[s]") - expected["yearly_date"] = ( - expected["yearly_date"].astype("Period[s]").array.view("M8[s]") - ) + expected["yearly_date"] = expected["yearly_date"].astype("M8[s]") path1 = datapath("io", "data", "stata", "stata2_114.dta") path2 = datapath("io", "data", "stata", "stata2_115.dta") @@ -206,9 +204,9 @@ def test_read_dta2(self, datapath): # buggy test because of the NaT comparison on certain platforms # Format 113 test fails since it does not support tc and tC formats # tm.assert_frame_equal(parsed_113, expected) - tm.assert_frame_equal(parsed_114, expected, check_datetimelike_compat=True) - tm.assert_frame_equal(parsed_115, expected, check_datetimelike_compat=True) - tm.assert_frame_equal(parsed_117, expected, check_datetimelike_compat=True) + tm.assert_frame_equal(parsed_114, expected) + tm.assert_frame_equal(parsed_115, expected) + tm.assert_frame_equal(parsed_117, expected) @pytest.mark.parametrize( "file", ["stata3_113", "stata3_114", "stata3_115", "stata3_117"] @@ -225,11 +223,9 @@ def test_read_dta3(self, file, datapath): tm.assert_frame_equal(parsed, expected) - @pytest.mark.parametrize( - "file", ["stata4_111", "stata4_113", "stata4_114", "stata4_115", "stata4_117"] - ) - def test_read_dta4(self, file, datapath): - file = datapath("io", "data", "stata", f"{file}.dta") + @pytest.mark.parametrize("version", [110, 111, 113, 114, 115, 117]) + def test_read_dta4(self, version, datapath): + file = datapath("io", "data", "stata", 
f"stata4_{version}.dta") parsed = self.read_dta(file) expected = DataFrame.from_records( @@ -271,11 +267,11 @@ def test_read_dta4(self, file, datapath): # stata doesn't save .category metadata tm.assert_frame_equal(parsed, expected) - @pytest.mark.parametrize("file", ["stata4_105", "stata4_108"]) - def test_readold_dta4(self, file, datapath): + @pytest.mark.parametrize("version", [103, 104, 105, 108]) + def test_readold_dta4(self, version, datapath): # This test is the same as test_read_dta4 above except that the columns # had to be renamed to match the restrictions in older file format - file = datapath("io", "data", "stata", f"{file}.dta") + file = datapath("io", "data", "stata", f"stata4_{version}.dta") parsed = self.read_dta(file) expected = DataFrame.from_records( @@ -318,8 +314,19 @@ def test_readold_dta4(self, file, datapath): tm.assert_frame_equal(parsed, expected) # File containing strls - def test_read_dta12(self, datapath): - parsed_117 = self.read_dta(datapath("io", "data", "stata", "stata12_117.dta")) + @pytest.mark.parametrize( + "file", + [ + "stata12_117", + "stata12_be_117", + "stata12_118", + "stata12_be_118", + "stata12_119", + "stata12_be_119", + ], + ) + def test_read_dta_strl(self, file, datapath): + parsed = self.read_dta(datapath("io", "data", "stata", f"{file}.dta")) expected = DataFrame.from_records( [ [1, "abc", "abcdefghi"], @@ -329,10 +336,20 @@ def test_read_dta12(self, datapath): columns=["x", "y", "z"], ) - tm.assert_frame_equal(parsed_117, expected, check_dtype=False) + tm.assert_frame_equal(parsed, expected, check_dtype=False) - def test_read_dta18(self, datapath): - parsed_118 = self.read_dta(datapath("io", "data", "stata", "stata14_118.dta")) + # 117 is not included in this list as it uses ASCII strings + @pytest.mark.parametrize( + "file", + [ + "stata14_118", + "stata14_be_118", + "stata14_119", + "stata14_be_119", + ], + ) + def test_read_dta118_119(self, file, datapath): + parsed_118 = self.read_dta(datapath("io", "data", "stata", f"{file}.dta")) parsed_118["Bytes"] = parsed_118["Bytes"].astype("O") expected = DataFrame.from_records( [ @@ -356,7 +373,7 @@ def test_read_dta18(self, datapath): for col in parsed_118.columns: tm.assert_almost_equal(parsed_118[col], expected[col]) - with StataReader(datapath("io", "data", "stata", "stata14_118.dta")) as rdr: + with StataReader(datapath("io", "data", "stata", f"{file}.dta")) as rdr: vl = rdr.variable_labels() vl_expected = { "Unicode_Cities_Strl": "Here are some strls with Ünicode chars", @@ -954,8 +971,8 @@ def test_big_dates(self, datapath, temp_file): parsed_115 = read_stata(datapath("io", "data", "stata", "stata9_115.dta")) parsed_117 = read_stata(datapath("io", "data", "stata", "stata9_117.dta")) - tm.assert_frame_equal(expected, parsed_115, check_datetimelike_compat=True) - tm.assert_frame_equal(expected, parsed_117, check_datetimelike_compat=True) + tm.assert_frame_equal(expected, parsed_115) + tm.assert_frame_equal(expected, parsed_117) date_conversion = {c: c[-2:] for c in columns} # {c : c[-2:] for c in columns} @@ -967,7 +984,6 @@ def test_big_dates(self, datapath, temp_file): tm.assert_frame_equal( written_and_read_again.set_index("index"), expected.set_index(expected.index.astype(np.int32)), - check_datetimelike_compat=True, ) def test_dtype_conversion(self, datapath): @@ -1254,7 +1270,9 @@ def test_read_chunks_117( from_frame = parsed.iloc[pos : pos + chunksize, :].copy() from_frame = self._convert_categorical(from_frame) tm.assert_frame_equal( - from_frame, chunk, check_dtype=False, 
check_datetimelike_compat=True + from_frame, + chunk, + check_dtype=False, ) pos += chunksize @@ -1346,7 +1364,9 @@ def test_read_chunks_115( from_frame = parsed.iloc[pos : pos + chunksize, :].copy() from_frame = self._convert_categorical(from_frame) tm.assert_frame_equal( - from_frame, chunk, check_dtype=False, check_datetimelike_compat=True + from_frame, + chunk, + check_dtype=False, ) pos += chunksize @@ -1658,7 +1678,8 @@ def test_date_parsing_ignores_format_details(self, column, datapath): formatted = df.loc[0, column + "_fmt"] assert unformatted == formatted - def test_writer_117(self, temp_file): + @pytest.mark.parametrize("byteorder", ["little", "big"]) + def test_writer_117(self, byteorder, temp_file): original = DataFrame( data=[ [ @@ -1716,6 +1737,7 @@ def test_writer_117(self, temp_file): original.to_stata( path, convert_dates={"datetime": "tc"}, + byteorder=byteorder, convert_strl=["forced_strl"], version=117, ) @@ -1800,8 +1822,18 @@ def test_gzip_writing(self, temp_file): reread = read_stata(gz, index_col="index") tm.assert_frame_equal(df, reread) - def test_unicode_dta_118(self, datapath): - unicode_df = self.read_dta(datapath("io", "data", "stata", "stata16_118.dta")) + # 117 is not included in this list as it uses ASCII strings + @pytest.mark.parametrize( + "file", + [ + "stata16_118", + "stata16_be_118", + "stata16_119", + "stata16_be_119", + ], + ) + def test_unicode_dta_118_119(self, file, datapath): + unicode_df = self.read_dta(datapath("io", "data", "stata", f"{file}.dta")) columns = ["utf8", "latin1", "ascii", "utf8_strl", "ascii_strl"] values = [ @@ -1910,7 +1942,8 @@ def test_stata_119(self, datapath): assert reader._nvar == 32999 @pytest.mark.parametrize("version", [118, 119, None]) - def test_utf8_writer(self, version, temp_file): + @pytest.mark.parametrize("byteorder", ["little", "big"]) + def test_utf8_writer(self, version, byteorder, temp_file): cat = pd.Categorical(["a", "β", "ĉ"], ordered=True) data = DataFrame( [ @@ -1938,6 +1971,7 @@ def test_utf8_writer(self, version, temp_file): convert_strl=["strls"], variable_labels=variable_labels, write_index=False, + byteorder=byteorder, version=version, value_labels=value_labels, ) @@ -2002,7 +2036,7 @@ def test_read_write_ea_dtypes(self, dtype_backend, temp_file, tmp_path): tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) -@pytest.mark.parametrize("version", [105, 108, 111, 113, 114]) +@pytest.mark.parametrize("version", [105, 108, 110, 111, 113, 114]) def test_backward_compat(version, datapath): data_base = datapath("io", "data", "stata") ref = os.path.join(data_base, "stata-compat-118.dta") @@ -2012,7 +2046,19 @@ def test_backward_compat(version, datapath): tm.assert_frame_equal(old_dta, expected, check_dtype=False) -@pytest.mark.parametrize("version", [105, 108, 111, 113, 114, 118]) +@pytest.mark.parametrize("version", [103, 104]) +def test_backward_compat_nodateconversion(version, datapath): + # The Stata data format prior to 105 did not support a date format + # so read the raw values for comparison + data_base = datapath("io", "data", "stata") + ref = os.path.join(data_base, "stata-compat-118.dta") + old = os.path.join(data_base, f"stata-compat-{version}.dta") + expected = read_stata(ref, convert_dates=False) + old_dta = read_stata(old, convert_dates=False) + tm.assert_frame_equal(old_dta, expected, check_dtype=False) + + +@pytest.mark.parametrize("version", [105, 108, 110, 111, 113, 114, 118]) def test_bigendian(version, datapath): ref = datapath("io", "data", "stata", 
f"stata-compat-{version}.dta") big = datapath("io", "data", "stata", f"stata-compat-be-{version}.dta") @@ -2021,6 +2067,17 @@ def test_bigendian(version, datapath): tm.assert_frame_equal(big_dta, expected) +@pytest.mark.parametrize("version", [103, 104]) +def test_bigendian_nodateconversion(version, datapath): + # The Stata data format prior to 105 did not support a date format + # so read the raw values for comparison + ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta") + big = datapath("io", "data", "stata", f"stata-compat-be-{version}.dta") + expected = read_stata(ref, convert_dates=False) + big_dta = read_stata(big, convert_dates=False) + tm.assert_frame_equal(big_dta, expected) + + def test_direct_read(datapath, monkeypatch): file_path = datapath("io", "data", "stata", "stata-compat-118.dta") diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 97599722cb93f..357e6129dd8f1 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -14,6 +14,7 @@ import numpy as np import pytest +from pandas.compat import WASM from pandas.compat._optional import import_optional_dependency from pandas.errors import ( EmptyDataError, @@ -485,6 +486,7 @@ def test_empty_string_etree(val): read_xml(data, parser="etree") +@pytest.mark.skipif(WASM, reason="limited file system access on WASM") def test_wrong_file_path(parser): filename = os.path.join("does", "not", "exist", "books.xml") diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py index a85576ff13f5c..96ef50f9d7149 100644 --- a/pandas/tests/io/xml/test_xml_dtypes.py +++ b/pandas/tests/io/xml/test_xml_dtypes.py @@ -378,58 +378,6 @@ def test_parse_dates_true(parser): tm.assert_frame_equal(df_iter, df_expected) -def test_parse_dates_dictionary(parser): - xml = """ - - - square - 360 - 4.0 - 2020 - 12 - 31 - - - circle - 360 - - 2021 - 12 - 31 - - - triangle - 180 - 3.0 - 2022 - 12 - 31 - -""" - - df_result = read_xml( - StringIO(xml), parse_dates={"date_end": ["year", "month", "day"]}, parser=parser - ) - df_iter = read_xml_iterparse( - xml, - parser=parser, - parse_dates={"date_end": ["year", "month", "day"]}, - iterparse={"row": ["shape", "degrees", "sides", "year", "month", "day"]}, - ) - - df_expected = DataFrame( - { - "date_end": to_datetime(["2020-12-31", "2021-12-31", "2022-12-31"]), - "shape": ["square", "circle", "triangle"], - "degrees": [360, 360, 180], - "sides": [4.0, float("nan"), 3.0], - } - ) - - tm.assert_frame_equal(df_result, df_expected) - tm.assert_frame_equal(df_iter, df_expected) - - def test_day_first_parse_dates(parser): xml = """\ @@ -479,7 +427,5 @@ def test_day_first_parse_dates(parser): def test_wrong_parse_dates_type(xml_books, parser, iterparse): - with pytest.raises( - TypeError, match=("Only booleans, lists, and dictionaries are accepted") - ): + with pytest.raises(TypeError, match="Only booleans and lists are accepted"): read_xml(xml_books, parse_dates={"date"}, parser=parser, iterparse=iterparse) diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index 5a46cdcb051b6..d8c49d6d47f28 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -76,8 +76,6 @@ def _check_data(xp, rs): xp : matplotlib Axes object rs : matplotlib Axes object """ - import matplotlib.pyplot as plt - xp_lines = xp.get_lines() rs_lines = rs.get_lines() @@ -87,8 +85,6 @@ def _check_data(xp, rs): rsdata = rsl.get_xydata() tm.assert_almost_equal(xpdata, rsdata) - plt.close("all") - 
def _check_visible(collections, visible=True):
     """
@@ -495,6 +491,28 @@ def get_y_axis(ax):
     return ax._shared_axes["y"]
 
 
+def assert_is_valid_plot_return_object(objs) -> None:
+    from matplotlib.artist import Artist
+    from matplotlib.axes import Axes
+
+    if isinstance(objs, (Series, np.ndarray)):
+        if isinstance(objs, Series):
+            objs = objs._values
+        for el in objs.reshape(-1):
+            msg = (
+                "one of 'objs' is not a matplotlib Axes instance, "
+                f"type encountered {type(el).__name__!r}"
+            )
+            assert isinstance(el, (Axes, dict)), msg
+    else:
+        msg = (
+            "objs is neither an ndarray of Artist instances nor a single "
+            "Artist instance, tuple, or dict; 'objs' is a "
+            f"{type(objs).__name__!r}"
+        )
+        assert isinstance(objs, (Artist, tuple, dict)), msg
+
+
 def _check_plot_works(f, default_axes=False, **kwargs):
     """
     Create plot and ensure that plot return object is valid.
@@ -530,15 +548,11 @@ def _check_plot_works(f, default_axes=False, **kwargs):
         gen_plots = _gen_two_subplots
 
     ret = None
-    try:
-        fig = kwargs.get("figure", plt.gcf())
-        plt.clf()
-
-        for ret in gen_plots(f, fig, **kwargs):
-            tm.assert_is_valid_plot_return_object(ret)
+    fig = kwargs.get("figure", plt.gcf())
+    fig.clf()
 
-    finally:
-        plt.close(fig)
+    for ret in gen_plots(f, fig, **kwargs):
+        assert_is_valid_plot_return_object(ret)
 
     return ret
 
diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py
index adb56a40b0071..b381c4fce8430 100644
--- a/pandas/tests/plotting/frame/test_frame.py
+++ b/pandas/tests/plotting/frame/test_frame.py
@@ -1120,7 +1120,7 @@ def test_boxplot_return_type_invalid_type(self, return_type):
 
     def test_kde_df(self):
         pytest.importorskip("scipy")
-        df = DataFrame(np.random.default_rng(2).standard_normal((100, 4)))
+        df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)))
         ax = _check_plot_works(df.plot, kind="kde")
         expected = [pprint_thing(c) for c in df.columns]
         _check_legend_labels(ax, labels=expected)
@@ -1177,20 +1177,16 @@ def test_hist_df_series(self):
         _check_ticks_props(axes, xrot=40, yrot=0)
 
     def test_hist_df_series_cumulative_density(self):
-        from matplotlib.patches import Rectangle
-
         series = Series(np.random.default_rng(2).random(10))
         ax = series.plot.hist(cumulative=True, bins=4, density=True)
         # height of last bin (index 5) must be 1.0
-        rects = [x for x in ax.get_children() if isinstance(x, Rectangle)]
+        rects = [x for x in ax.get_children() if isinstance(x, mpl.patches.Rectangle)]
         tm.assert_almost_equal(rects[-1].get_height(), 1.0)
 
     def test_hist_df_series_cumulative(self):
-        from matplotlib.patches import Rectangle
-
         series = Series(np.random.default_rng(2).random(10))
         ax = series.plot.hist(cumulative=True, bins=4)
-        rects = [x for x in ax.get_children() if isinstance(x, Rectangle)]
+        rects = [x for x in ax.get_children() if isinstance(x, mpl.patches.Rectangle)]
         tm.assert_almost_equal(rects[-2].get_height(), 10.0)
 
@@ -1385,8 +1381,6 @@ def test_plot_int_columns(self):
         ],
     )
     def test_style_by_column(self, markers):
-        import matplotlib.pyplot as plt
-
         fig = plt.gcf()
         fig.clf()
         fig.add_subplot(111)
@@ -1969,9 +1963,6 @@ def test_sharex_and_ax(self):
         # https://github.com/pandas-dev/pandas/issues/9737 using gridspec,
         # the axis in fig.get_axis() are sorted differently than pandas
         # expected them, so make sure that only the right ones are removed
-        import matplotlib.pyplot as plt
-
-        plt.close("all")
         gs, axes = _generate_4_axes_via_gridspec()
 
         df = DataFrame(
@@ -2009,8 +2000,6 @@ def test_sharex_false_and_ax(self):
         # 
https://github.com/pandas-dev/pandas/issues/9737 using gridspec, # the axis in fig.get_axis() are sorted differently than pandas # expected them, so make sure that only the right ones are removed - import matplotlib.pyplot as plt - df = DataFrame( { "a": [1, 2, 3, 4, 5, 6], @@ -2035,8 +2024,6 @@ def test_sharey_and_ax(self): # https://github.com/pandas-dev/pandas/issues/9737 using gridspec, # the axis in fig.get_axis() are sorted differently than pandas # expected them, so make sure that only the right ones are removed - import matplotlib.pyplot as plt - gs, axes = _generate_4_axes_via_gridspec() df = DataFrame( @@ -2073,8 +2060,6 @@ def _check(axes): def test_sharey_and_ax_tight(self): # https://github.com/pandas-dev/pandas/issues/9737 using gridspec, - import matplotlib.pyplot as plt - df = DataFrame( { "a": [1, 2, 3, 4, 5, 6], @@ -2134,9 +2119,6 @@ def test_memory_leak(self, kind): def test_df_gridspec_patterns_vert_horiz(self): # GH 10819 - from matplotlib import gridspec - import matplotlib.pyplot as plt - ts = Series( np.random.default_rng(2).standard_normal(10), index=date_range("1/1/2000", periods=10), @@ -2149,14 +2131,14 @@ def test_df_gridspec_patterns_vert_horiz(self): ) def _get_vertical_grid(): - gs = gridspec.GridSpec(3, 1) + gs = mpl.gridspec.GridSpec(3, 1) fig = plt.figure() ax1 = fig.add_subplot(gs[:2, :]) ax2 = fig.add_subplot(gs[2, :]) return ax1, ax2 def _get_horizontal_grid(): - gs = gridspec.GridSpec(1, 3) + gs = mpl.gridspec.GridSpec(1, 3) fig = plt.figure() ax1 = fig.add_subplot(gs[:, :2]) ax2 = fig.add_subplot(gs[:, 2]) @@ -2217,9 +2199,6 @@ def _get_horizontal_grid(): def test_df_gridspec_patterns_boxed(self): # GH 10819 - from matplotlib import gridspec - import matplotlib.pyplot as plt - ts = Series( np.random.default_rng(2).standard_normal(10), index=date_range("1/1/2000", periods=10), @@ -2227,7 +2206,7 @@ def test_df_gridspec_patterns_boxed(self): # boxed def _get_boxed_grid(): - gs = gridspec.GridSpec(3, 3) + gs = mpl.gridspec.GridSpec(3, 3) fig = plt.figure() ax1 = fig.add_subplot(gs[:2, :2]) ax2 = fig.add_subplot(gs[:2, 2]) @@ -2595,8 +2574,6 @@ def test_plot_period_index_makes_no_right_shift(self, freq): def _generate_4_axes_via_gridspec(): - import matplotlib.pyplot as plt - gs = mpl.gridspec.GridSpec(2, 2) ax_tl = plt.subplot(gs[0, 0]) ax_ll = plt.subplot(gs[1, 0]) diff --git a/pandas/tests/plotting/frame/test_frame_color.py b/pandas/tests/plotting/frame/test_frame_color.py index 76d3b20aaa2c6..4b35e896e1a6c 100644 --- a/pandas/tests/plotting/frame/test_frame_color.py +++ b/pandas/tests/plotting/frame/test_frame_color.py @@ -364,14 +364,16 @@ def test_line_colors_and_styles_subplots_list_styles(self): _check_colors(ax.get_lines(), linecolors=[c]) def test_area_colors(self): - from matplotlib.collections import PolyCollection - custom_colors = "rgcby" df = DataFrame(np.random.default_rng(2).random((5, 5))) ax = df.plot.area(color=custom_colors) _check_colors(ax.get_lines(), linecolors=custom_colors) - poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)] + poly = [ + o + for o in ax.get_children() + if isinstance(o, mpl.collections.PolyCollection) + ] _check_colors(poly, facecolors=custom_colors) handles, _ = ax.get_legend_handles_labels() @@ -381,14 +383,15 @@ def test_area_colors(self): assert h.get_alpha() is None def test_area_colors_poly(self): - from matplotlib import cm - from matplotlib.collections import PolyCollection - df = DataFrame(np.random.default_rng(2).random((5, 5))) ax = df.plot.area(colormap="jet") - jet_colors = 
[cm.jet(n) for n in np.linspace(0, 1, len(df))] + jet_colors = [mpl.cm.jet(n) for n in np.linspace(0, 1, len(df))] _check_colors(ax.get_lines(), linecolors=jet_colors) - poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)] + poly = [ + o + for o in ax.get_children() + if isinstance(o, mpl.collections.PolyCollection) + ] _check_colors(poly, facecolors=jet_colors) handles, _ = ax.get_legend_handles_labels() @@ -397,15 +400,16 @@ def test_area_colors_poly(self): assert h.get_alpha() is None def test_area_colors_stacked_false(self): - from matplotlib import cm - from matplotlib.collections import PolyCollection - df = DataFrame(np.random.default_rng(2).random((5, 5))) - jet_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] + jet_colors = [mpl.cm.jet(n) for n in np.linspace(0, 1, len(df))] # When stacked=False, alpha is set to 0.5 - ax = df.plot.area(colormap=cm.jet, stacked=False) + ax = df.plot.area(colormap=mpl.cm.jet, stacked=False) _check_colors(ax.get_lines(), linecolors=jet_colors) - poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)] + poly = [ + o + for o in ax.get_children() + if isinstance(o, mpl.collections.PolyCollection) + ] jet_with_alpha = [(c[0], c[1], c[2], 0.5) for c in jet_colors] _check_colors(poly, facecolors=jet_with_alpha) diff --git a/pandas/tests/plotting/frame/test_frame_legend.py b/pandas/tests/plotting/frame/test_frame_legend.py index 402a4b9531e5d..a9723fe4ef871 100644 --- a/pandas/tests/plotting/frame/test_frame_legend.py +++ b/pandas/tests/plotting/frame/test_frame_legend.py @@ -26,9 +26,6 @@ class TestFrameLegend: ) def test_mixed_yerr(self): # https://github.com/pandas-dev/pandas/issues/39522 - from matplotlib.collections import LineCollection - from matplotlib.lines import Line2D - df = DataFrame([{"x": 1, "a": 1, "b": 1}, {"x": 2, "a": 2, "b": 3}]) ax = df.plot("x", "a", c="orange", yerr=0.1, label="orange") @@ -40,8 +37,8 @@ def test_mixed_yerr(self): else: result_handles = legend.legend_handles - assert isinstance(result_handles[0], LineCollection) - assert isinstance(result_handles[1], Line2D) + assert isinstance(result_handles[0], mpl.collections.LineCollection) + assert isinstance(result_handles[1], mpl.lines.Line2D) def test_legend_false(self): # https://github.com/pandas-dev/pandas/issues/40044 diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index 573f95eed15ef..4916963ab7c87 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -38,9 +38,7 @@ def _check_ax_limits(col, ax): class TestDataFramePlots: def test_stacked_boxplot_set_axis(self): # GH2980 - import matplotlib.pyplot as plt - - n = 80 + n = 30 df = DataFrame( { "Clinical": np.random.default_rng(2).choice([0, 1, 2, 3], n), @@ -51,10 +49,10 @@ def test_stacked_boxplot_set_axis(self): ) ax = df.plot(kind="bar", stacked=True) assert [int(x.get_text()) for x in ax.get_xticklabels()] == df.index.to_list() - ax.set_xticks(np.arange(0, 80, 10)) + ax.set_xticks(np.arange(0, n, 10)) plt.draw() # Update changes assert [int(x.get_text()) for x in ax.get_xticklabels()] == list( - np.arange(0, 80, 10) + np.arange(0, n, 10) ) @pytest.mark.slow @@ -227,12 +225,12 @@ def test_boxplot_numeric_data(self): # GH 22799 df = DataFrame( { - "a": date_range("2012-01-01", periods=100), - "b": np.random.default_rng(2).standard_normal(100), - "c": np.random.default_rng(2).standard_normal(100) + 2, - "d": date_range("2012-01-01", periods=100).astype(str), - "e": 
date_range("2012-01-01", periods=100, tz="UTC"), - "f": timedelta_range("1 days", periods=100), + "a": date_range("2012-01-01", periods=10), + "b": np.random.default_rng(2).standard_normal(10), + "c": np.random.default_rng(2).standard_normal(10) + 2, + "d": date_range("2012-01-01", periods=10).astype(str), + "e": date_range("2012-01-01", periods=10, tz="UTC"), + "f": timedelta_range("1 days", periods=10), } ) ax = df.plot(kind="box") @@ -282,8 +280,6 @@ def test_color_kwd(self, colors_kwd, expected): def test_colors_in_theme(self, scheme, expected): # GH: 40769 df = DataFrame(np.random.default_rng(2).random((10, 2))) - import matplotlib.pyplot as plt - plt.style.use(scheme) result = df.plot.box(return_type="dict") for k, v in expected.items(): @@ -334,8 +330,8 @@ def test_plot_xlabel_ylabel(self, vert): def test_plot_box(self, vert): # GH 54941 rng = np.random.default_rng(2) - df1 = DataFrame(rng.integers(0, 100, size=(100, 4)), columns=list("ABCD")) - df2 = DataFrame(rng.integers(0, 100, size=(100, 4)), columns=list("ABCD")) + df1 = DataFrame(rng.integers(0, 100, size=(10, 4)), columns=list("ABCD")) + df2 = DataFrame(rng.integers(0, 100, size=(10, 4)), columns=list("ABCD")) xlabel, ylabel = "x", "y" _, axs = plt.subplots(ncols=2, figsize=(10, 7), sharey=True) @@ -344,7 +340,6 @@ def test_plot_box(self, vert): for ax in axs: assert ax.get_xlabel() == xlabel assert ax.get_ylabel() == ylabel - mpl.pyplot.close() @pytest.mark.parametrize("vert", [True, False]) def test_boxplot_xlabel_ylabel(self, vert): @@ -374,7 +369,6 @@ def test_boxplot_group_xlabel_ylabel(self, vert): for subplot in ax: assert subplot.get_xlabel() == xlabel assert subplot.get_ylabel() == ylabel - mpl.pyplot.close() @pytest.mark.parametrize("vert", [True, False]) def test_boxplot_group_no_xlabel_ylabel(self, vert): @@ -389,7 +383,6 @@ def test_boxplot_group_no_xlabel_ylabel(self, vert): for subplot in ax: target_label = subplot.get_xlabel() if vert else subplot.get_ylabel() assert target_label == pprint_thing(["group"]) - mpl.pyplot.close() class TestDataFrameGroupByPlots: diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py index d4774a5cd0439..cfdfa7f723599 100644 --- a/pandas/tests/plotting/test_converter.py +++ b/pandas/tests/plotting/test_converter.py @@ -34,15 +34,11 @@ Second, ) -try: - from pandas.plotting._matplotlib import converter -except ImportError: - # try / except, rather than skip, to avoid internal refactoring - # causing an improper skip - pass - -pytest.importorskip("matplotlib.pyplot") +plt = pytest.importorskip("matplotlib.pyplot") dates = pytest.importorskip("matplotlib.dates") +units = pytest.importorskip("matplotlib.units") + +from pandas.plotting._matplotlib import converter @pytest.mark.single_cpu @@ -79,30 +75,22 @@ def test_dont_register_by_default(self): assert subprocess.check_call(call) == 0 def test_registering_no_warning(self): - plt = pytest.importorskip("matplotlib.pyplot") s = Series(range(12), index=date_range("2017", periods=12)) _, ax = plt.subplots() # Set to the "warn" state, in case this isn't the first test run register_matplotlib_converters() ax.plot(s.index, s.values) - plt.close() def test_pandas_plots_register(self): - plt = pytest.importorskip("matplotlib.pyplot") s = Series(range(12), index=date_range("2017", periods=12)) # Set to the "warn" state, in case this isn't the first test run with tm.assert_produces_warning(None) as w: s.plot() - try: - assert len(w) == 0 - finally: - plt.close() + assert len(w) == 0 def 
test_matplotlib_formatters(self): - units = pytest.importorskip("matplotlib.units") - # Can't make any assertion about the start state. # We we check that toggling converters off removes it, and toggling it # on restores it. @@ -113,8 +101,6 @@ def test_matplotlib_formatters(self): assert Timestamp in units.registry def test_option_no_warning(self): - pytest.importorskip("matplotlib.pyplot") - plt = pytest.importorskip("matplotlib.pyplot") s = Series(range(12), index=date_range("2017", periods=12)) _, ax = plt.subplots() @@ -126,12 +112,8 @@ def test_option_no_warning(self): register_matplotlib_converters() with cf.option_context("plotting.matplotlib.register_converters", False): ax.plot(s.index, s.values) - plt.close() def test_registry_resets(self): - units = pytest.importorskip("matplotlib.units") - dates = pytest.importorskip("matplotlib.dates") - # make a copy, to reset to original = dict(units.registry) @@ -214,7 +196,7 @@ def test_conversion_float(self, dtc): rtol = 0.5 * 10**-9 rs = dtc.convert(Timestamp("2012-1-1 01:02:03", tz="UTC"), None, None) - xp = converter.mdates.date2num(Timestamp("2012-1-1 01:02:03", tz="UTC")) + xp = dates.date2num(Timestamp("2012-1-1 01:02:03", tz="UTC")) tm.assert_almost_equal(rs, xp, rtol=rtol) rs = dtc.convert( @@ -235,10 +217,10 @@ def test_conversion_float(self, dtc): def test_conversion_outofbounds_datetime(self, dtc, values): # 2579 rs = dtc.convert(values, None, None) - xp = converter.mdates.date2num(values) + xp = dates.date2num(values) tm.assert_numpy_array_equal(rs, xp) rs = dtc.convert(values[0], None, None) - xp = converter.mdates.date2num(values[0]) + xp = dates.date2num(values[0]) assert rs == xp @pytest.mark.parametrize( @@ -261,7 +243,7 @@ def test_dateindex_conversion(self, freq, dtc): rtol = 10**-9 dateindex = date_range("2020-01-01", periods=10, freq=freq) rs = dtc.convert(dateindex, None, None) - xp = converter.mdates.date2num(dateindex._mpl_repr()) + xp = dates.date2num(dateindex._mpl_repr()) tm.assert_almost_equal(rs, xp, rtol=rtol) @pytest.mark.parametrize("offset", [Second(), Milli(), Micro(50)]) diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 4b4eeada58366..a9135ee583d91 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -46,6 +46,8 @@ mpl = pytest.importorskip("matplotlib") plt = pytest.importorskip("matplotlib.pyplot") +import pandas.plotting._matplotlib.converter as conv + class TestTSPlot: @pytest.mark.filterwarnings("ignore::UserWarning") @@ -73,7 +75,7 @@ def test_fontsize_set_correctly(self): def test_frame_inferred(self): # inferred freq - idx = date_range("1/1/1987", freq="MS", periods=100) + idx = date_range("1/1/1987", freq="MS", periods=10) idx = DatetimeIndex(idx.values, freq=None) df = DataFrame( @@ -82,7 +84,7 @@ def test_frame_inferred(self): _check_plot_works(df.plot) # axes freq - idx = idx[0:40].union(idx[45:99]) + idx = idx[0:4].union(idx[6:]) df2 = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 3)), index=idx ) @@ -111,7 +113,6 @@ def test_nonnumeric_exclude(self): fig, ax = mpl.pyplot.subplots() df.plot(ax=ax) # it works assert len(ax.get_lines()) == 1 # B was plotted - mpl.pyplot.close(fig) def test_nonnumeric_exclude_error(self): idx = date_range("1/1/1987", freq="YE", periods=3) @@ -122,7 +123,7 @@ def test_nonnumeric_exclude_error(self): @pytest.mark.parametrize("freq", ["s", "min", "h", "D", "W", "M", "Q", "Y"]) def test_tsplot_period(self, freq): - idx = 
period_range("12/31/1999", freq=freq, periods=100) + idx = period_range("12/31/1999", freq=freq, periods=10) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _, ax = mpl.pyplot.subplots() _check_plot_works(ser.plot, ax=ax) @@ -131,7 +132,7 @@ def test_tsplot_period(self, freq): "freq", ["s", "min", "h", "D", "W", "ME", "QE-DEC", "YE", "1B30Min"] ) def test_tsplot_datetime(self, freq): - idx = date_range("12/31/1999", freq=freq, periods=100) + idx = date_range("12/31/1999", freq=freq, periods=10) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _, ax = mpl.pyplot.subplots() _check_plot_works(ser.plot, ax=ax) @@ -145,10 +146,9 @@ def test_tsplot(self): color = (0.0, 0.0, 0.0, 1) assert color == ax.get_lines()[0].get_color() - def test_both_style_and_color(self): - ts = Series( - np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) - ) + @pytest.mark.parametrize("index", [None, date_range("2020-01-01", periods=10)]) + def test_both_style_and_color(self, index): + ts = Series(np.arange(10, dtype=np.float64), index=index) msg = ( "Cannot pass 'style' string with a color symbol and 'color' " "keyword argument. Please use one or the other or pass 'style' " @@ -157,46 +157,37 @@ def test_both_style_and_color(self): with pytest.raises(ValueError, match=msg): ts.plot(style="b-", color="#000099") - s = ts.reset_index(drop=True) - with pytest.raises(ValueError, match=msg): - s.plot(style="b-", color="#000099") - @pytest.mark.parametrize("freq", ["ms", "us"]) def test_high_freq(self, freq): _, ax = mpl.pyplot.subplots() - rng = date_range("1/1/2012", periods=100, freq=freq) + rng = date_range("1/1/2012", periods=10, freq=freq) ser = Series(np.random.default_rng(2).standard_normal(len(rng)), rng) _check_plot_works(ser.plot, ax=ax) def test_get_datevalue(self): - from pandas.plotting._matplotlib.converter import get_datevalue - - assert get_datevalue(None, "D") is None - assert get_datevalue(1987, "Y") == 1987 - assert get_datevalue(Period(1987, "Y"), "M") == Period("1987-12", "M").ordinal - assert get_datevalue("1/1/1987", "D") == Period("1987-1-1", "D").ordinal - - def test_ts_plot_format_coord(self): - def check_format_of_first_point(ax, expected_string): - first_line = ax.get_lines()[0] - first_x = first_line.get_xdata()[0].ordinal - first_y = first_line.get_ydata()[0] - assert expected_string == ax.format_coord(first_x, first_y) + assert conv.get_datevalue(None, "D") is None + assert conv.get_datevalue(1987, "Y") == 1987 + assert ( + conv.get_datevalue(Period(1987, "Y"), "M") == Period("1987-12", "M").ordinal + ) + assert conv.get_datevalue("1/1/1987", "D") == Period("1987-1-1", "D").ordinal - annual = Series(1, index=date_range("2014-01-01", periods=3, freq="YE-DEC")) + @pytest.mark.parametrize( + "freq, expected_string", + [["YE-DEC", "t = 2014 y = 1.000000"], ["D", "t = 2014-01-01 y = 1.000000"]], + ) + def test_ts_plot_format_coord(self, freq, expected_string): + ser = Series(1, index=date_range("2014-01-01", periods=3, freq=freq)) _, ax = mpl.pyplot.subplots() - annual.plot(ax=ax) - check_format_of_first_point(ax, "t = 2014 y = 1.000000") - - # note this is added to the annual plot already in existence, and - # changes its freq field - daily = Series(1, index=date_range("2014-01-01", periods=3, freq="D")) - daily.plot(ax=ax) - check_format_of_first_point(ax, "t = 2014-01-01 y = 1.000000") + ser.plot(ax=ax) + first_line = ax.get_lines()[0] + first_x = first_line.get_xdata()[0].ordinal + first_y = first_line.get_ydata()[0] + assert 
expected_string == ax.format_coord(first_x, first_y) @pytest.mark.parametrize("freq", ["s", "min", "h", "D", "W", "M", "Q", "Y"]) def test_line_plot_period_series(self, freq): - idx = period_range("12/31/1999", freq=freq, periods=100) + idx = period_range("12/31/1999", freq=freq, periods=10) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _check_plot_works(ser.plot, ser.index.freq) @@ -206,7 +197,7 @@ def test_line_plot_period_series(self, freq): def test_line_plot_period_mlt_series(self, frqncy): # test period index line plot for series with multiples (`mlt`) of the # frequency (`frqncy`) rule code. tests resolution of issue #14763 - idx = period_range("12/31/1999", freq=frqncy, periods=100) + idx = period_range("12/31/1999", freq=frqncy, periods=10) s = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _check_plot_works(s.plot, s.index.freq.rule_code) @@ -214,13 +205,13 @@ def test_line_plot_period_mlt_series(self, frqncy): "freq", ["s", "min", "h", "D", "W", "ME", "QE-DEC", "YE", "1B30Min"] ) def test_line_plot_datetime_series(self, freq): - idx = date_range("12/31/1999", freq=freq, periods=100) + idx = date_range("12/31/1999", freq=freq, periods=10) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _check_plot_works(ser.plot, ser.index.freq.rule_code) @pytest.mark.parametrize("freq", ["s", "min", "h", "D", "W", "ME", "QE", "YE"]) def test_line_plot_period_frame(self, freq): - idx = date_range("12/31/1999", freq=freq, periods=100) + idx = date_range("12/31/1999", freq=freq, periods=10) df = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 3)), index=idx, @@ -235,7 +226,7 @@ def test_line_plot_period_mlt_frame(self, frqncy): # test period index line plot for DataFrames with multiples (`mlt`) # of the frequency (`frqncy`) rule code. 
tests resolution of issue # #14763 - idx = period_range("12/31/1999", freq=frqncy, periods=100) + idx = period_range("12/31/1999", freq=frqncy, periods=10) df = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 3)), index=idx, @@ -249,7 +240,7 @@ def test_line_plot_period_mlt_frame(self, frqncy): "freq", ["s", "min", "h", "D", "W", "ME", "QE-DEC", "YE", "1B30Min"] ) def test_line_plot_datetime_frame(self, freq): - idx = date_range("12/31/1999", freq=freq, periods=100) + idx = date_range("12/31/1999", freq=freq, periods=10) df = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 3)), index=idx, @@ -263,7 +254,7 @@ def test_line_plot_datetime_frame(self, freq): "freq", ["s", "min", "h", "D", "W", "ME", "QE-DEC", "YE", "1B30Min"] ) def test_line_plot_inferred_freq(self, freq): - idx = date_range("12/31/1999", freq=freq, periods=100) + idx = date_range("12/31/1999", freq=freq, periods=10) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) ser = Series(ser.values, Index(np.asarray(ser.index))) _check_plot_works(ser.plot, ser.index.inferred_freq) @@ -350,8 +341,8 @@ def test_business_freq(self): def test_business_freq_convert(self): bts = Series( - np.arange(300, dtype=np.float64), - index=date_range("2020-01-01", periods=300, freq="B"), + np.arange(50, dtype=np.float64), + index=date_range("2020-01-01", periods=50, freq="B"), ).asfreq("BME") ts = bts.to_period("M") _, ax = mpl.pyplot.subplots() @@ -444,12 +435,8 @@ def test_axis_limits(self, obj): result = ax.get_xlim() assert int(result[0]) == expected[0].ordinal assert int(result[1]) == expected[1].ordinal - fig = ax.get_figure() - mpl.pyplot.close(fig) def test_get_finder(self): - import pandas.plotting._matplotlib.converter as conv - assert conv.get_finder(to_offset("B")) == conv._daily_finder assert conv.get_finder(to_offset("D")) == conv._daily_finder assert conv.get_finder(to_offset("ME")) == conv._monthly_finder @@ -552,7 +539,7 @@ def test_finder_annual(self): @pytest.mark.slow def test_finder_minutely(self): - nminutes = 50 * 24 * 60 + nminutes = 1 * 24 * 60 rng = date_range("1/1/1999", freq="Min", periods=nminutes) ser = Series(np.random.default_rng(2).standard_normal(len(rng)), rng) _, ax = mpl.pyplot.subplots() @@ -577,9 +564,9 @@ def test_finder_hourly(self): def test_gaps(self): ts = Series( - np.arange(30, dtype=np.float64), index=date_range("2020-01-01", periods=30) + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) ) - ts.iloc[5:25] = np.nan + ts.iloc[5:7] = np.nan _, ax = mpl.pyplot.subplots() ts.plot(ax=ax) lines = ax.get_lines() @@ -591,8 +578,7 @@ def test_gaps(self): assert isinstance(data, np.ma.core.MaskedArray) mask = data.mask - assert mask[5:25, 1].all() - mpl.pyplot.close(ax.get_figure()) + assert mask[5:7, 1].all() def test_gaps_irregular(self): # irregular @@ -613,7 +599,6 @@ def test_gaps_irregular(self): assert isinstance(data, np.ma.core.MaskedArray) mask = data.mask assert mask[2:5, 1].all() - mpl.pyplot.close(ax.get_figure()) def test_gaps_non_ts(self): # non-ts @@ -634,9 +619,9 @@ def test_gaps_non_ts(self): def test_gap_upsample(self): low = Series( - np.arange(30, dtype=np.float64), index=date_range("2020-01-01", periods=30) + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) ) - low.iloc[5:25] = np.nan + low.iloc[5:7] = np.nan _, ax = mpl.pyplot.subplots() low.plot(ax=ax) @@ -653,7 +638,7 @@ def test_gap_upsample(self): assert isinstance(data, np.ma.core.MaskedArray) mask = data.mask - assert mask[5:25, 
1].all() + assert mask[5:7, 1].all() def test_secondary_y(self): ser = Series(np.random.default_rng(2).standard_normal(10)) @@ -667,7 +652,6 @@ def test_secondary_y(self): tm.assert_series_equal(ser, xp) assert ax.get_yaxis().get_ticks_position() == "right" assert not axes[0].get_yaxis().get_visible() - mpl.pyplot.close(fig) def test_secondary_y_yaxis(self): Series(np.random.default_rng(2).standard_normal(10)) @@ -675,7 +659,6 @@ def test_secondary_y_yaxis(self): _, ax2 = mpl.pyplot.subplots() ser2.plot(ax=ax2) assert ax2.get_yaxis().get_ticks_position() == "left" - mpl.pyplot.close(ax2.get_figure()) def test_secondary_both(self): ser = Series(np.random.default_rng(2).standard_normal(10)) @@ -701,7 +684,6 @@ def test_secondary_y_ts(self): tm.assert_series_equal(ser, xp) assert ax.get_yaxis().get_ticks_position() == "right" assert not axes[0].get_yaxis().get_visible() - mpl.pyplot.close(fig) def test_secondary_y_ts_yaxis(self): idx = date_range("1/1/2000", periods=10) @@ -709,7 +691,6 @@ def test_secondary_y_ts_yaxis(self): _, ax2 = mpl.pyplot.subplots() ser2.plot(ax=ax2) assert ax2.get_yaxis().get_ticks_position() == "left" - mpl.pyplot.close(ax2.get_figure()) def test_secondary_y_ts_visible(self): idx = date_range("1/1/2000", periods=10) @@ -1108,8 +1089,8 @@ def test_from_resampling_area_line_mixed_high_to_low(self, kind1, kind2): def test_mixed_freq_second_millisecond(self): # GH 7772, GH 7760 - idxh = date_range("2014-07-01 09:00", freq="s", periods=50) - idxl = date_range("2014-07-01 09:00", freq="100ms", periods=500) + idxh = date_range("2014-07-01 09:00", freq="s", periods=5) + idxl = date_range("2014-07-01 09:00", freq="100ms", periods=50) high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) # high to low @@ -1122,8 +1103,8 @@ def test_mixed_freq_second_millisecond(self): def test_mixed_freq_second_millisecond_low_to_high(self): # GH 7772, GH 7760 - idxh = date_range("2014-07-01 09:00", freq="s", periods=50) - idxl = date_range("2014-07-01 09:00", freq="100ms", periods=500) + idxh = date_range("2014-07-01 09:00", freq="s", periods=5) + idxl = date_range("2014-07-01 09:00", freq="100ms", periods=50) high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) # low to high @@ -1298,7 +1279,6 @@ def test_secondary_legend(self): # TODO: color cycle problems assert len(colors) == 4 - mpl.pyplot.close(fig) def test_secondary_legend_right(self): df = DataFrame( @@ -1315,7 +1295,6 @@ def test_secondary_legend_right(self): assert leg.get_texts()[1].get_text() == "B" assert leg.get_texts()[2].get_text() == "C" assert leg.get_texts()[3].get_text() == "D" - mpl.pyplot.close(fig) def test_secondary_legend_bar(self): df = DataFrame( @@ -1328,7 +1307,6 @@ def test_secondary_legend_bar(self): leg = ax.get_legend() assert leg.get_texts()[0].get_text() == "A (right)" assert leg.get_texts()[1].get_text() == "B" - mpl.pyplot.close(fig) def test_secondary_legend_bar_right(self): df = DataFrame( @@ -1341,7 +1319,6 @@ def test_secondary_legend_bar_right(self): leg = ax.get_legend() assert leg.get_texts()[0].get_text() == "A" assert leg.get_texts()[1].get_text() == "B" - mpl.pyplot.close(fig) def test_secondary_legend_multi_col(self): df = DataFrame( @@ -1366,14 +1343,13 @@ def test_secondary_legend_multi_col(self): # TODO: color cycle problems assert len(colors) == 4 - mpl.pyplot.close(fig) def test_secondary_legend_nonts(self): # 
non-ts df = DataFrame( - 1.1 * np.arange(120).reshape((30, 4)), + 1.1 * np.arange(40).reshape((10, 4)), columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + index=Index([f"i-{i}" for i in range(10)], dtype=object), ) fig = mpl.pyplot.figure() ax = fig.add_subplot(211) @@ -1387,14 +1363,13 @@ def test_secondary_legend_nonts(self): # TODO: color cycle problems assert len(colors) == 4 - mpl.pyplot.close() def test_secondary_legend_nonts_multi_col(self): # non-ts df = DataFrame( - 1.1 * np.arange(120).reshape((30, 4)), + 1.1 * np.arange(40).reshape((10, 4)), columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + index=Index([f"i-{i}" for i in range(10)], dtype=object), ) fig = mpl.pyplot.figure() ax = fig.add_subplot(211) @@ -1448,13 +1423,10 @@ def test_mpl_nopandas(self): exp = np.array([x.toordinal() for x in dates], dtype=np.float64) tm.assert_numpy_array_equal(line1.get_xydata()[:, 0], exp) - exp = np.array([x.toordinal() for x in dates], dtype=np.float64) tm.assert_numpy_array_equal(line2.get_xydata()[:, 0], exp) def test_irregular_ts_shared_ax_xlim(self): # GH 2960 - from pandas.plotting._matplotlib.converter import DatetimeConverter - ts = Series( np.arange(20, dtype=np.float64), index=date_range("2020-01-01", periods=20) ) @@ -1467,8 +1439,8 @@ def test_irregular_ts_shared_ax_xlim(self): # check that axis limits are correct left, right = ax.get_xlim() - assert left <= DatetimeConverter.convert(ts_irregular.index.min(), "", ax) - assert right >= DatetimeConverter.convert(ts_irregular.index.max(), "", ax) + assert left <= conv.DatetimeConverter.convert(ts_irregular.index.min(), "", ax) + assert right >= conv.DatetimeConverter.convert(ts_irregular.index.max(), "", ax) def test_secondary_y_non_ts_xlim(self): # GH 3490 - non-timeseries with secondary y @@ -1504,7 +1476,7 @@ def test_secondary_y_regular_ts_xlim(self): def test_secondary_y_mixed_freq_ts_xlim(self): # GH 3490 - mixed frequency timeseries with secondary y - rng = date_range("2000-01-01", periods=10000, freq="min") + rng = date_range("2000-01-01", periods=10, freq="min") ts = Series(1, index=rng) _, ax = mpl.pyplot.subplots() @@ -1519,8 +1491,6 @@ def test_secondary_y_mixed_freq_ts_xlim(self): def test_secondary_y_irregular_ts_xlim(self): # GH 3490 - irregular-timeseries with secondary y - from pandas.plotting._matplotlib.converter import DatetimeConverter - ts = Series( np.arange(20, dtype=np.float64), index=date_range("2020-01-01", periods=20) ) @@ -1534,8 +1504,8 @@ def test_secondary_y_irregular_ts_xlim(self): ts_irregular[:5].plot(ax=ax) left, right = ax.get_xlim() - assert left <= DatetimeConverter.convert(ts_irregular.index.min(), "", ax) - assert right >= DatetimeConverter.convert(ts_irregular.index.max(), "", ax) + assert left <= conv.DatetimeConverter.convert(ts_irregular.index.min(), "", ax) + assert right >= conv.DatetimeConverter.convert(ts_irregular.index.max(), "", ax) def test_plot_outofbounds_datetime(self): # 2579 - checking this does not raise @@ -1722,35 +1692,28 @@ def test_pickle_fig(self, temp_file, frame_or_series, idx): def _check_plot_works(f, freq=None, series=None, *args, **kwargs): - import matplotlib.pyplot as plt - fig = plt.gcf() - try: - plt.clf() - ax = fig.add_subplot(211) - orig_ax = kwargs.pop("ax", plt.gca()) - orig_axfreq = getattr(orig_ax, "freq", None) - - ret = f(*args, **kwargs) - assert ret is not None # do something more intelligent - - ax = kwargs.pop("ax", plt.gca()) - if series is 
not None: - dfreq = series.index.freq - if isinstance(dfreq, BaseOffset): - dfreq = dfreq.rule_code - if orig_axfreq is None: - assert ax.freq == dfreq - - if freq is not None: - ax_freq = to_offset(ax.freq, is_period=True) - if freq is not None and orig_axfreq is None: - assert ax_freq == freq - - ax = fig.add_subplot(212) - kwargs["ax"] = ax - ret = f(*args, **kwargs) - assert ret is not None # TODO: do something more intelligent - finally: - plt.close(fig) + fig.clf() + ax = fig.add_subplot(211) + orig_ax = kwargs.pop("ax", plt.gca()) + orig_axfreq = getattr(orig_ax, "freq", None) + + ret = f(*args, **kwargs) + assert ret is not None # do something more intelligent + + ax = kwargs.pop("ax", plt.gca()) + if series is not None: + dfreq = series.index.freq + if isinstance(dfreq, BaseOffset): + dfreq = dfreq.rule_code + if orig_axfreq is None: + assert ax.freq == dfreq + + if freq is not None and orig_axfreq is None: + assert to_offset(ax.freq, is_period=True) == freq + + ax = fig.add_subplot(212) + kwargs["ax"] = ax + ret = f(*args, **kwargs) + assert ret is not None # TODO: do something more intelligent diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py index 511c1dd7761d5..65cb62917dc4e 100644 --- a/pandas/tests/plotting/test_hist_method.py +++ b/pandas/tests/plotting/test_hist_method.py @@ -27,6 +27,9 @@ ) mpl = pytest.importorskip("matplotlib") +plt = pytest.importorskip("matplotlib.pyplot") + +from pandas.plotting._matplotlib.hist import _grouped_hist @pytest.fixture @@ -119,18 +122,13 @@ def test_hist_layout_with_by_shape(self, hist_df): _check_axes_shape(axes, axes_num=4, layout=(4, 2), figsize=(12, 7)) def test_hist_no_overlap(self): - from matplotlib.pyplot import ( - gcf, - subplot, - ) - x = Series(np.random.default_rng(2).standard_normal(2)) y = Series(np.random.default_rng(2).standard_normal(2)) - subplot(121) + plt.subplot(121) x.hist() - subplot(122) + plt.subplot(122) y.hist() - fig = gcf() + fig = plt.gcf() axes = fig.axes assert len(axes) == 2 @@ -140,10 +138,8 @@ def test_hist_by_no_extra_plots(self, hist_df): assert len(mpl.pyplot.get_fignums()) == 1 def test_plot_fails_when_ax_differs_from_figure(self, ts): - from pylab import figure - - fig1 = figure() - fig2 = figure() + fig1 = plt.figure(1) + fig2 = plt.figure(2) ax1 = fig1.add_subplot(111) msg = "passed axis not bound to passed figure" with pytest.raises(AssertionError, match=msg): @@ -169,8 +165,8 @@ def test_histtype_argument(self, histtype, expected): ) def test_hist_with_legend(self, by, expected_axes_num, expected_layout): # GH 6279 - Series histogram can have a legend - index = 15 * ["1"] + 15 * ["2"] - s = Series(np.random.default_rng(2).standard_normal(30), index=index, name="a") + index = 5 * ["1"] + 5 * ["2"] + s = Series(np.random.default_rng(2).standard_normal(10), index=index, name="a") s.index.name = "b" # Use default_axes=True when plotting method generate subplots itself @@ -181,8 +177,8 @@ def test_hist_with_legend(self, by, expected_axes_num, expected_layout): @pytest.mark.parametrize("by", [None, "b"]) def test_hist_with_legend_raises(self, by): # GH 6279 - Series histogram with legend and label raises - index = 15 * ["1"] + 15 * ["2"] - s = Series(np.random.default_rng(2).standard_normal(30), index=index, name="a") + index = 5 * ["1"] + 5 * ["2"] + s = Series(np.random.default_rng(2).standard_normal(10), index=index, name="a") s.index.name = "b" with pytest.raises(ValueError, match="Cannot use both legend and label"): @@ -331,12 +327,10 @@ def 
test_hist_df_legacy_layout_labelsize_rot(self, frame_or_series): @pytest.mark.slow def test_hist_df_legacy_rectangles(self): - from matplotlib.patches import Rectangle - ser = Series(range(10)) ax = ser.hist(cumulative=True, bins=4, density=True) # height of last bin (index 5) must be 1.0 - rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] + rects = [x for x in ax.get_children() if isinstance(x, mpl.patches.Rectangle)] tm.assert_almost_equal(rects[-1].get_height(), 1.0) @pytest.mark.slow @@ -431,12 +425,12 @@ def test_hist_layout_error(self): # GH 9351 def test_tight_layout(self): - df = DataFrame(np.random.default_rng(2).standard_normal((100, 2))) + df = DataFrame(np.random.default_rng(2).standard_normal((10, 2))) df[2] = to_datetime( np.random.default_rng(2).integers( 812419200000000000, 819331200000000000, - size=100, + size=10, dtype=np.int64, ) ) @@ -504,7 +498,7 @@ def test_hist_column_order_unchanged(self, column, expected): def test_histtype_argument(self, histtype, expected): # GH23992 Verify functioning of histtype argument df = DataFrame( - np.random.default_rng(2).integers(1, 10, size=(100, 2)), columns=["a", "b"] + np.random.default_rng(2).integers(1, 10, size=(10, 2)), columns=["a", "b"] ) ax = df.hist(histtype=histtype) _check_patches_all_filled(ax, filled=expected) @@ -519,9 +513,9 @@ def test_hist_with_legend(self, by, column): if by is not None: expected_labels = [expected_labels] * 2 - index = Index(15 * ["1"] + 15 * ["2"], name="c") + index = Index(5 * ["1"] + 5 * ["2"], name="c") df = DataFrame( - np.random.default_rng(2).standard_normal((30, 2)), + np.random.default_rng(2).standard_normal((10, 2)), index=index, columns=["a", "b"], ) @@ -545,9 +539,9 @@ def test_hist_with_legend(self, by, column): @pytest.mark.parametrize("column", [None, "b"]) def test_hist_with_legend_raises(self, by, column): # GH 6279 - DataFrame histogram with legend and label raises - index = Index(15 * ["1"] + 15 * ["2"], name="c") + index = Index(5 * ["1"] + 5 * ["2"], name="c") df = DataFrame( - np.random.default_rng(2).standard_normal((30, 2)), + np.random.default_rng(2).standard_normal((10, 2)), index=index, columns=["a", "b"], ) @@ -586,7 +580,7 @@ def test_hist_df_with_nonnumerics_no_bins(self): def test_hist_secondary_legend(self): # GH 9610 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 4)), columns=list("abcd") + np.random.default_rng(2).standard_normal((10, 4)), columns=list("abcd") ) # primary -> secondary @@ -602,7 +596,7 @@ def test_hist_secondary_legend(self): def test_hist_secondary_secondary(self): # GH 9610 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 4)), columns=list("abcd") + np.random.default_rng(2).standard_normal((10, 4)), columns=list("abcd") ) # secondary -> secondary _, ax = mpl.pyplot.subplots() @@ -617,7 +611,7 @@ def test_hist_secondary_secondary(self): def test_hist_secondary_primary(self): # GH 9610 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 4)), columns=list("abcd") + np.random.default_rng(2).standard_normal((10, 4)), columns=list("abcd") ) # secondary -> primary _, ax = mpl.pyplot.subplots() @@ -632,7 +626,6 @@ def test_hist_secondary_primary(self): def test_hist_with_nans_and_weights(self): # GH 48884 - mpl_patches = pytest.importorskip("matplotlib.patches") df = DataFrame( [[np.nan, 0.2, 0.3], [0.4, np.nan, np.nan], [0.7, 0.8, 0.9]], columns=list("abc"), @@ -643,12 +636,12 @@ def test_hist_with_nans_and_weights(self): _, ax0 = mpl.pyplot.subplots() df.plot.hist(ax=ax0, weights=weights) - 
rects = [x for x in ax0.get_children() if isinstance(x, mpl_patches.Rectangle)] + rects = [x for x in ax0.get_children() if isinstance(x, mpl.patches.Rectangle)] heights = [rect.get_height() for rect in rects] _, ax1 = mpl.pyplot.subplots() no_nan_df.plot.hist(ax=ax1, weights=no_nan_weights) no_nan_rects = [ - x for x in ax1.get_children() if isinstance(x, mpl_patches.Rectangle) + x for x in ax1.get_children() if isinstance(x, mpl.patches.Rectangle) ] no_nan_heights = [rect.get_height() for rect in no_nan_rects] assert all(h0 == h1 for h0, h1 in zip(heights, no_nan_heights)) @@ -663,8 +656,6 @@ def test_hist_with_nans_and_weights(self): class TestDataFrameGroupByPlots: def test_grouped_hist_legacy(self): - from pandas.plotting._matplotlib.hist import _grouped_hist - rs = np.random.default_rng(10) df = DataFrame(rs.standard_normal((10, 1)), columns=["A"]) df["B"] = to_datetime( @@ -716,10 +707,6 @@ def test_grouped_hist_legacy_single_key(self): _check_ticks_props(axes, xrot=30) def test_grouped_hist_legacy_grouped_hist_kwargs(self): - from matplotlib.patches import Rectangle - - from pandas.plotting._matplotlib.hist import _grouped_hist - rs = np.random.default_rng(2) df = DataFrame(rs.standard_normal((10, 1)), columns=["A"]) df["B"] = to_datetime( @@ -748,14 +735,14 @@ def test_grouped_hist_legacy_grouped_hist_kwargs(self): ) # height of last bin (index 5) must be 1.0 for ax in axes.ravel(): - rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] + rects = [ + x for x in ax.get_children() if isinstance(x, mpl.patches.Rectangle) + ] height = rects[-1].get_height() tm.assert_almost_equal(height, 1.0) _check_ticks_props(axes, xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot) def test_grouped_hist_legacy_grouped_hist(self): - from pandas.plotting._matplotlib.hist import _grouped_hist - rs = np.random.default_rng(2) df = DataFrame(rs.standard_normal((10, 1)), columns=["A"]) df["B"] = to_datetime( @@ -773,8 +760,6 @@ def test_grouped_hist_legacy_grouped_hist(self): _check_ax_scales(axes, yaxis="log") def test_grouped_hist_legacy_external_err(self): - from pandas.plotting._matplotlib.hist import _grouped_hist - rs = np.random.default_rng(2) df = DataFrame(rs.standard_normal((10, 1)), columns=["A"]) df["B"] = to_datetime( diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index d593ddbbaa0b8..43e1255404784 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -31,6 +31,8 @@ plt = pytest.importorskip("matplotlib.pyplot") cm = pytest.importorskip("matplotlib.cm") +from pandas.plotting._matplotlib.style import get_standard_colors + @pytest.fixture def iris(datapath) -> DataFrame: @@ -109,8 +111,6 @@ def test_savefig(kind, data, index): class TestSeriesPlots: def test_autocorrelation_plot(self): - from pandas.plotting import autocorrelation_plot - ser = Series( np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10), @@ -118,32 +118,28 @@ def test_autocorrelation_plot(self): ) # Ensure no UserWarning when making plot with tm.assert_produces_warning(None): - _check_plot_works(autocorrelation_plot, series=ser) - _check_plot_works(autocorrelation_plot, series=ser.values) + _check_plot_works(plotting.autocorrelation_plot, series=ser) + _check_plot_works(plotting.autocorrelation_plot, series=ser.values) - ax = autocorrelation_plot(ser, label="Test") + ax = plotting.autocorrelation_plot(ser, label="Test") _check_legend_labels(ax, labels=["Test"]) @pytest.mark.parametrize("kwargs", [{}, {"lag": 5}]) def 
test_lag_plot(self, kwargs): - from pandas.plotting import lag_plot - ser = Series( np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10), name="ts", ) - _check_plot_works(lag_plot, series=ser, **kwargs) + _check_plot_works(plotting.lag_plot, series=ser, **kwargs) def test_bootstrap_plot(self): - from pandas.plotting import bootstrap_plot - ser = Series( np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10), name="ts", ) - _check_plot_works(bootstrap_plot, series=ser, size=10) + _check_plot_works(plotting.bootstrap_plot, series=ser, size=10) class TestDataFramePlots: @@ -156,7 +152,7 @@ def test_scatter_matrix_axis(self, pass_axis): if pass_axis: _, ax = mpl.pyplot.subplots(3, 3) - df = DataFrame(np.random.default_rng(2).standard_normal((100, 3))) + df = DataFrame(np.random.default_rng(2).standard_normal((10, 3))) # we are plotting multiples on a sub-plot with tm.assert_produces_warning(UserWarning, check_stacklevel=False): @@ -168,7 +164,7 @@ def test_scatter_matrix_axis(self, pass_axis): ) axes0_labels = axes[0][0].yaxis.get_majorticklabels() # GH 5662 - expected = ["-2", "0", "2"] + expected = ["-2", "-1", "0"] _check_text_labels(axes0_labels, expected) _check_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) @@ -181,7 +177,7 @@ def test_scatter_matrix_axis_smaller(self, pass_axis): if pass_axis: _, ax = mpl.pyplot.subplots(3, 3) - df = DataFrame(np.random.default_rng(11).standard_normal((100, 3))) + df = DataFrame(np.random.default_rng(11).standard_normal((10, 3))) df[0] = (df[0] - 2) / 3 # we are plotting multiples on a sub-plot @@ -193,18 +189,15 @@ def test_scatter_matrix_axis_smaller(self, pass_axis): ax=ax, ) axes0_labels = axes[0][0].yaxis.get_majorticklabels() - expected = ["-1.0", "-0.5", "0.0"] + expected = ["-1.25", "-1.0", "-0.75", "-0.5"] _check_text_labels(axes0_labels, expected) _check_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) @pytest.mark.slow def test_andrews_curves_no_warning(self, iris): - from pandas.plotting import andrews_curves - - df = iris # Ensure no UserWarning when making plot with tm.assert_produces_warning(None): - _check_plot_works(andrews_curves, frame=df, class_column="Name") + _check_plot_works(plotting.andrews_curves, frame=iris, class_column="Name") @pytest.mark.slow @pytest.mark.parametrize( @@ -229,12 +222,10 @@ def test_andrews_curves_no_warning(self, iris): ], ) def test_andrews_curves_linecolors(self, request, df, linecolors): - from pandas.plotting import andrews_curves - if isinstance(df, str): df = request.getfixturevalue(df) ax = _check_plot_works( - andrews_curves, frame=df, class_column="Name", color=linecolors + plotting.andrews_curves, frame=df, class_column="Name", color=linecolors ) _check_colors( ax.get_lines()[:10], linecolors=linecolors, mapping=df["Name"][:10] @@ -256,23 +247,19 @@ def test_andrews_curves_linecolors(self, request, df, linecolors): ], ) def test_andrews_curves_cmap(self, request, df): - from pandas.plotting import andrews_curves - if isinstance(df, str): df = request.getfixturevalue(df) cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] ax = _check_plot_works( - andrews_curves, frame=df, class_column="Name", color=cmaps + plotting.andrews_curves, frame=df, class_column="Name", color=cmaps ) _check_colors(ax.get_lines()[:10], linecolors=cmaps, mapping=df["Name"][:10]) @pytest.mark.slow def test_andrews_curves_handle(self): - from pandas.plotting import andrews_curves - colors = ["b", "g", "r"] df = DataFrame({"A": [1, 2, 
3], "B": [1, 2, 3], "C": [1, 2, 3], "Name": colors}) - ax = andrews_curves(df, "Name", color=colors) + ax = plotting.andrews_curves(df, "Name", color=colors) handles, _ = ax.get_legend_handles_labels() _check_colors(handles, linecolors=colors) @@ -282,61 +269,54 @@ def test_andrews_curves_handle(self): [("#556270", "#4ECDC4", "#C7F464"), ["dodgerblue", "aquamarine", "seagreen"]], ) def test_parallel_coordinates_colors(self, iris, color): - from pandas.plotting import parallel_coordinates - df = iris ax = _check_plot_works( - parallel_coordinates, frame=df, class_column="Name", color=color + plotting.parallel_coordinates, frame=df, class_column="Name", color=color ) _check_colors(ax.get_lines()[:10], linecolors=color, mapping=df["Name"][:10]) @pytest.mark.slow def test_parallel_coordinates_cmap(self, iris): - from matplotlib import cm - - from pandas.plotting import parallel_coordinates - df = iris ax = _check_plot_works( - parallel_coordinates, frame=df, class_column="Name", colormap=cm.jet + plotting.parallel_coordinates, + frame=df, + class_column="Name", + colormap=cm.jet, ) - cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] + cmaps = [mpl.cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] _check_colors(ax.get_lines()[:10], linecolors=cmaps, mapping=df["Name"][:10]) @pytest.mark.slow def test_parallel_coordinates_line_diff(self, iris): - from pandas.plotting import parallel_coordinates - df = iris - ax = _check_plot_works(parallel_coordinates, frame=df, class_column="Name") + ax = _check_plot_works( + plotting.parallel_coordinates, frame=df, class_column="Name" + ) nlines = len(ax.get_lines()) nxticks = len(ax.xaxis.get_ticklabels()) ax = _check_plot_works( - parallel_coordinates, frame=df, class_column="Name", axvlines=False + plotting.parallel_coordinates, frame=df, class_column="Name", axvlines=False ) assert len(ax.get_lines()) == (nlines - nxticks) @pytest.mark.slow def test_parallel_coordinates_handles(self, iris): - from pandas.plotting import parallel_coordinates - df = iris colors = ["b", "g", "r"] df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [1, 2, 3], "Name": colors}) - ax = parallel_coordinates(df, "Name", color=colors) + ax = plotting.parallel_coordinates(df, "Name", color=colors) handles, _ = ax.get_legend_handles_labels() _check_colors(handles, linecolors=colors) # not sure if this is indicative of a problem @pytest.mark.filterwarnings("ignore:Attempting to set:UserWarning") def test_parallel_coordinates_with_sorted_labels(self): - """For #15908""" - from pandas.plotting import parallel_coordinates - + # GH 15908 df = DataFrame( { "feat": list(range(30)), @@ -345,7 +325,7 @@ def test_parallel_coordinates_with_sorted_labels(self): + [1 for _ in range(10)], } ) - ax = parallel_coordinates(df, "class", sort_labels=True) + ax = plotting.parallel_coordinates(df, "class", sort_labels=True) polylines, labels = ax.get_legend_handles_labels() color_label_tuples = zip( [polyline.get_color() for polyline in polylines], labels @@ -359,45 +339,38 @@ def test_parallel_coordinates_with_sorted_labels(self): assert prev[1] < nxt[1] and prev[0] < nxt[0] def test_radviz_no_warning(self, iris): - from pandas.plotting import radviz - - df = iris # Ensure no UserWarning when making plot with tm.assert_produces_warning(None): - _check_plot_works(radviz, frame=df, class_column="Name") + _check_plot_works(plotting.radviz, frame=iris, class_column="Name") @pytest.mark.parametrize( "color", [("#556270", "#4ECDC4", "#C7F464"), ["dodgerblue", "aquamarine", 
"seagreen"]], ) def test_radviz_color(self, iris, color): - from pandas.plotting import radviz - df = iris - ax = _check_plot_works(radviz, frame=df, class_column="Name", color=color) + ax = _check_plot_works( + plotting.radviz, frame=df, class_column="Name", color=color + ) # skip Circle drawn as ticks patches = [p for p in ax.patches[:20] if p.get_label() != ""] _check_colors(patches[:10], facecolors=color, mapping=df["Name"][:10]) def test_radviz_color_cmap(self, iris): - from matplotlib import cm - - from pandas.plotting import radviz - df = iris - ax = _check_plot_works(radviz, frame=df, class_column="Name", colormap=cm.jet) - cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] + ax = _check_plot_works( + plotting.radviz, frame=df, class_column="Name", colormap=cm.jet + ) + cmaps = [mpl.cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] patches = [p for p in ax.patches[:20] if p.get_label() != ""] _check_colors(patches, facecolors=cmaps, mapping=df["Name"][:10]) def test_radviz_colors_handles(self): - from pandas.plotting import radviz - colors = [[0.0, 0.0, 1.0, 1.0], [0.0, 0.5, 1.0, 1.0], [1.0, 0.0, 0.0, 1.0]] df = DataFrame( {"A": [1, 2, 3], "B": [2, 1, 3], "C": [3, 2, 1], "Name": ["b", "g", "r"]} ) - ax = radviz(df, "Name", color=colors) + ax = plotting.radviz(df, "Name", color=colors) handles, _ = ax.get_legend_handles_labels() _check_colors(handles, facecolors=colors) @@ -471,15 +444,11 @@ def test_get_standard_colors_random_seed(self): def test_get_standard_colors_consistency(self): # GH17525 # Make sure it produces the same colors every time it's called - from pandas.plotting._matplotlib.style import get_standard_colors - color1 = get_standard_colors(1, color_type="random") color2 = get_standard_colors(1, color_type="random") assert color1 == color2 def test_get_standard_colors_default_num_colors(self): - from pandas.plotting._matplotlib.style import get_standard_colors - # Make sure the default color_types returns the specified amount color1 = get_standard_colors(1, color_type="default") color2 = get_standard_colors(9, color_type="default") @@ -509,11 +478,7 @@ def test_get_standard_colors_no_appending(self): # Make sure not to add more colors so that matplotlib can cycle # correctly. 
- from matplotlib import cm - - from pandas.plotting._matplotlib.style import get_standard_colors - - color_before = cm.gnuplot(range(5)) + color_before = mpl.cm.gnuplot(range(5)) color_after = get_standard_colors(1, color=color_before) assert len(color_after) == len(color_before) @@ -521,7 +486,7 @@ def test_get_standard_colors_no_appending(self): np.random.default_rng(2).standard_normal((48, 4)), columns=list("ABCD") ) - color_list = cm.gnuplot(np.linspace(0, 1, 16)) + color_list = mpl.cm.gnuplot(np.linspace(0, 1, 16)) p = df.A.plot.bar(figsize=(16, 7), color=color_list) assert p.patches[1].get_facecolor() == p.patches[17].get_facecolor() @@ -546,9 +511,7 @@ def test_dictionary_color(self, kind): def test_bar_plot(self): # GH38947 # Test bar plot with string and int index - from matplotlib.text import Text - - expected = [Text(0, 0, "0"), Text(1, 0, "Total")] + expected = [mpl.text.Text(0, 0, "0"), mpl.text.Text(1, 0, "Total")] df = DataFrame( { @@ -565,11 +528,12 @@ def test_bar_plot(self): def test_barh_plot_labels_mixed_integer_string(self): # GH39126 # Test barh plot with string and integer at the same column - from matplotlib.text import Text - df = DataFrame([{"word": 1, "value": 0}, {"word": "knowledge", "value": 2}]) plot_barh = df.plot.barh(x="word", legend=None) - expected_yticklabels = [Text(0, 0, "1"), Text(0, 1, "knowledge")] + expected_yticklabels = [ + mpl.text.Text(0, 0, "1"), + mpl.text.Text(0, 1, "knowledge"), + ] assert all( actual.get_text() == expected.get_text() for actual, expected in zip( @@ -649,8 +613,8 @@ def test_externally_shared_axes(self): # Create data df = DataFrame( { - "a": np.random.default_rng(2).standard_normal(1000), - "b": np.random.default_rng(2).standard_normal(1000), + "a": np.random.default_rng(2).standard_normal(10), + "b": np.random.default_rng(2).standard_normal(10), } ) @@ -707,9 +671,7 @@ def test_plot_bar_axis_units_timestamp_conversion(self): def test_bar_plt_xaxis_intervalrange(self): # GH 38969 # Ensure IntervalIndex x-axis produces a bar plot as expected - from matplotlib.text import Text - - expected = [Text(0, 0, "([0, 1],)"), Text(1, 0, "([1, 2],)")] + expected = [mpl.text.Text(0, 0, "([0, 1],)"), mpl.text.Text(1, 0, "([1, 2],)")] s = Series( [1, 2], index=[interval_range(0, 2, closed="both")], diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 54f09c7007330..279d9a18d8df7 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -33,9 +33,14 @@ get_y_axis, ) +from pandas.tseries.offsets import CustomBusinessDay + mpl = pytest.importorskip("matplotlib") plt = pytest.importorskip("matplotlib.pyplot") +from pandas.plotting._matplotlib.converter import DatetimeConverter +from pandas.plotting._matplotlib.style import get_standard_colors + @pytest.fixture def ts(): @@ -49,7 +54,7 @@ def ts(): @pytest.fixture def series(): return Series( - range(20), dtype=np.float64, name="series", index=[f"i_{i}" for i in range(20)] + range(10), dtype=np.float64, name="series", index=[f"i_{i}" for i in range(10)] ) @@ -192,28 +197,24 @@ def test_area_sharey_dont_overwrite(self, ts): assert get_y_axis(ax1).joined(ax1, ax2) assert get_y_axis(ax2).joined(ax1, ax2) - plt.close(fig) def test_label(self): s = Series([1, 2]) _, ax = mpl.pyplot.subplots() ax = s.plot(label="LABEL", legend=True, ax=ax) _check_legend_labels(ax, labels=["LABEL"]) - mpl.pyplot.close("all") def test_label_none(self): s = Series([1, 2]) _, ax = mpl.pyplot.subplots() ax = s.plot(legend=True, ax=ax) 
_check_legend_labels(ax, labels=[""]) - mpl.pyplot.close("all") def test_label_ser_name(self): s = Series([1, 2], name="NAME") _, ax = mpl.pyplot.subplots() ax = s.plot(legend=True, ax=ax) _check_legend_labels(ax, labels=["NAME"]) - mpl.pyplot.close("all") def test_label_ser_name_override(self): s = Series([1, 2], name="NAME") @@ -221,7 +222,6 @@ def test_label_ser_name_override(self): _, ax = mpl.pyplot.subplots() ax = s.plot(legend=True, label="LABEL", ax=ax) _check_legend_labels(ax, labels=["LABEL"]) - mpl.pyplot.close("all") def test_label_ser_name_override_dont_draw(self): s = Series([1, 2], name="NAME") @@ -231,7 +231,6 @@ def test_label_ser_name_override_dont_draw(self): assert ax.get_legend() is None # Hasn't been drawn ax.legend() # draw it _check_legend_labels(ax, labels=["LABEL"]) - mpl.pyplot.close("all") def test_boolean(self): # GH 23719 @@ -344,9 +343,7 @@ def test_rotation_30(self): _check_ticks_props(axes, xrot=30) def test_irregular_datetime(self): - from pandas.plotting._matplotlib.converter import DatetimeConverter - - rng = date_range("1/1/2000", "3/1/2000") + rng = date_range("1/1/2000", "1/15/2000") rng = rng[[0, 1, 2, 3, 5, 9, 10, 11, 12]] ser = Series(np.random.default_rng(2).standard_normal(len(rng)), rng) _, ax = mpl.pyplot.subplots() @@ -453,9 +450,9 @@ def test_pie_nan(self): def test_df_series_secondary_legend(self): # GH 9779 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 3)), columns=list("abc") + np.random.default_rng(2).standard_normal((10, 3)), columns=list("abc") ) - s = Series(np.random.default_rng(2).standard_normal(30), name="x") + s = Series(np.random.default_rng(2).standard_normal(10), name="x") # primary -> secondary (without passing ax) _, ax = mpl.pyplot.subplots() @@ -467,28 +464,12 @@ def test_df_series_secondary_legend(self): assert ax.get_yaxis().get_visible() assert ax.right_ax.get_yaxis().get_visible() - def test_df_series_secondary_legend_with_axes(self): - # GH 9779 - df = DataFrame( - np.random.default_rng(2).standard_normal((30, 3)), columns=list("abc") - ) - s = Series(np.random.default_rng(2).standard_normal(30), name="x") - # primary -> secondary (with passing ax) - _, ax = mpl.pyplot.subplots() - ax = df.plot(ax=ax) - s.plot(ax=ax, legend=True, secondary_y=True) - # both legends are drawn on left ax - # left and right axis must be visible - _check_legend_labels(ax, labels=["a", "b", "c", "x (right)"]) - assert ax.get_yaxis().get_visible() - assert ax.right_ax.get_yaxis().get_visible() - def test_df_series_secondary_legend_both(self): # GH 9779 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 3)), columns=list("abc") + np.random.default_rng(2).standard_normal((10, 3)), columns=list("abc") ) - s = Series(np.random.default_rng(2).standard_normal(30), name="x") + s = Series(np.random.default_rng(2).standard_normal(10), name="x") # secondary -> secondary (without passing ax) _, ax = mpl.pyplot.subplots() ax = df.plot(secondary_y=True, ax=ax) @@ -500,29 +481,12 @@ def test_df_series_secondary_legend_both(self): assert not ax.left_ax.get_yaxis().get_visible() assert ax.get_yaxis().get_visible() - def test_df_series_secondary_legend_both_with_axis(self): - # GH 9779 - df = DataFrame( - np.random.default_rng(2).standard_normal((30, 3)), columns=list("abc") - ) - s = Series(np.random.default_rng(2).standard_normal(30), name="x") - # secondary -> secondary (with passing ax) - _, ax = mpl.pyplot.subplots() - ax = df.plot(secondary_y=True, ax=ax) - s.plot(ax=ax, legend=True, secondary_y=True) - # both legends are 
drawn on left ax - # left axis must be invisible and right axis must be visible - expected = ["a (right)", "b (right)", "c (right)", "x (right)"] - _check_legend_labels(ax.left_ax, expected) - assert not ax.left_ax.get_yaxis().get_visible() - assert ax.get_yaxis().get_visible() - def test_df_series_secondary_legend_both_with_axis_2(self): # GH 9779 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 3)), columns=list("abc") + np.random.default_rng(2).standard_normal((10, 3)), columns=list("abc") ) - s = Series(np.random.default_rng(2).standard_normal(30), name="x") + s = Series(np.random.default_rng(2).standard_normal(10), name="x") # secondary -> secondary (with passing ax) _, ax = mpl.pyplot.subplots() ax = df.plot(secondary_y=True, mark_right=False, ax=ax) @@ -537,17 +501,12 @@ def test_df_series_secondary_legend_both_with_axis_2(self): @pytest.mark.parametrize( "input_logy, expected_scale", [(True, "log"), ("sym", "symlog")] ) - def test_secondary_logy(self, input_logy, expected_scale): - # GH 25545 - s1 = Series(np.random.default_rng(2).standard_normal(100)) - s2 = Series(np.random.default_rng(2).standard_normal(100)) - - # GH 24980 - ax1 = s1.plot(logy=input_logy) - ax2 = s2.plot(secondary_y=True, logy=input_logy) - + @pytest.mark.parametrize("secondary_kwarg", [{}, {"secondary_y": True}]) + def test_secondary_logy(self, input_logy, expected_scale, secondary_kwarg): + # GH 25545, GH 24980 + s1 = Series(np.random.default_rng(2).standard_normal(10)) + ax1 = s1.plot(logy=input_logy, **secondary_kwarg) assert ax1.get_yscale() == expected_scale - assert ax2.get_yscale() == expected_scale def test_plot_fails_with_dupe_color_and_style(self): x = Series(np.random.default_rng(2).standard_normal(2)) @@ -673,6 +632,9 @@ def test_errorbar_asymmetrical(self): expected = (err.T * np.array([-1, 1])) + s.to_numpy().reshape(-1, 1) tm.assert_numpy_array_equal(result, expected) + def test_errorbar_asymmetrical_error(self): + # GH9536 + s = Series(np.arange(10), name="x") msg = ( "Asymmetrical error bars should be provided " f"with the shape \\(2, {len(s)}\\)" @@ -759,8 +721,6 @@ def test_series_grid_settings(self): @pytest.mark.parametrize("c", ["r", "red", "green", "#FF0000"]) def test_standard_colors(self, c): - from pandas.plotting._matplotlib.style import get_standard_colors - result = get_standard_colors(1, color=c) assert result == [c] @@ -774,12 +734,8 @@ def test_standard_colors(self, c): assert result == [c] * 3 def test_standard_colors_all(self): - from matplotlib import colors - - from pandas.plotting._matplotlib.style import get_standard_colors - # multiple colors like mediumaquamarine - for c in colors.cnames: + for c in mpl.colors.cnames: result = get_standard_colors(num_colors=1, color=c) assert result == [c] @@ -793,7 +749,7 @@ def test_standard_colors_all(self): assert result == [c] * 3 # single letter colors like k - for c in colors.ColorConverter.colors: + for c in mpl.colors.ColorConverter.colors: result = get_standard_colors(num_colors=1, color=c) assert result == [c] @@ -821,8 +777,6 @@ def test_time_series_plot_color_kwargs(self): _check_colors(ax.get_lines(), linecolors=["green"]) def test_time_series_plot_color_with_empty_kwargs(self): - import matplotlib as mpl - def_colors = _unpack_cycler(mpl.rcParams) index = date_range("1/1/2000", periods=12) s = Series(np.arange(1, 13), index=index) @@ -851,8 +805,6 @@ def test_xtick_barPlot(self): def test_custom_business_day_freq(self): # GH7222 - from pandas.tseries.offsets import CustomBusinessDay - s = Series( range(100, 
121), index=pd.bdate_range( diff --git a/pandas/tests/plotting/test_style.py b/pandas/tests/plotting/test_style.py index 665bda15724fd..f9c89e0a7893f 100644 --- a/pandas/tests/plotting/test_style.py +++ b/pandas/tests/plotting/test_style.py @@ -2,7 +2,8 @@ from pandas import Series -pytest.importorskip("matplotlib") +mpl = pytest.importorskip("matplotlib") +plt = pytest.importorskip("matplotlib.pyplot") from pandas.plotting._matplotlib.style import get_standard_colors @@ -18,11 +19,8 @@ class TestGetStandardColors: ], ) def test_default_colors_named_from_prop_cycle(self, num_colors, expected): - import matplotlib as mpl - from matplotlib.pyplot import cycler - mpl_params = { - "axes.prop_cycle": cycler(color=["red", "green", "blue"]), + "axes.prop_cycle": plt.cycler(color=["red", "green", "blue"]), } with mpl.rc_context(rc=mpl_params): result = get_standard_colors(num_colors=num_colors) @@ -39,11 +37,8 @@ def test_default_colors_named_from_prop_cycle(self, num_colors, expected): ], ) def test_default_colors_named_from_prop_cycle_string(self, num_colors, expected): - import matplotlib as mpl - from matplotlib.pyplot import cycler - mpl_params = { - "axes.prop_cycle": cycler(color="bgry"), + "axes.prop_cycle": plt.cycler(color="bgry"), } with mpl.rc_context(rc=mpl_params): result = get_standard_colors(num_colors=num_colors) @@ -74,11 +69,8 @@ def test_default_colors_named_from_prop_cycle_string(self, num_colors, expected) ], ) def test_default_colors_named_undefined_prop_cycle(self, num_colors, expected_name): - import matplotlib as mpl - import matplotlib.colors as mcolors - with mpl.rc_context(rc={}): - expected = [mcolors.to_hex(x) for x in expected_name] + expected = [mpl.colors.to_hex(x) for x in expected_name] result = get_standard_colors(num_colors=num_colors) assert result == expected diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 3428abacd509e..f4ea6b1d3f3de 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -557,7 +557,8 @@ def test_first_last_skipna(any_real_nullable_dtype, skipna, how): method = getattr(rs, how) result = method(skipna=skipna) - gb = df.groupby(df.shape[0] * [pd.to_datetime("2020-01-31")]) + ts = pd.to_datetime("2020-01-31").as_unit("ns") + gb = df.groupby(df.shape[0] * [ts]) expected = getattr(gb, how)(skipna=skipna) expected.index.freq = "ME" tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 5ee9b65ba9ae7..7f37ca6831faa 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1300,7 +1300,8 @@ def test_resample_consistency(unit): s10 = s.reindex(index=i10, method="bfill") s10_2 = s.reindex(index=i10, method="bfill", limit=2) - rl = s.reindex_like(s10, method="bfill", limit=2) + with tm.assert_produces_warning(FutureWarning): + rl = s.reindex_like(s10, method="bfill", limit=2) r10_2 = s.resample("10Min").bfill(limit=2) r10 = s.resample("10Min").bfill() @@ -2013,46 +2014,22 @@ def test_resample_empty_series_with_tz(): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize( - "freq, freq_depr", - [ - ("2ME", "2M"), - ("2QE", "2Q"), - ("2QE-SEP", "2Q-SEP"), - ("1YE", "1Y"), - ("2YE-MAR", "2Y-MAR"), - ], -) -def test_resample_M_Q_Y_deprecated(freq, freq_depr): - # GH#9586 - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." 
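The rewrite starting here is the user-visible core of the change: resample aliases such as "M", "Q", "Y" (and the "BM"/"BQ" business variants), which pandas 2.x merely deprecated with a FutureWarning, are now rejected outright. A quick sketch of the enforced behavior, assuming pandas built from this branch:

    import pandas as pd

    s = pd.Series(range(10), index=pd.date_range("20130101", freq="D", periods=10))
    s.resample("2ME").mean()  # month-end alias, still valid

    try:
        s.resample("2M").mean()  # used to warn, now raises
    except ValueError as err:
        assert "Invalid frequency: 2M" in str(err)
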
+@pytest.mark.parametrize("freq", ["2M", "2m", "2Q", "2Q-SEP", "2q-sep", "1Y", "2Y-MAR"]) +def test_resample_M_Q_Y_raises(freq): + msg = f"Invalid frequency: {freq}" s = Series(range(10), index=date_range("20130101", freq="d", periods=10)) - expected = s.resample(freq).mean() - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - result = s.resample(freq_depr).mean() - tm.assert_series_equal(result, expected) + with pytest.raises(ValueError, match=msg): + s.resample(freq).mean() -@pytest.mark.parametrize( - "freq, freq_depr", - [ - ("2BME", "2BM"), - ("2BQE", "2BQ"), - ("2BQE-MAR", "2BQ-MAR"), - ], -) -def test_resample_BM_BQ_deprecated(freq, freq_depr): - # GH#52064 - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." +@pytest.mark.parametrize("freq", ["2BM", "1bm", "1BQ", "2BQ-MAR", "2bq=-mar"]) +def test_resample_BM_BQ_raises(freq): + msg = f"Invalid frequency: {freq}" s = Series(range(10), index=date_range("20130101", freq="d", periods=10)) - expected = s.resample(freq).mean() - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - result = s.resample(freq_depr).mean() - tm.assert_series_equal(result, expected) + with pytest.raises(ValueError, match=msg): + s.resample(freq).mean() def test_resample_ms_closed_right(unit): diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index 67db427a2cdb7..a4e27ad46c59c 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -988,30 +988,22 @@ def test_resample_t_l_deprecated(self): ser.resample("T").mean() @pytest.mark.parametrize( - "freq, freq_depr, freq_res, freq_depr_res, data", + "freq, freq_depr, freq_depr_res", [ - ("2Q", "2q", "2Y", "2y", [0.5]), - ("2M", "2m", "2Q", "2q", [1.0, 3.0]), + ("2Q", "2q", "2y"), + ("2M", "2m", "2q"), ], ) - def test_resample_lowercase_frequency_deprecated( - self, freq, freq_depr, freq_res, freq_depr_res, data - ): - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " - f"future version. Please use '{freq[1:]}' instead." - depr_msg_res = f"'{freq_depr_res[1:]}' is deprecated and will be removed in a " - f"future version. Please use '{freq_res[1:]}' instead." 
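The same enforcement carries over to the Period side, where the lowercase aliases being removed in the hunk below are no longer accepted either; a minimal sketch, again assuming this branch:

    import pandas as pd

    pd.period_range("2020-01-01", "2020-08-01", freq="2Q")  # uppercase: valid

    try:
        pd.period_range("2020-01-01", "2020-08-01", freq="2q")  # lowercase: rejected
    except ValueError as err:
        assert "Invalid frequency: 2q" in str(err)
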
- - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - rng_l = period_range("2020-01-01", "2020-08-01", freq=freq_depr) - ser = Series(np.arange(len(rng_l)), index=rng_l) - - rng = period_range("2020-01-01", "2020-08-01", freq=freq_res) - expected = Series(data=data, index=rng) + def test_resample_lowercase_frequency_raises(self, freq, freq_depr, freq_depr_res): + msg = f"Invalid frequency: {freq_depr}" + with pytest.raises(ValueError, match=msg): + period_range("2020-01-01", "2020-08-01", freq=freq_depr) - with tm.assert_produces_warning(FutureWarning, match=depr_msg_res): - result = ser.resample(freq_depr_res).mean() - tm.assert_series_equal(result, expected) + msg = f"Invalid frequency: {freq_depr_res}" + rng = period_range("2020-01-01", "2020-08-01", freq=freq) + ser = Series(np.arange(len(rng)), index=rng) + with pytest.raises(ValueError, match=msg): + ser.resample(freq_depr_res).mean() @pytest.mark.parametrize( "offset", @@ -1031,25 +1023,26 @@ def test_asfreq_invalid_period_offset(self, offset, frame_or_series): @pytest.mark.parametrize( - "freq,freq_depr", + "freq", [ - ("2M", "2ME"), - ("2Q", "2QE"), - ("2Q-FEB", "2QE-FEB"), - ("2Y", "2YE"), - ("2Y-MAR", "2YE-MAR"), - ("2M", "2me"), - ("2Q", "2qe"), - ("2Y-MAR", "2ye-mar"), + ("2ME"), + ("2QE"), + ("2QE-FEB"), + ("2YE"), + ("2YE-MAR"), + ("2me"), + ("2qe"), + ("2ye-mar"), ], ) -def test_resample_frequency_ME_QE_YE_error_message(frame_or_series, freq, freq_depr): +def test_resample_frequency_ME_QE_YE_raises(frame_or_series, freq): # GH#9586 - msg = f"for Period, please use '{freq[1:]}' instead of '{freq_depr[1:]}'" + msg = f"{freq[1:]} is not supported as period frequency" obj = frame_or_series(range(5), index=period_range("2020-01-01", periods=5)) + msg = f"Invalid frequency: {freq}" with pytest.raises(ValueError, match=msg): - obj.resample(freq_depr) + obj.resample(freq) def test_corner_cases_period(simple_period_range_series): @@ -1062,20 +1055,11 @@ def test_corner_cases_period(simple_period_range_series): assert len(result) == 0 -@pytest.mark.parametrize( - "freq_depr", - [ - "2BME", - "2CBME", - "2SME", - "2BQE-FEB", - "2BYE-MAR", - ], -) -def test_resample_frequency_invalid_freq(frame_or_series, freq_depr): +@pytest.mark.parametrize("freq", ["2BME", "2CBME", "2SME", "2BQE-FEB", "2BYE-MAR"]) +def test_resample_frequency_invalid_freq(frame_or_series, freq): # GH#9586 - msg = f"Invalid frequency: {freq_depr[1:]}" + msg = f"Invalid frequency: {freq}" obj = frame_or_series(range(5), index=period_range("2020-01-01", periods=5)) with pytest.raises(ValueError, match=msg): - obj.resample(freq_depr) + obj.resample(freq) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index a77097fd5ce61..bf1f6bd34b171 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -328,7 +328,7 @@ def test_agg_consistency(): r = df.resample("3min") - msg = r"Column\(s\) \['r1', 'r2'\] do not exist" + msg = r"Label\(s\) \['r1', 'r2'\] do not exist" with pytest.raises(KeyError, match=msg): r.agg({"r1": "mean", "r2": "sum"}) @@ -343,7 +343,7 @@ def test_agg_consistency_int_str_column_mix(): r = df.resample("3min") - msg = r"Column\(s\) \[2, 'b'\] do not exist" + msg = r"Label\(s\) \[2, 'b'\] do not exist" with pytest.raises(KeyError, match=msg): r.agg({2: "mean", "b": "sum"}) @@ -534,7 +534,7 @@ def test_agg_with_lambda(cases, agg): ], ) def test_agg_no_column(cases, agg): - msg = r"Column\(s\) \['result1', 'result2'\] do not exist" + msg = 
r"Label\(s\) \['result1', 'result2'\] do not exist" with pytest.raises(KeyError, match=msg): cases[["A", "B"]].agg(**agg) @@ -582,7 +582,7 @@ def test_agg_specificationerror_series(cases, agg): def test_agg_specificationerror_invalid_names(cases): # errors # invalid names in the agg specification - msg = r"Column\(s\) \['B'\] do not exist" + msg = r"Label\(s\) \['B'\] do not exist" with pytest.raises(KeyError, match=msg): cases[["A"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) @@ -631,7 +631,7 @@ def test_try_aggregate_non_existing_column(): df = DataFrame(data).set_index("dt") # Error as we don't have 'z' column - msg = r"Column\(s\) \['z'\] do not exist" + msg = r"Label\(s\) \['z'\] do not exist" with pytest.raises(KeyError, match=msg): df.resample("30min").agg({"x": ["mean"], "y": ["median"], "z": ["sum"]}) diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 5f5a54c4d92a3..2646106b9b97c 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -421,11 +421,13 @@ def test_groupby_resample_interpolate_with_apply_syntax_off_grid(groupy_test_df) ) volume = [50, 50, 60] - week_starting = [ - Timestamp("2018-01-07"), - Timestamp("2018-01-18 01:00:00"), - Timestamp("2018-01-14"), - ] + week_starting = pd.DatetimeIndex( + [ + Timestamp("2018-01-07"), + Timestamp("2018-01-18 01:00:00"), + Timestamp("2018-01-14"), + ] + ).as_unit("ns") expected_ind = pd.MultiIndex.from_arrays( [volume, week_starting], names=["volume", "week_starting"], diff --git a/pandas/tests/reshape/concat/test_append_common.py b/pandas/tests/reshape/concat/test_append_common.py index c831cb8293943..afafe8f6ab264 100644 --- a/pandas/tests/reshape/concat/test_append_common.py +++ b/pandas/tests/reshape/concat/test_append_common.py @@ -19,12 +19,12 @@ "float64": [1.1, np.nan, 3.3], "category": Categorical(["X", "Y", "Z"]), "object": ["a", "b", "c"], - "datetime64[ns]": [ + "datetime64[s]": [ pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02"), pd.Timestamp("2011-01-03"), ], - "datetime64[ns, US/Eastern]": [ + "datetime64[s, US/Eastern]": [ pd.Timestamp("2011-01-01", tz="US/Eastern"), pd.Timestamp("2011-01-02", tz="US/Eastern"), pd.Timestamp("2011-01-03", tz="US/Eastern"), diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index f86cc0c69d363..550b424371a95 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -5,6 +5,7 @@ from collections.abc import Iterator from datetime import datetime from decimal import Decimal +import itertools import numpy as np import pytest @@ -51,35 +52,39 @@ def test_concat_copy(self): # These are actual copies. result = concat([df, df2, df3], axis=1) - for arr in result._mgr.arrays: - assert arr.base is not None + for block in result._mgr.blocks: + assert block.values.base is not None # These are the same. result = concat([df, df2, df3], axis=1) - for arr in result._mgr.arrays: + for block in result._mgr.blocks: + arr = block.values if arr.dtype.kind == "f": - assert arr.base is df._mgr.arrays[0].base + assert arr.base is df._mgr.blocks[0].values.base elif arr.dtype.kind in ["i", "u"]: - assert arr.base is df2._mgr.arrays[0].base + assert arr.base is df2._mgr.blocks[0].values.base elif arr.dtype == object: assert arr.base is not None # Float block was consolidated. 
df4 = DataFrame(np.random.default_rng(2).standard_normal((4, 1))) result = concat([df, df2, df3, df4], axis=1) - for arr in result._mgr.arrays: + for blocks in result._mgr.blocks: + arr = blocks.values if arr.dtype.kind == "f": # this is a view on some array in either df or df4 assert any( - np.shares_memory(arr, other) - for other in df._mgr.arrays + df4._mgr.arrays + np.shares_memory(arr, block.values) + for block in itertools.chain(df._mgr.blocks, df4._mgr.blocks) ) elif arr.dtype.kind in ["i", "u"]: - assert arr.base is df2._mgr.arrays[0].base + assert arr.base is df2._mgr.blocks[0].values.base elif arr.dtype == object: # this is a view on df3 - assert any(np.shares_memory(arr, other) for other in df3._mgr.arrays) + assert any( + np.shares_memory(arr, block.values) for block in df3._mgr.blocks + ) def test_concat_with_group_keys(self): # axis=0 diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 3e046b2df72d8..89a3c3c5ed8bc 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -213,7 +213,7 @@ def test_concat_NaT_dataframes(self, tz): @pytest.mark.parametrize("tz1", [None, "UTC"]) @pytest.mark.parametrize("tz2", [None, "UTC"]) - @pytest.mark.parametrize("item", [pd.NaT, Timestamp("20150101")]) + @pytest.mark.parametrize("item", [pd.NaT, Timestamp("20150101").as_unit("ns")]) def test_concat_NaT_dataframes_all_NaT_axis_0(self, tz1, tz2, item): # GH 12396 @@ -358,7 +358,7 @@ def test_concat_tz_series_tzlocal(self): result = concat([Series(x), Series(y)], ignore_index=True) tm.assert_series_equal(result, Series(x + y)) - assert result.dtype == "datetime64[ns, tzlocal()]" + assert result.dtype == "datetime64[s, tzlocal()]" def test_concat_tz_series_with_datetimelike(self): # see gh-12620: tz and timedelta diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 5c5c06dea0008..0a5989e3c82e6 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1451,8 +1451,8 @@ def test_merge_readonly(self): ) # make each underlying block array / column array read-only - for arr in data1._mgr.arrays: - arr.flags.writeable = False + for block in data1._mgr.blocks: + block.values.flags.writeable = False data1.merge(data2) # no error diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 340c5c449aea7..d8bb4fba1e1fe 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -1,3 +1,5 @@ +from datetime import datetime + import numpy as np import pytest @@ -445,10 +447,16 @@ def test_datetime_bin(conv): Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2])), ] ) - ).astype(CategoricalDtype(ordered=True)) + ) bins = [conv(v) for v in bin_data] result = Series(cut(data, bins=bins)) + + if type(bins[0]) is datetime: + # The bins have microsecond dtype -> so does result + expected = expected.astype("interval[datetime64[us]]") + + expected = expected.astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(result, expected) @@ -461,10 +469,6 @@ def test_datetime_cut(unit, box): data = box(data) result, _ = cut(data, 3, retbins=True) - if box is list: - # We don't (yet) do inference on these, so get nanos - unit = "ns" - if unit == "s": # See https://github.com/pandas-dev/pandas/pull/56101#discussion_r1405325425 # for why we round to 8 seconds instead of 7 @@ -531,24 +535,26 @@ def test_datetime_tz_cut(bins, box): bins = box(bins) result 
= cut(ser, bins) - expected = Series( - IntervalIndex( - [ - Interval( - Timestamp("2012-12-31 23:57:07.200000", tz=tz), - Timestamp("2013-01-01 16:00:00", tz=tz), - ), - Interval( - Timestamp("2013-01-01 16:00:00", tz=tz), - Timestamp("2013-01-02 08:00:00", tz=tz), - ), - Interval( - Timestamp("2013-01-02 08:00:00", tz=tz), - Timestamp("2013-01-03 00:00:00", tz=tz), - ), - ] - ) - ).astype(CategoricalDtype(ordered=True)) + ii = IntervalIndex( + [ + Interval( + Timestamp("2012-12-31 23:57:07.200000", tz=tz), + Timestamp("2013-01-01 16:00:00", tz=tz), + ), + Interval( + Timestamp("2013-01-01 16:00:00", tz=tz), + Timestamp("2013-01-02 08:00:00", tz=tz), + ), + Interval( + Timestamp("2013-01-02 08:00:00", tz=tz), + Timestamp("2013-01-03 00:00:00", tz=tz), + ), + ] + ) + if isinstance(bins, int): + # the dtype is inferred from ser, which has nanosecond unit + ii = ii.astype("interval[datetime64[ns, US/Eastern]]") + expected = Series(ii).astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index f224a45ca3279..49200face66c5 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -533,6 +533,26 @@ def test_melt_non_scalar_var_name_raises(self): with pytest.raises(ValueError, match=r".* must be a scalar."): df.melt(id_vars=["a"], var_name=[1, 2]) + def test_melt_multiindex_columns_var_name(self): + # GH 58033 + df = DataFrame({("A", "a"): [1], ("A", "b"): [2]}) + + expected = DataFrame( + [("A", "a", 1), ("A", "b", 2)], columns=["first", "second", "value"] + ) + + tm.assert_frame_equal(df.melt(var_name=["first", "second"]), expected) + tm.assert_frame_equal(df.melt(var_name=["first"]), expected[["first", "value"]]) + + def test_melt_multiindex_columns_var_name_too_many(self): + # GH 58033 + df = DataFrame({("A", "a"): [1], ("A", "b"): [2]}) + + with pytest.raises( + ValueError, match="but the dataframe columns only have 2 levels" + ): + df.melt(var_name=["first", "second", "third"]) + class TestLreshape: def test_pairs(self): diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 97f06b0e379f4..728becc76b71f 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2058,6 +2058,60 @@ def test_pivot_string_as_func(self): ).rename_axis("A") tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("kwargs", [{"a": 2}, {"a": 2, "b": 3}, {"b": 3, "a": 2}]) + def test_pivot_table_kwargs(self, kwargs): + # GH#57884 + def f(x, a, b=3): + return x.sum() * a + b + + def g(x): + return f(x, **kwargs) + + df = DataFrame( + { + "A": ["good", "bad", "good", "bad", "good"], + "B": ["one", "two", "one", "three", "two"], + "X": [2, 5, 4, 20, 10], + } + ) + result = pivot_table( + df, index="A", columns="B", values="X", aggfunc=f, **kwargs + ) + expected = pivot_table(df, index="A", columns="B", values="X", aggfunc=g) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "kwargs", [{}, {"b": 10}, {"a": 3}, {"a": 3, "b": 10}, {"b": 10, "a": 3}] + ) + def test_pivot_table_kwargs_margin(self, data, kwargs): + # GH#57884 + def f(x, a=5, b=7): + return (x.sum() + b) * a + + def g(x): + return f(x, **kwargs) + + result = data.pivot_table( + values="D", + index=["A", "B"], + columns="C", + aggfunc=f, + margins=True, + fill_value=0, + **kwargs, + ) + + expected = data.pivot_table( + values="D", + index=["A", "B"], + columns="C", + aggfunc=g, + margins=True, + fill_value=0, + ) + + 
tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "f, f_numpy", [ @@ -2705,7 +2759,7 @@ def test_pivot_table_with_margins_and_numeric_column_names(self): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("m", [1, 10]) - def test_unstack_shares_memory(self, m): + def test_unstack_copy(self, m): # GH#56633 levels = np.arange(m) index = MultiIndex.from_product([levels] * 2) @@ -2713,6 +2767,5 @@ def test_unstack_shares_memory(self, m): df = DataFrame(values, index, np.arange(100)) df_orig = df.copy() result = df.unstack(sort=False) - assert np.shares_memory(df._values, result._values) is (m == 1) result.iloc[0, 0] = -1 tm.assert_frame_equal(df, df_orig) diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py index 53af673e0f7b0..5f769db7f8acf 100644 --- a/pandas/tests/reshape/test_qcut.py +++ b/pandas/tests/reshape/test_qcut.py @@ -271,8 +271,10 @@ def test_datetime_tz_qcut(bins): ], ], ) -def test_date_like_qcut_bins(arg, expected_bins): +def test_date_like_qcut_bins(arg, expected_bins, unit): # see gh-19891 + arg = arg.as_unit(unit) + expected_bins = expected_bins.as_unit(unit) ser = Series(arg) result, result_bins = qcut(ser, 2, retbins=True) tm.assert_index_equal(result_bins, expected_bins) diff --git a/pandas/tests/scalar/period/test_asfreq.py b/pandas/tests/scalar/period/test_asfreq.py index 1a21d234f1d50..90d4a7d0cc23b 100644 --- a/pandas/tests/scalar/period/test_asfreq.py +++ b/pandas/tests/scalar/period/test_asfreq.py @@ -59,6 +59,7 @@ def test_asfreq_corner(self): def test_conv_annual(self): # frequency conversion tests: from Annual Frequency + msg = INVALID_FREQ_ERR_MSG ival_A = Period(freq="Y", year=2007) @@ -110,18 +111,17 @@ def test_conv_annual(self): assert ival_A.asfreq("B", "E") == ival_A_to_B_end assert ival_A.asfreq("D", "s") == ival_A_to_D_start assert ival_A.asfreq("D", "E") == ival_A_to_D_end - msg = "'H' is deprecated and will be removed in a future version." - with tm.assert_produces_warning(FutureWarning, match=msg): + msg_depr = "'H' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg_depr): assert ival_A.asfreq("H", "s") == ival_A_to_H_start assert ival_A.asfreq("H", "E") == ival_A_to_H_end assert ival_A.asfreq("min", "s") == ival_A_to_T_start assert ival_A.asfreq("min", "E") == ival_A_to_T_end - msg = "Invalid frequency: T" with pytest.raises(ValueError, match=msg): assert ival_A.asfreq("T", "s") == ival_A_to_T_start assert ival_A.asfreq("T", "E") == ival_A_to_T_end - msg = "'S' is deprecated and will be removed in a future version." - with tm.assert_produces_warning(FutureWarning, match=msg): + msg_depr = "'S' is deprecated and will be removed in a future version." 
+ with tm.assert_produces_warning(FutureWarning, match=msg_depr): assert ival_A.asfreq("S", "S") == ival_A_to_S_start assert ival_A.asfreq("S", "E") == ival_A_to_S_end @@ -820,7 +820,7 @@ def test_asfreq_MS(self): assert initial.asfreq(freq="M", how="S") == Period("2013-01", "M") - msg = "MS is not supported as period frequency" + msg = INVALID_FREQ_ERR_MSG with pytest.raises(ValueError, match=msg): initial.asfreq(freq="MS", how="S") diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 2c3a0816737fc..49bd48b40e67a 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -60,7 +60,7 @@ def test_invalid_frequency_error_message(self): Period("2012-01-02", freq="WOM-1MON") def test_invalid_frequency_period_error_message(self): - msg = "for Period, please use 'M' instead of 'ME'" + msg = "Invalid frequency: ME" with pytest.raises(ValueError, match=msg): Period("2012-01-02", freq="ME") diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index e352e2601cef3..131be7a77f2e5 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -439,8 +439,10 @@ def test_nat_rfloordiv_timedelta(val, expected): @pytest.mark.parametrize( "value", [ - DatetimeIndex(["2011-01-01", "2011-01-02"], name="x"), - DatetimeIndex(["2011-01-01", "2011-01-02"], tz="US/Eastern", name="x"), + DatetimeIndex(["2011-01-01", "2011-01-02"], dtype="M8[ns]", name="x"), + DatetimeIndex( + ["2011-01-01", "2011-01-02"], dtype="M8[ns, US/Eastern]", name="x" + ), DatetimeArray._from_sequence(["2011-01-01", "2011-01-02"], dtype="M8[ns]"), DatetimeArray._from_sequence( ["2011-01-01", "2011-01-02"], dtype=DatetimeTZDtype(tz="US/Pacific") diff --git a/pandas/tests/scalar/timestamp/methods/test_replace.py b/pandas/tests/scalar/timestamp/methods/test_replace.py index 62f9ecc9ccf2c..5d511947ffdbf 100644 --- a/pandas/tests/scalar/timestamp/methods/test_replace.py +++ b/pandas/tests/scalar/timestamp/methods/test_replace.py @@ -11,6 +11,7 @@ conversion, ) from pandas._libs.tslibs.dtypes import NpyDatetimeUnit +from pandas.compat import WASM import pandas.util._test_decorators as td import pandas._testing as tm @@ -99,6 +100,7 @@ def test_replace_integer_args(self, tz_aware_fixture): with pytest.raises(ValueError, match=msg): ts.replace(hour=0.1) + @pytest.mark.skipif(WASM, reason="tzset is not available on WASM") def test_replace_tzinfo_equiv_tz_localize_none(self): # GH#14621, GH#7825 # assert conversion to naive is the same as replacing tzinfo with None @@ -106,6 +108,7 @@ def test_replace_tzinfo_equiv_tz_localize_none(self): assert ts.tz_localize(None) == ts.replace(tzinfo=None) @td.skip_if_windows + @pytest.mark.skipif(WASM, reason="tzset is not available on WASM") def test_replace_tzinfo(self): # GH#15683 dt = datetime(2016, 3, 27, 1) diff --git a/pandas/tests/scalar/timestamp/methods/test_timestamp_method.py b/pandas/tests/scalar/timestamp/methods/test_timestamp_method.py index 67985bd4ba566..b576317fca8b4 100644 --- a/pandas/tests/scalar/timestamp/methods/test_timestamp_method.py +++ b/pandas/tests/scalar/timestamp/methods/test_timestamp_method.py @@ -1,9 +1,11 @@ # NB: This is for the Timestamp.timestamp *method* specifically, not # the Timestamp class in general. 
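One recurring addition across these timestamp tests is a skip for WebAssembly builds, where `tzset` (and therefore `tm.set_timezone`) is unavailable; the pattern, shown with a hypothetical test name:

    import pytest

    from pandas.compat import WASM

    @pytest.mark.skipif(WASM, reason="tzset is not available on WASM")
    def test_depends_on_tzset():  # hypothetical example test
        ...
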
+import pytest from pytz import utc from pandas._libs.tslibs import Timestamp +from pandas.compat import WASM import pandas.util._test_decorators as td import pandas._testing as tm @@ -11,6 +13,7 @@ class TestTimestampMethod: @td.skip_if_windows + @pytest.mark.skipif(WASM, reason="tzset is not available on WASM") def test_timestamp(self, fixed_now_ts): # GH#17329 # tz-naive --> treat it as if it were UTC for purposes of timestamp() diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py index e1299c272e5cc..44db1187850c9 100644 --- a/pandas/tests/scalar/timestamp/test_formats.py +++ b/pandas/tests/scalar/timestamp/test_formats.py @@ -5,6 +5,8 @@ import pytest import pytz # a test below uses pytz but only inside a `eval` call +from pandas.compat import WASM + from pandas import Timestamp ts_no_ns = Timestamp( @@ -95,6 +97,7 @@ class TestTimestampRendering: @pytest.mark.parametrize( "date", ["2014-03-07", "2014-01-01 09:00", "2014-01-01 00:00:00.000000001"] ) + @pytest.mark.skipif(WASM, reason="tzset is not available on WASM") def test_repr(self, date, freq, tz): # avoid to match with timezone name freq_repr = f"'{freq}'" diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 8c60f7beb317d..49ae0a60e6608 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -256,9 +256,8 @@ def test_dt_accessor_limited_display_api(self): tm.assert_almost_equal(results, sorted(set(ok_for_dt + ok_for_dt_methods))) # Period - idx = period_range("20130101", periods=5, freq="D", name="xxx").astype(object) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - ser = Series(idx) + idx = period_range("20130101", periods=5, freq="D", name="xxx") + ser = Series(idx) results = get_dir(ser) tm.assert_almost_equal( results, sorted(set(ok_for_period + ok_for_period_methods)) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index b94e6b6f0c6c8..69fba8925784e 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -461,9 +461,9 @@ def test_dt64tz_setitem_does_not_mutate_dti(self): ser = Series(dti) assert ser._values is not dti assert ser._values._ndarray.base is dti._data._ndarray.base - assert ser._mgr.arrays[0]._ndarray.base is dti._data._ndarray.base + assert ser._mgr.blocks[0].values._ndarray.base is dti._data._ndarray.base - assert ser._mgr.arrays[0] is not dti + assert ser._mgr.blocks[0].values is not dti ser[::3] = NaT assert ser[0] is NaT diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py index 0f2f533c8feff..293919173c2d5 100644 --- a/pandas/tests/series/methods/test_combine_first.py +++ b/pandas/tests/series/methods/test_combine_first.py @@ -78,7 +78,7 @@ def test_combine_first_dt64(self, unit): s1 = Series([np.nan, "2011"]) rs = s0.combine_first(s1) - xp = Series([datetime(2010, 1, 1), "2011"], dtype="datetime64[ns]") + xp = Series([datetime(2010, 1, 1), "2011"], dtype=f"datetime64[{unit}]") tm.assert_series_equal(rs, xp) diff --git a/pandas/tests/series/methods/test_equals.py b/pandas/tests/series/methods/test_equals.py index 875ffdd3fe851..b94723b7cbddf 100644 --- a/pandas/tests/series/methods/test_equals.py +++ b/pandas/tests/series/methods/test_equals.py @@ -82,15 +82,13 @@ def test_equals_matching_nas(): left = 
Series([np.datetime64("NaT")], dtype=object) right = Series([np.datetime64("NaT")], dtype=object) assert left.equals(right) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - assert Index(left).equals(Index(right)) + assert Index(left).equals(Index(right)) assert left.array.equals(right.array) left = Series([np.timedelta64("NaT")], dtype=object) right = Series([np.timedelta64("NaT")], dtype=object) assert left.equals(right) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - assert Index(left).equals(Index(right)) + assert Index(left).equals(Index(right)) assert left.array.equals(right.array) left = Series([np.float64("NaN")], dtype=object) diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index 592dba253532d..c10bb8278a3d1 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -411,7 +411,7 @@ def test_datetime64_tz_fillna(self, tz, unit): Timestamp("2011-01-02 10:00", tz=tz), Timestamp("2011-01-03 10:00"), Timestamp("2011-01-02 10:00", tz=tz), - ] + ], ) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) diff --git a/pandas/tests/series/methods/test_get_numeric_data.py b/pandas/tests/series/methods/test_get_numeric_data.py index f25583904377a..4a11d7905f506 100644 --- a/pandas/tests/series/methods/test_get_numeric_data.py +++ b/pandas/tests/series/methods/test_get_numeric_data.py @@ -1,5 +1,4 @@ from pandas import ( - Index, Series, date_range, ) @@ -19,7 +18,7 @@ def test_get_numeric_data_preserve_dtype(self): obj = Series([1, "2", 3.0]) result = obj._get_numeric_data() - expected = Series([], dtype=object, index=Index([], dtype=object)) + expected = Series([], dtype=object) tm.assert_series_equal(result, expected) obj = Series([True, False, True]) @@ -28,5 +27,5 @@ def test_get_numeric_data_preserve_dtype(self): obj = Series(date_range("20130101", periods=3)) result = obj._get_numeric_data() - expected = Series([], dtype="M8[ns]", index=Index([], dtype=object)) + expected = Series([], dtype="M8[ns]") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_reindex_like.py b/pandas/tests/series/methods/test_reindex_like.py index 7f24c778feb1b..10b8ac5817636 100644 --- a/pandas/tests/series/methods/test_reindex_like.py +++ b/pandas/tests/series/methods/test_reindex_like.py @@ -20,7 +20,8 @@ def test_reindex_like(datetime_series): series1 = Series([5, None, None], [day1, day2, day3]) series2 = Series([None, None], [day1, day3]) - result = series1.reindex_like(series2, method="pad") + with tm.assert_produces_warning(FutureWarning): + result = series1.reindex_like(series2, method="pad") expected = Series([5, np.nan], index=[day1, day3]) tm.assert_series_equal(result, expected) @@ -32,10 +33,13 @@ def test_reindex_like_nearest(): other = ser.reindex(target, method="nearest") expected = Series(np.around(target).astype("int64"), target) - result = ser.reindex_like(other, method="nearest") + with tm.assert_produces_warning(FutureWarning): + result = ser.reindex_like(other, method="nearest") tm.assert_series_equal(expected, result) - result = ser.reindex_like(other, method="nearest", tolerance=1) + with tm.assert_produces_warning(FutureWarning): + result = ser.reindex_like(other, method="nearest", tolerance=1) tm.assert_series_equal(expected, result) - result = ser.reindex_like(other, method="nearest", tolerance=[1, 2, 3, 4]) + with tm.assert_produces_warning(FutureWarning): + 
result = ser.reindex_like(other, method="nearest", tolerance=[1, 2, 3, 4]) tm.assert_series_equal(expected, result) diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py index f7dec02ab0e5b..488d0cb9fe9da 100644 --- a/pandas/tests/series/methods/test_to_csv.py +++ b/pandas/tests/series/methods/test_to_csv.py @@ -31,7 +31,9 @@ def test_from_csv(self, datetime_series, string_series, temp_file): path = temp_file datetime_series.to_csv(path, header=False) ts = self.read_csv(path, parse_dates=True) - tm.assert_series_equal(datetime_series, ts, check_names=False) + expected = datetime_series.copy() + expected.index = expected.index.as_unit("s") + tm.assert_series_equal(expected, ts, check_names=False) assert ts.name is None assert ts.index.name is None @@ -57,6 +59,7 @@ def test_from_csv(self, datetime_series, string_series, temp_file): series = self.read_csv(path, sep="|", parse_dates=True) check_series = Series({datetime(1998, 1, 1): 1.0, datetime(1999, 1, 1): 2.0}) + check_series.index = check_series.index.as_unit("s") tm.assert_series_equal(check_series, series) series = self.read_csv(path, sep="|", parse_dates=False) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 3f9d5bbe806bb..44a7862c21273 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -752,7 +752,7 @@ def test_constructor_pass_nan_nat(self): tm.assert_series_equal(Series(np.array([np.nan, np.nan])), exp) exp = Series([NaT, NaT]) - assert exp.dtype == "datetime64[ns]" + assert exp.dtype == "datetime64[s]" tm.assert_series_equal(Series([NaT, NaT]), exp) tm.assert_series_equal(Series(np.array([NaT, NaT])), exp) @@ -934,7 +934,7 @@ def test_constructor_datetimes_with_nulls(self): np.array([None, None, datetime.now(), None]), ]: result = Series(arr) - assert result.dtype == "M8[ns]" + assert result.dtype == "M8[us]" def test_constructor_dtype_datetime64(self): s = Series(iNaT, dtype="M8[ns]", index=range(5)) @@ -962,15 +962,15 @@ def test_constructor_dtype_datetime64_10(self): dates = [np.datetime64(x) for x in pydates] ser = Series(dates) - assert ser.dtype == "M8[ns]" + assert ser.dtype == "M8[us]" ser.iloc[0] = np.nan - assert ser.dtype == "M8[ns]" + assert ser.dtype == "M8[us]" # GH3414 related expected = Series(pydates, dtype="datetime64[ms]") - result = Series(Series(dates).astype(np.int64) / 1000000, dtype="M8[ms]") + result = Series(Series(dates).astype(np.int64) / 1000, dtype="M8[ms]") tm.assert_series_equal(result, expected) result = Series(dates, dtype="datetime64[ms]") @@ -1084,16 +1084,16 @@ def test_constructor_dtype_datetime64_4(self): def test_constructor_dtype_datetime64_3(self): # if we passed a NaT it remains ser = Series([datetime(2010, 1, 1), datetime(2, 1, 1), NaT]) - assert ser.dtype == "object" + assert ser.dtype == "M8[us]" assert ser[2] is NaT assert "NaT" in str(ser) def test_constructor_dtype_datetime64_2(self): # if we passed a nan it remains ser = Series([datetime(2010, 1, 1), datetime(2, 1, 1), np.nan]) - assert ser.dtype == "object" - assert ser[2] is np.nan - assert "NaN" in str(ser) + assert ser.dtype == "M8[us]" + assert ser[2] is NaT + assert "NaT" in str(ser) def test_constructor_with_datetime_tz(self): # 8260 @@ -1155,7 +1155,7 @@ def test_constructor_with_datetime_tz4(self): Timestamp("2013-01-02 14:00:00-0800", tz="US/Pacific"), ] ) - assert ser.dtype == "datetime64[ns, US/Pacific]" + assert ser.dtype == "datetime64[s, US/Pacific]" assert 
lib.infer_dtype(ser, skipna=True) == "datetime64" def test_constructor_with_datetime_tz3(self): @@ -1215,7 +1215,7 @@ def test_construction_to_datetimelike_unit(self, arr_dtype, kind, unit): def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg): # GH 17415: With naive string result = Series([arg], dtype="datetime64[ns, CET]") - expected = Series(Timestamp(arg)).dt.tz_localize("CET") + expected = Series([Timestamp(arg)], dtype="M8[ns]").dt.tz_localize("CET") tm.assert_series_equal(result, expected) def test_constructor_datetime64_bigendian(self): @@ -1318,9 +1318,8 @@ def test_constructor_periodindex(self): pi = period_range("20130101", periods=5, freq="D") s = Series(pi) assert s.dtype == "Period[D]" - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - expected = Series(pi.astype(object)) - tm.assert_series_equal(s, expected) + expected = Series(pi.astype(object)) + assert expected.dtype == object def test_constructor_dict(self): d = {"a": 0.0, "b": 1.0, "c": 2.0} @@ -1356,14 +1355,8 @@ def test_constructor_dict_order(self): expected = Series([1, 0, 2], index=list("bac")) tm.assert_series_equal(result, expected) - def test_constructor_dict_extension(self, ea_scalar_and_dtype, request): + def test_constructor_dict_extension(self, ea_scalar_and_dtype): ea_scalar, ea_dtype = ea_scalar_and_dtype - if isinstance(ea_scalar, Timestamp): - mark = pytest.mark.xfail( - reason="Construction from dict goes through " - "maybe_convert_objects which casts to nano" - ) - request.applymarker(mark) d = {"a": ea_scalar} result = Series(d, index=["a"]) expected = Series(ea_scalar, index=["a"], dtype=ea_dtype) @@ -1408,7 +1401,9 @@ def create_data(constructor): result_Timestamp = Series(data_Timestamp) tm.assert_series_equal(result_datetime64, expected) - tm.assert_series_equal(result_datetime, expected) + tm.assert_series_equal( + result_datetime, expected.set_axis(expected.index.as_unit("us")) + ) tm.assert_series_equal(result_Timestamp, expected) def test_constructor_dict_tuple_indexer(self): @@ -2141,20 +2136,14 @@ def test_series_string_inference_na_first(self): result = Series([pd.NA, "b"]) tm.assert_series_equal(result, expected) - def test_inference_on_pandas_objects(self): + @pytest.mark.parametrize("klass", [Series, Index]) + def test_inference_on_pandas_objects(self, klass): # GH#56012 - ser = Series([Timestamp("2019-12-31")], dtype=object) - with tm.assert_produces_warning(None): - # This doesn't do inference - result = Series(ser) + obj = klass([Timestamp("2019-12-31")], dtype=object) + # This doesn't do inference + result = Series(obj) assert result.dtype == np.object_ - idx = Index([Timestamp("2019-12-31")], dtype=object) - - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - result = Series(idx) - assert result.dtype != np.object_ - class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self): diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py index ff8c6a98e1819..2511474e03ff7 100644 --- a/pandas/tests/strings/test_api.py +++ b/pandas/tests/strings/test_api.py @@ -1,3 +1,5 @@ +import weakref + import numpy as np import pytest @@ -68,6 +70,15 @@ def test_api(any_string_dtype): assert isinstance(Series([""], dtype=any_string_dtype).str, StringMethods) +def test_no_circular_reference(any_string_dtype): + # GH 47667 + ser = Series([""], dtype=any_string_dtype) + ref = weakref.ref(ser) + ser.str # Used to cache and cause circular reference + del ser + assert 
ref() is None + + def test_api_mi_raises(): # GH 23679 mi = MultiIndex.from_arrays([["a", "b", "c"]]) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 1b5d33fc10595..134ebededd163 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -990,21 +990,18 @@ def test_isin_datetimelike_all_nat(self, dtype): tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize("dtype", ["m8[ns]", "M8[ns]", "M8[ns, UTC]"]) - def test_isin_datetimelike_strings_deprecated(self, dtype): + def test_isin_datetimelike_strings_returns_false(self, dtype): # GH#53111 dta = date_range("2013-01-01", periods=3)._values arr = Series(dta.view("i8")).array.view(dtype) vals = [str(x) for x in arr] - msg = "The behavior of 'isin' with dtype=.* is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = algos.isin(arr, vals) - assert res.all() + res = algos.isin(arr, vals) + assert not res.any() vals2 = np.array(vals, dtype=str) - with tm.assert_produces_warning(FutureWarning, match=msg): - res2 = algos.isin(arr, vals2) - assert res2.all() + res2 = algos.isin(arr, vals2) + assert not res2.any() def test_isin_dt64tz_with_nat(self): # the all-NaT values used to get inferred to tznaive, which was evaluated @@ -1267,6 +1264,7 @@ def test_value_counts_datetime_outofbounds(self, dtype): ], dtype=dtype, ) + res = ser.value_counts() exp_index = Index( diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 97e0fa93c90ef..8f661edf0f241 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -121,8 +121,7 @@ def test_groupby_multilevel(self, multiindex_year_month_day_dataframe_random_dat expected = ymd.groupby([k1, k2]).mean() - # TODO groupby with level_values drops names - tm.assert_frame_equal(result, expected, check_names=False) + tm.assert_frame_equal(result, expected) assert result.index.names == ymd.index.names[:2] result2 = ymd.groupby(level=ymd.index.names[:2]).mean() diff --git a/pandas/tests/test_register_accessor.py b/pandas/tests/test_register_accessor.py index 4e569dc40005d..9deff56139394 100644 --- a/pandas/tests/test_register_accessor.py +++ b/pandas/tests/test_register_accessor.py @@ -1,5 +1,6 @@ from collections.abc import Generator import contextlib +import weakref import pytest @@ -101,3 +102,22 @@ def __init__(self, data) -> None: with pytest.raises(AttributeError, match="whoops"): pd.Series([], dtype=object).bad + + +@pytest.mark.parametrize( + "klass, registrar", + [ + (pd.Series, pd.api.extensions.register_series_accessor), + (pd.DataFrame, pd.api.extensions.register_dataframe_accessor), + (pd.Index, pd.api.extensions.register_index_accessor), + ], +) +def test_no_circular_reference(klass, registrar): + # GH 41357 + with ensure_removed(klass, "access"): + registrar("access")(MyAccessor) + obj = klass([0]) + ref = weakref.ref(obj) + assert obj.access.obj is obj + del obj + assert ref() is None diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 132608d7df115..56de3f7f39175 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -104,7 +104,9 @@ def test_int64_overflow_groupby_large_df_shuffled(self, agg): gr = df.groupby(list("abcde")) # verify this is testing what it is supposed to test! 
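Stepping back to the two weakref tests added just above (GH 47667 for `.str`, GH 41357 for registered accessors): both encode the same guarantee, that touching an accessor must not create a reference cycle that keeps the parent object alive. A minimal sketch of the check, assuming this branch:

    import weakref

    import pandas as pd

    ser = pd.Series([""], dtype=object)
    ref = weakref.ref(ser)
    ser.str  # accessing the accessor used to cache a self-referencing cycle
    del ser
    assert ref() is None  # the Series was freed immediately, no cycle left
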
- assert is_int64_overflow_possible(gr._grouper.shape) + assert is_int64_overflow_possible( + tuple(ping.ngroups for ping in gr._grouper.groupings) + ) mi = MultiIndex.from_arrays( [ar.ravel() for ar in np.array_split(np.unique(arr, axis=0), 5, axis=1)], diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index f4042acd05dc3..cbbd018720bad 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -21,6 +21,7 @@ iNaT, parsing, ) +from pandas.compat import WASM from pandas.errors import ( OutOfBoundsDatetime, OutOfBoundsTimedelta, @@ -116,7 +117,9 @@ def test_to_datetime_format_YYYYMMDD_with_nat(self, cache): ser = Series([19801222, 19801222] + [19810105] * 5, dtype="float") # with NaT expected = Series( - [Timestamp("19801222"), Timestamp("19801222")] + [Timestamp("19810105")] * 5 + [Timestamp("19801222"), Timestamp("19801222")] + + [Timestamp("19810105")] * 5, + dtype="M8[s]", ) expected[2] = np.nan ser[2] = np.nan @@ -142,19 +145,32 @@ def test_to_datetime_format_YYYYMM_with_nat(self, cache): # Explicit cast to float to explicit cast when setting np.nan ser = Series([198012, 198012] + [198101] * 5, dtype="float") expected = Series( - [Timestamp("19801201"), Timestamp("19801201")] + [Timestamp("19810101")] * 5 + [Timestamp("19801201"), Timestamp("19801201")] + + [Timestamp("19810101")] * 5, + dtype="M8[s]", ) expected[2] = np.nan ser[2] = np.nan result = to_datetime(ser, format="%Y%m", cache=cache) tm.assert_series_equal(result, expected) + def test_to_datetime_format_YYYYMMDD_oob_for_ns(self, cache): + # coercion + # GH 7930, GH 14487 + ser = Series([20121231, 20141231, 99991231]) + result = to_datetime(ser, format="%Y%m%d", errors="raise", cache=cache) + expected = Series( + np.array(["2012-12-31", "2014-12-31", "9999-12-31"], dtype="M8[s]"), + dtype="M8[s]", + ) + tm.assert_series_equal(result, expected) + def test_to_datetime_format_YYYYMMDD_coercion(self, cache): # coercion # GH 7930 - ser = Series([20121231, 20141231, 99991231]) + ser = Series([20121231, 20141231, 999999999999999999999999999991231]) result = to_datetime(ser, format="%Y%m%d", errors="coerce", cache=cache) - expected = Series(["20121231", "20141231", "NaT"], dtype="M8[ns]") + expected = Series(["20121231", "20141231", "NaT"], dtype="M8[s]") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -531,7 +547,8 @@ def test_to_datetime_overflow(self): res = to_datetime(arg, errors="coerce") assert res is NaT res = to_datetime([arg], errors="coerce") - tm.assert_index_equal(res, Index([NaT])) + exp = Index([NaT], dtype="M8[s]") + tm.assert_index_equal(res, exp) def test_to_datetime_mixed_datetime_and_string(self): # GH#47018 adapted old doctest with new behavior @@ -562,7 +579,7 @@ def test_to_datetime_mixed_date_and_string(self, format): # https://github.com/pandas-dev/pandas/issues/50108 d1 = date(2020, 1, 2) res = to_datetime(["2020-01-01", d1], format=format) - expected = DatetimeIndex(["2020-01-01", "2020-01-02"], dtype="M8[ns]") + expected = DatetimeIndex(["2020-01-01", "2020-01-02"], dtype="M8[s]") tm.assert_index_equal(res, expected) @pytest.mark.parametrize( @@ -578,7 +595,7 @@ def test_to_datetime_mixed_date_and_string(self, format): ["2000-01-01 01:00:00-08:00", "2000-01-01 02:00:00-08:00"], DatetimeIndex( ["2000-01-01 09:00:00+00:00", "2000-01-01 10:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", ), id="all tz-aware, with utc", ), @@ -587,7 +604,7 @@ def 
test_to_datetime_mixed_date_and_string(self, format): ["2000-01-01 01:00:00+00:00", "2000-01-01 02:00:00+00:00"], DatetimeIndex( ["2000-01-01 01:00:00+00:00", "2000-01-01 02:00:00+00:00"], - ), + ).as_unit("us"), id="all tz-aware, without utc", ), pytest.param( @@ -595,7 +612,7 @@ def test_to_datetime_mixed_date_and_string(self, format): ["2000-01-01 01:00:00-08:00", "2000-01-01 02:00:00+00:00"], DatetimeIndex( ["2000-01-01 09:00:00+00:00", "2000-01-01 02:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", ), id="all tz-aware, mixed offsets, with utc", ), @@ -604,7 +621,7 @@ def test_to_datetime_mixed_date_and_string(self, format): ["2000-01-01 01:00:00", "2000-01-01 02:00:00+00:00"], DatetimeIndex( ["2000-01-01 01:00:00+00:00", "2000-01-01 02:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", ), id="tz-aware string, naive pydatetime, with utc", ), @@ -624,6 +641,8 @@ def test_to_datetime_mixed_datetime_and_string_with_format( ts1 = constructor(args[0]) ts2 = args[1] result = to_datetime([ts1, ts2], format=fmt, utc=utc) + if constructor is Timestamp: + expected = expected.as_unit("s") tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -695,7 +714,7 @@ def test_to_datetime_mixed_offsets_with_none_tz_utc_false_removed( "%Y-%m-%d %H:%M:%S%z", DatetimeIndex( ["2000-01-01 08:00:00+00:00", "2000-01-02 00:00:00+00:00", "NaT"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[s, UTC]", ), id="ISO8601, UTC", ), @@ -703,7 +722,7 @@ def test_to_datetime_mixed_offsets_with_none_tz_utc_false_removed( "%Y-%d-%m %H:%M:%S%z", DatetimeIndex( ["2000-01-01 08:00:00+00:00", "2000-02-01 00:00:00+00:00", "NaT"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[s, UTC]", ), id="non-ISO8601, UTC", ), @@ -959,11 +978,12 @@ def test_to_datetime_YYYYMMDD(self): assert actual == datetime(2008, 1, 15) @td.skip_if_windows # `tm.set_timezone` does not work in windows + @pytest.mark.skipif(WASM, reason="tzset is not available on WASM") def test_to_datetime_now(self): # See GH#18666 with tm.set_timezone("US/Eastern"): # GH#18705 - now = Timestamp("now").as_unit("ns") + now = Timestamp("now") pdnow = to_datetime("now") pdnow2 = to_datetime(["now"])[0] @@ -975,7 +995,8 @@ def test_to_datetime_now(self): assert pdnow.tzinfo is None assert pdnow2.tzinfo is None - @td.skip_if_windows # `tm.set_timezone` does not work in windows + @td.skip_if_windows # `tm.set_timezone` does not work on Windows + @pytest.mark.skipif(WASM, reason="tzset is not available on WASM") @pytest.mark.parametrize("tz", ["Pacific/Auckland", "US/Samoa"]) def test_to_datetime_today(self, tz): # See GH#18666 @@ -985,12 +1006,12 @@ def test_to_datetime_today(self, tz): # this both of these timezones _and_ UTC will all be in the same day, # so this test will not detect the regression introduced in #18666. 
with tm.set_timezone(tz): - nptoday = np.datetime64("today").astype("datetime64[ns]").astype(np.int64) + nptoday = np.datetime64("today").astype("datetime64[us]").astype(np.int64) pdtoday = to_datetime("today") pdtoday2 = to_datetime(["today"])[0] - tstoday = Timestamp("today").as_unit("ns") - tstoday2 = Timestamp.today().as_unit("ns") + tstoday = Timestamp("today") + tstoday2 = Timestamp.today() # These should all be equal with infinite perf; this gives # a generous margin of 10 seconds @@ -1007,6 +1028,7 @@ def test_to_datetime_today_now_unicode_bytes(self, arg): to_datetime([arg]) @pytest.mark.filterwarnings("ignore:Timestamp.utcnow is deprecated:FutureWarning") + @pytest.mark.skipif(WASM, reason="tzset is not available on WASM") @pytest.mark.parametrize( "format, expected_ds", [ @@ -1026,7 +1048,7 @@ def test_to_datetime_now_with_format(self, format, expected_ds, string, attribut # https://github.com/pandas-dev/pandas/issues/50359 result = to_datetime(["2020-01-03 00:00:00Z", string], format=format, utc=True) expected = DatetimeIndex( - [expected_ds, getattr(Timestamp, attribute)()], dtype="datetime64[ns, UTC]" + [expected_ds, getattr(Timestamp, attribute)()], dtype="datetime64[s, UTC]" ) assert (expected - result).max().total_seconds() < 1 @@ -1087,11 +1109,7 @@ def test_to_datetime_array_of_dt64s(self, cache, unit): # Assuming all datetimes are in bounds, to_datetime() returns # an array that is equal to Timestamp() parsing result = to_datetime(dts, cache=cache) - if cache: - # FIXME: behavior should not depend on cache - expected = DatetimeIndex([Timestamp(x).asm8 for x in dts], dtype="M8[s]") - else: - expected = DatetimeIndex([Timestamp(x).asm8 for x in dts], dtype="M8[ns]") + expected = DatetimeIndex([Timestamp(x).asm8 for x in dts], dtype="M8[s]") tm.assert_index_equal(result, expected) @@ -1102,14 +1120,7 @@ def test_to_datetime_array_of_dt64s(self, cache, unit): to_datetime(dts_with_oob, errors="raise") result = to_datetime(dts_with_oob, errors="coerce", cache=cache) - if not cache: - # FIXME: shouldn't depend on cache! 
- expected = DatetimeIndex( - [Timestamp(dts_with_oob[0]).asm8, Timestamp(dts_with_oob[1]).asm8] * 30 - + [NaT], - ) - else: - expected = DatetimeIndex(np.array(dts_with_oob, dtype="M8[s]")) + expected = DatetimeIndex(np.array(dts_with_oob, dtype="M8[s]")) tm.assert_index_equal(result, expected) def test_to_datetime_tz(self, cache): @@ -1122,7 +1133,7 @@ def test_to_datetime_tz(self, cache): result = to_datetime(arr, cache=cache) expected = DatetimeIndex( ["2013-01-01 13:00:00", "2013-01-02 14:00:00"], tz="US/Pacific" - ) + ).as_unit("s") tm.assert_index_equal(result, expected) def test_to_datetime_tz_mixed(self, cache): @@ -1141,7 +1152,7 @@ def test_to_datetime_tz_mixed(self, cache): result = to_datetime(arr, cache=cache, errors="coerce") expected = DatetimeIndex( - ["2013-01-01 13:00:00-08:00", "NaT"], dtype="datetime64[ns, US/Pacific]" + ["2013-01-01 13:00:00-08:00", "NaT"], dtype="datetime64[s, US/Pacific]" ) tm.assert_index_equal(result, expected) @@ -1173,7 +1184,7 @@ def test_to_datetime_tz_pytz(self, cache): result = to_datetime(arr, utc=True, cache=cache) expected = DatetimeIndex( ["2000-01-01 08:00:00+00:00", "2000-06-01 07:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", freq=None, ) tm.assert_index_equal(result, expected) @@ -1260,7 +1271,7 @@ def test_to_datetime_tz_psycopg2(self, request, cache): result = to_datetime(arr, errors="coerce", utc=True, cache=cache) expected = DatetimeIndex( ["2000-01-01 08:00:00+00:00", "2000-06-01 07:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", freq=None, ) tm.assert_index_equal(result, expected) @@ -1269,15 +1280,15 @@ def test_to_datetime_tz_psycopg2(self, request, cache): i = DatetimeIndex( ["2000-01-01 08:00:00"], tz=psycopg2_tz.FixedOffsetTimezone(offset=-300, name=None), - ) - assert is_datetime64_ns_dtype(i) + ).as_unit("us") + assert not is_datetime64_ns_dtype(i) # tz coercion result = to_datetime(i, errors="coerce", cache=cache) tm.assert_index_equal(result, i) result = to_datetime(i, errors="coerce", utc=True, cache=cache) - expected = DatetimeIndex(["2000-01-01 13:00:00"], dtype="datetime64[ns, UTC]") + expected = DatetimeIndex(["2000-01-01 13:00:00"], dtype="datetime64[us, UTC]") tm.assert_index_equal(result, expected) @pytest.mark.parametrize("arg", [True, False]) @@ -1347,16 +1358,20 @@ def test_datetime_invalid_scalar(self, value, format): def test_datetime_outofbounds_scalar(self, value, format): # GH24763 res = to_datetime(value, errors="coerce", format=format) - assert res is NaT + if format is None: + assert isinstance(res, Timestamp) + assert res == Timestamp(value) + else: + assert res is NaT if format is not None: msg = r'^time data ".*" doesn\'t match format ".*", at position 0.' 
with pytest.raises(ValueError, match=msg): to_datetime(value, errors="raise", format=format) else: - msg = "^Out of bounds .*, at position 0$" - with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime(value, errors="raise", format=format) + res = to_datetime(value, errors="raise", format=format) + assert isinstance(res, Timestamp) + assert res == Timestamp(value) @pytest.mark.parametrize( ("values"), [(["a"]), (["00:01:99"]), (["a", "b", "99:00:00"])] @@ -1429,15 +1444,17 @@ def test_to_datetime_cache_scalar(self): assert result == expected @pytest.mark.parametrize( - "datetimelikes,expected_values", + "datetimelikes,expected_values,exp_unit", ( ( (None, np.nan) + (NaT,) * start_caching_at, (NaT,) * (start_caching_at + 2), + "s", ), ( (None, Timestamp("2012-07-26")) + (NaT,) * start_caching_at, (NaT, Timestamp("2012-07-26")) + (NaT,) * start_caching_at, + "s", ), ( (None,) @@ -1445,11 +1462,12 @@ def test_to_datetime_cache_scalar(self): + ("2012 July 26", Timestamp("2012-07-26")), (NaT,) * (start_caching_at + 1) + (Timestamp("2012-07-26"), Timestamp("2012-07-26")), + "s", ), ), ) def test_convert_object_to_datetime_with_cache( - self, datetimelikes, expected_values + self, datetimelikes, expected_values, exp_unit ): # GH#39882 ser = Series( @@ -1459,7 +1477,7 @@ def test_convert_object_to_datetime_with_cache( result_series = to_datetime(ser, errors="coerce") expected_series = Series( expected_values, - dtype="datetime64[ns]", + dtype=f"datetime64[{exp_unit}]", ) tm.assert_series_equal(result_series, expected_series) @@ -1480,7 +1498,7 @@ def test_convert_object_to_datetime_with_cache( ) def test_to_datetime_converts_null_like_to_nat(self, cache, input): # GH35888 - expected = Series([NaT] * len(input), dtype="M8[ns]") + expected = Series([NaT] * len(input), dtype="M8[s]") result = to_datetime(input, cache=cache) tm.assert_series_equal(result, expected) @@ -1531,7 +1549,17 @@ def test_to_datetime_coerce_oob(self, string_arg, format, outofbounds): # https://github.com/pandas-dev/pandas/issues/50255 ts_strings = [string_arg, outofbounds] result = to_datetime(ts_strings, errors="coerce", format=format) - expected = DatetimeIndex([datetime(2018, 3, 1), NaT]) + if isinstance(outofbounds, str) and ( + format.startswith("%B") ^ outofbounds.startswith("J") + ): + # the strings don't match the given format, so they raise and we coerce + expected = DatetimeIndex([datetime(2018, 3, 1), NaT], dtype="M8[s]") + elif isinstance(outofbounds, datetime): + expected = DatetimeIndex( + [datetime(2018, 3, 1), outofbounds], dtype="M8[us]" + ) + else: + expected = DatetimeIndex([datetime(2018, 3, 1), outofbounds], dtype="M8[s]") tm.assert_index_equal(result, expected) def test_to_datetime_malformed_no_raise(self): @@ -1542,7 +1570,9 @@ def test_to_datetime_malformed_no_raise(self): UserWarning, match="Could not infer format", raise_on_extra_warnings=False ): result = to_datetime(ts_strings, errors="coerce") - tm.assert_index_equal(result, Index([NaT, NaT])) + # TODO: should Index get "s" by default here? 
+ exp = Index([NaT, NaT], dtype="M8[s]") + tm.assert_index_equal(result, exp) def test_to_datetime_malformed_raise(self): # GH 48633 @@ -1590,7 +1620,7 @@ def test_iso_8601_strings_with_different_offsets_utc(self): result = to_datetime(ts_strings, utc=True) expected = DatetimeIndex( [Timestamp(2015, 11, 18, 10), Timestamp(2015, 11, 18, 10), NaT], tz="UTC" - ) + ).as_unit("s") tm.assert_index_equal(result, expected) def test_mixed_offsets_with_native_datetime_utc_false_raises(self): @@ -1616,7 +1646,7 @@ def test_non_iso_strings_with_tz_offset(self): result = to_datetime(["March 1, 2018 12:00:00+0400"] * 2) expected = DatetimeIndex( [datetime(2018, 3, 1, 12, tzinfo=timezone(timedelta(minutes=240)))] * 2 - ) + ).as_unit("s") tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -1637,9 +1667,11 @@ def test_timestamp_utc_true(self, ts, expected): @pytest.mark.parametrize("dt_str", ["00010101", "13000101", "30000101", "99990101"]) def test_to_datetime_with_format_out_of_bounds(self, dt_str): # GH 9107 - msg = "Out of bounds nanosecond timestamp" - with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime(dt_str, format="%Y%m%d") + res = to_datetime(dt_str, format="%Y%m%d") + dtobj = datetime.strptime(dt_str, "%Y%m%d") + expected = Timestamp(dtobj).as_unit("s") + assert res == expected + assert res.unit == expected.unit def test_to_datetime_utc(self): arr = np.array([parse("2012-06-13T01:39:00Z")], dtype=object) @@ -1722,7 +1754,7 @@ def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit): # In 3.0, the string "1.5" is parsed as it would be without unit, # which fails. With errors="coerce" this becomes NaT. res = to_datetime(["1.5"], unit=unit, errors="coerce") - expected = to_datetime([NaT]) + expected = to_datetime([NaT]).as_unit("ns") tm.assert_index_equal(res, expected) # round floats are OK @@ -2145,7 +2177,7 @@ def test_dataframe_utc_true(self): df = DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}) result = to_datetime(df, utc=True) expected = Series( - np.array(["2015-02-04", "2016-03-05"], dtype="datetime64[ns]") + np.array(["2015-02-04", "2016-03-05"], dtype="datetime64[s]") ).dt.tz_localize("UTC") tm.assert_series_equal(result, expected) @@ -2357,7 +2389,9 @@ def test_to_datetime_with_space_in_series(self, cache): with pytest.raises(ValueError, match=msg): to_datetime(ser, errors="raise", cache=cache) result_coerce = to_datetime(ser, errors="coerce", cache=cache) - expected_coerce = Series([datetime(2006, 10, 18), datetime(2008, 10, 18), NaT]) + expected_coerce = Series( + [datetime(2006, 10, 18), datetime(2008, 10, 18), NaT] + ).dt.as_unit("s") tm.assert_series_equal(result_coerce, expected_coerce) @td.skip_if_not_us_locale @@ -2469,7 +2503,7 @@ def test_string_na_nat_conversion(self, cache): strings = np.array(["1/1/2000", "1/2/2000", np.nan, "1/4/2000"], dtype=object) - expected = np.empty(4, dtype="M8[ns]") + expected = np.empty(4, dtype="M8[s]") for i, val in enumerate(strings): if isna(val): expected[i] = iNaT @@ -2514,7 +2548,7 @@ def test_string_na_nat_conversion_with_name(self, cache): result = to_datetime(series, cache=cache) dresult = to_datetime(dseries, cache=cache) - expected = Series(np.empty(5, dtype="M8[ns]"), index=idx) + expected = Series(np.empty(5, dtype="M8[s]"), index=idx) for i in range(5): x = series.iloc[i] if isna(x): @@ -2554,7 +2588,7 @@ def test_dayfirst(self, cache): arr = ["10/02/2014", "11/02/2014", "12/02/2014"] expected = DatetimeIndex( [datetime(2014, 2, 10), datetime(2014, 2, 11),
datetime(2014, 2, 12)] - ) + ).as_unit("s") idx1 = DatetimeIndex(arr, dayfirst=True) idx2 = DatetimeIndex(np.array(arr), dayfirst=True) idx3 = to_datetime(arr, dayfirst=True, cache=cache) @@ -2578,7 +2612,7 @@ def test_dayfirst_warnings_valid_input(self): # CASE 1: valid input arr = ["31/12/2014", "10/03/2011"] expected = DatetimeIndex( - ["2014-12-31", "2011-03-10"], dtype="datetime64[ns]", freq=None + ["2014-12-31", "2011-03-10"], dtype="datetime64[s]", freq=None ) # A. dayfirst arg correct, no warning @@ -2683,7 +2717,7 @@ def test_to_datetime_consistent_format(self, cache): ser = Series(np.array(data)) result = to_datetime(ser, cache=cache) expected = Series( - ["2011-01-01", "2011-02-01", "2011-03-01"], dtype="datetime64[ns]" + ["2011-01-01", "2011-02-01", "2011-03-01"], dtype="datetime64[s]" ) tm.assert_series_equal(result, expected) @@ -2695,9 +2729,7 @@ def test_to_datetime_series_with_nans(self, cache): ) ) result = to_datetime(ser, cache=cache) - expected = Series( - ["2011-01-01", NaT, "2011-01-03", NaT], dtype="datetime64[ns]" - ) + expected = Series(["2011-01-01", NaT, "2011-01-03", NaT], dtype="datetime64[s]") tm.assert_series_equal(result, expected) def test_to_datetime_series_start_with_nans(self, cache): @@ -2716,7 +2748,7 @@ def test_to_datetime_series_start_with_nans(self, cache): result = to_datetime(ser, cache=cache) expected = Series( - [NaT, NaT, "2011-01-01", "2011-01-02", "2011-01-03"], dtype="datetime64[ns]" + [NaT, NaT, "2011-01-01", "2011-01-02", "2011-01-03"], dtype="datetime64[s]" ) tm.assert_series_equal(result, expected) @@ -2730,6 +2762,7 @@ def test_infer_datetime_format_tz_name(self, tz_name, offset): result = to_datetime(ser) tz = timezone(timedelta(minutes=offset)) expected = Series([Timestamp("2019-02-02 08:07:13").tz_localize(tz)]) + expected = expected.dt.as_unit("s") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -2886,9 +2919,16 @@ def test_parsers(self, date_str, expected, cache): # https://github.com/dateutil/dateutil/issues/217 yearfirst = True - result1, _ = parsing.parse_datetime_string_with_reso( + result1, reso_attrname = parsing.parse_datetime_string_with_reso( date_str, yearfirst=yearfirst ) + + reso = { + "nanosecond": "ns", + "microsecond": "us", + "millisecond": "ms", + "second": "s", + }.get(reso_attrname, "s") result2 = to_datetime(date_str, yearfirst=yearfirst) result3 = to_datetime([date_str], yearfirst=yearfirst) # result5 is used below @@ -2903,7 +2943,7 @@ def test_parsers(self, date_str, expected, cache): for res in [result1, result2]: assert res == expected for res in [result3, result4, result6, result8, result9]: - exp = DatetimeIndex([Timestamp(expected)]) + exp = DatetimeIndex([Timestamp(expected)]).as_unit(reso) tm.assert_index_equal(res, exp) # these really need to have yearfirst, but we don't support @@ -2917,7 +2957,7 @@ def test_na_values_with_cache( self, cache, unique_nulls_fixture, unique_nulls_fixture2 ): # GH22305 - expected = Index([NaT, NaT], dtype="datetime64[ns]") + expected = Index([NaT, NaT], dtype="datetime64[s]") result = to_datetime([unique_nulls_fixture, unique_nulls_fixture2], cache=cache) tm.assert_index_equal(result, expected) @@ -3193,9 +3233,16 @@ def test_incorrect_value_exception(self): ) def test_to_datetime_out_of_bounds_with_format_arg(self, format, warning): # see gh-23830 - msg = r"^Out of bounds nanosecond timestamp: 2417-10-10 00:00:00, at position 0" - with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime("2417-10-10 00:00:00", format=format) + if format 
is None: + res = to_datetime("2417-10-10 00:00:00.00", format=format) + assert isinstance(res, Timestamp) + assert res.year == 2417 + assert res.month == 10 + assert res.day == 10 + else: + msg = "unconverted data remains when parsing with format.*, at position 0" + with pytest.raises(ValueError, match=msg): + to_datetime("2417-10-10 00:00:00.00", format=format) @pytest.mark.parametrize( "arg, origin, expected_str", @@ -3327,7 +3374,7 @@ def test_empty_string_datetime(errors, args, format): # coerce empty string to pd.NaT result = to_datetime(td, format=format, errors=errors) - expected = Series(["2016-03-24", "2016-03-25", NaT], dtype="datetime64[ns]") + expected = Series(["2016-03-24", "2016-03-25", NaT], dtype="datetime64[s]") tm.assert_series_equal(expected, result) @@ -3367,14 +3414,12 @@ def test_to_datetime_cache_coerce_50_lines_outofbounds(series_length): ) result1 = to_datetime(ser, errors="coerce", utc=True) - expected1 = Series( - [NaT] + ([Timestamp("1991-10-20 00:00:00+00:00")] * series_length) - ) - + expected1 = Series([Timestamp(x) for x in ser]) + assert expected1.dtype == "M8[us, UTC]" tm.assert_series_equal(result1, expected1) - with pytest.raises(OutOfBoundsDatetime, match="Out of bounds nanosecond timestamp"): - to_datetime(ser, errors="raise", utc=True) + result3 = to_datetime(ser, errors="raise", utc=True) + tm.assert_series_equal(result3, expected1) def test_to_datetime_format_f_parse_nanos(): @@ -3459,7 +3504,7 @@ def test_to_datetime_with_empty_str_utc_false_format_mixed(): # GH 50887 vals = ["2020-01-01 00:00+00:00", ""] result = to_datetime(vals, format="mixed") - expected = Index([Timestamp("2020-01-01 00:00+00:00"), "NaT"], dtype="M8[ns, UTC]") + expected = Index([Timestamp("2020-01-01 00:00+00:00"), "NaT"], dtype="M8[s, UTC]") tm.assert_index_equal(result, expected) # Check that a couple of other similar paths work the same way diff --git a/pandas/tests/tools/test_to_timedelta.py b/pandas/tests/tools/test_to_timedelta.py index c052ca58f5873..894f49b2fa140 100644 --- a/pandas/tests/tools/test_to_timedelta.py +++ b/pandas/tests/tools/test_to_timedelta.py @@ -6,7 +6,10 @@ import numpy as np import pytest -from pandas.compat import IS64 +from pandas.compat import ( + IS64, + WASM, +) from pandas.errors import OutOfBoundsTimedelta import pandas as pd @@ -26,7 +29,7 @@ def test_to_timedelta_dt64_raises(self): # supported GH#29794 msg = r"dtype datetime64\[ns\] cannot be converted to timedelta64\[ns\]" - ser = Series([pd.NaT]) + ser = Series([pd.NaT], dtype="M8[ns]") with pytest.raises(TypeError, match=msg): to_timedelta(ser) with pytest.raises(TypeError, match=msg): @@ -214,6 +217,7 @@ def test_to_timedelta_on_missing_values_list(self, val): actual = to_timedelta([val]) assert actual[0]._value == np.timedelta64("NaT").astype("int64") + @pytest.mark.skipif(WASM, reason="No fp exception support in WASM") @pytest.mark.xfail(not IS64, reason="Floating point error") def test_to_timedelta_float(self): # https://github.com/pandas-dev/pandas/issues/25077 diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py index edfc1973a2bd9..dad5c73b89626 100644 --- a/pandas/tests/tseries/frequencies/test_inference.py +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ -23,7 +23,6 @@ date_range, period_range, ) -import pandas._testing as tm from pandas.core.arrays import ( DatetimeArray, TimedeltaArray, @@ -202,17 +201,6 @@ def test_infer_freq_custom(base_delta_code_pair, constructor): assert 
frequencies.infer_freq(index) is None -@pytest.mark.parametrize( - "freq,expected", [("Q", "QE-DEC"), ("Q-NOV", "QE-NOV"), ("Q-OCT", "QE-OCT")] -) -def test_infer_freq_index(freq, expected): - rng = period_range("1959Q2", "2009Q3", freq=freq) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - rng = Index(rng.to_timestamp("D", how="e").astype(object)) - - assert rng.inferred_freq == expected - - @pytest.mark.parametrize( "expected,dates", list( diff --git a/pandas/tests/tseries/holiday/test_calendar.py b/pandas/tests/tseries/holiday/test_calendar.py index 99829857e6836..90e2e117852a2 100644 --- a/pandas/tests/tseries/holiday/test_calendar.py +++ b/pandas/tests/tseries/holiday/test_calendar.py @@ -57,10 +57,10 @@ def __init__(self, name=None, rules=None) -> None: jan2 = TestCalendar(rules=[Holiday("jan2", year=2015, month=1, day=2)]) # Getting holidays for Jan 1 should not alter results for Jan 2. - expected = DatetimeIndex(["01-Jan-2015"]).as_unit("ns") + expected = DatetimeIndex(["01-Jan-2015"]).as_unit("us") tm.assert_index_equal(jan1.holidays(), expected) - expected2 = DatetimeIndex(["02-Jan-2015"]).as_unit("ns") + expected2 = DatetimeIndex(["02-Jan-2015"]).as_unit("us") tm.assert_index_equal(jan2.holidays(), expected2) diff --git a/pandas/tests/tseries/offsets/test_common.py b/pandas/tests/tseries/offsets/test_common.py index 3792878973c15..34181f28bb1a0 100644 --- a/pandas/tests/tseries/offsets/test_common.py +++ b/pandas/tests/tseries/offsets/test_common.py @@ -9,6 +9,7 @@ ) from pandas.compat import ( IS64, + WASM, is_platform_windows, ) @@ -106,6 +107,7 @@ def _offset(request): return request.param +@pytest.mark.skipif(WASM, reason="OverflowError received on WASM") def test_apply_out_of_range(request, tz_naive_fixture, _offset): tz = tz_naive_fixture @@ -130,7 +132,11 @@ def test_apply_out_of_range(request, tz_naive_fixture, _offset): if tz is not None: assert t.tzinfo is not None - if isinstance(tz, tzlocal) and not IS64 and _offset is not DateOffset: + if ( + isinstance(tz, tzlocal) + and ((not IS64) or WASM) + and _offset is not DateOffset + ): # If we hit OutOfBoundsDatetime on non-64 bit machines # we'll drop out of the try clause before the next test request.applymarker( diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py index 35b72c9bb2887..3c55ae2c6f904 100644 --- a/pandas/tests/tslibs/test_array_to_datetime.py +++ b/pandas/tests/tslibs/test_array_to_datetime.py @@ -15,7 +15,6 @@ tslib, ) from pandas._libs.tslibs.dtypes import NpyDatetimeUnit -from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas import Timestamp import pandas._testing as tm @@ -156,7 +155,7 @@ def test_parsing_valid_dates(data, expected): arr = np.array(data, dtype=object) result, _ = tslib.array_to_datetime(arr) - expected = np.array(expected, dtype="M8[ns]") + expected = np.array(expected, dtype="M8[s]") tm.assert_numpy_array_equal(result, expected) @@ -174,6 +173,8 @@ def test_parsing_timezone_offsets(dt_string, expected_tz): # to the same datetime after the timezone offset is added. 
arr = np.array(["01-01-2013 00:00:00"], dtype=object) expected, _ = tslib.array_to_datetime(arr) + if "000000000" in dt_string: + expected = expected.astype("M8[ns]") arr = np.array([dt_string], dtype=object) result, result_tz = tslib.array_to_datetime(arr) @@ -206,38 +207,35 @@ def test_parsing_different_timezone_offsets(): @pytest.mark.parametrize( - "invalid_date", + "invalid_date,exp_unit", [ - date(1000, 1, 1), - datetime(1000, 1, 1), - "1000-01-01", - "Jan 1, 1000", - np.datetime64("1000-01-01"), + (date(1000, 1, 1), "s"), + (datetime(1000, 1, 1), "us"), + ("1000-01-01", "s"), + ("Jan 1, 1000", "s"), + (np.datetime64("1000-01-01"), "s"), ], ) @pytest.mark.parametrize("errors", ["coerce", "raise"]) -def test_coerce_outside_ns_bounds(invalid_date, errors): +def test_coerce_outside_ns_bounds(invalid_date, exp_unit, errors): arr = np.array([invalid_date], dtype="object") - kwargs = {"values": arr, "errors": errors} - if errors == "raise": - msg = "^Out of bounds nanosecond timestamp: .*, at position 0$" + result, _ = tslib.array_to_datetime(arr, errors=errors) + out_reso = np.datetime_data(result.dtype)[0] + assert out_reso == exp_unit + ts = Timestamp(invalid_date) + assert ts.unit == exp_unit - with pytest.raises(OutOfBoundsDatetime, match=msg): - tslib.array_to_datetime(**kwargs) - else: # coerce. - result, _ = tslib.array_to_datetime(**kwargs) - expected = np.array([iNaT], dtype="M8[ns]") - - tm.assert_numpy_array_equal(result, expected) + expected = np.array([ts._value], dtype=f"M8[{exp_unit}]") + tm.assert_numpy_array_equal(result, expected) def test_coerce_outside_ns_bounds_one_valid(): arr = np.array(["1/1/1000", "1/1/2000"], dtype=object) result, _ = tslib.array_to_datetime(arr, errors="coerce") - expected = [iNaT, "2000-01-01T00:00:00.000000000"] - expected = np.array(expected, dtype="M8[ns]") + expected = ["1000-01-01T00:00:00.000000000", "2000-01-01T00:00:00.000000000"] + expected = np.array(expected, dtype="M8[s]") tm.assert_numpy_array_equal(result, expected) @@ -247,7 +245,13 @@ def test_coerce_of_invalid_datetimes(): # With coercing, the invalid dates becomes iNaT result, _ = tslib.array_to_datetime(arr, errors="coerce") expected = ["2013-01-01T00:00:00.000000000", iNaT, iNaT] - tm.assert_numpy_array_equal(result, np.array(expected, dtype="M8[ns]")) + tm.assert_numpy_array_equal(result, np.array(expected, dtype="M8[s]")) + + # With coercing, the invalid dates becomes iNaT + result, _ = tslib.array_to_datetime(arr, errors="coerce") + expected = ["2013-01-01T00:00:00.000000000", iNaT, iNaT] + + tm.assert_numpy_array_equal(result, np.array(expected, dtype="M8[s]")) def test_to_datetime_barely_out_of_bounds(): @@ -292,5 +296,5 @@ def test_datetime_subclass(klass): arr = np.array([klass(2000, 1, 1)], dtype=object) result, _ = tslib.array_to_datetime(arr) - expected = np.array(["2000-01-01T00:00:00.000000000"], dtype="M8[ns]") + expected = np.array(["2000-01-01T00:00:00.000000"], dtype="M8[us]") tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 52af5adb686a7..9b64beaf09273 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -17,6 +17,7 @@ from pandas._libs.tslibs.parsing import parse_datetime_string_with_reso from pandas.compat import ( ISMUSL, + WASM, is_platform_windows, ) import pandas.util._test_decorators as td @@ -29,6 +30,7 @@ from pandas._testing._hypothesis import DATETIME_NO_TZ +@pytest.mark.skipif(WASM, reason="tzset is not available on 
WASM") @pytest.mark.skipif( is_platform_windows() or ISMUSL, reason="TZ setting incorrect on Windows and MUSL Linux", diff --git a/pandas/tests/tslibs/test_to_offset.py b/pandas/tests/tslibs/test_to_offset.py index ad4e9e2bcf38a..07bdfca8f2f2d 100644 --- a/pandas/tests/tslibs/test_to_offset.py +++ b/pandas/tests/tslibs/test_to_offset.py @@ -176,6 +176,14 @@ def test_anchored_shortcuts(shortcut, expected): assert result == expected +def test_to_offset_lowercase_frequency_w_deprecated(): + # GH#54939 + msg = "'w' is deprecated and will be removed in a future version" + + with tm.assert_produces_warning(FutureWarning, match=msg): + to_offset("2w") + + @pytest.mark.parametrize( "freq_depr", [ @@ -185,18 +193,16 @@ def test_anchored_shortcuts(shortcut, expected): "2qs-feb", "2bqs", "2sms", + "1sme", "2bms", "2cbme", "2me", - "2w", ], ) -def test_to_offset_lowercase_frequency_deprecated(freq_depr): - # GH#54939 - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " - f"future version, please use '{freq_depr.upper()[1:]}' instead." +def test_to_offset_lowercase_frequency_raises(freq_depr): + msg = f"Invalid frequency: {freq_depr}" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match=msg): to_offset(freq_depr) diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index a54e0071aa006..e654534ccd453 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -260,14 +260,14 @@ def test_categorical_consistency(s1, categorize): tm.assert_series_equal(h1, h3) -def test_categorical_with_nan_consistency(): - c = pd.Categorical.from_codes( - [-1, 0, 1, 2, 3, 4], categories=pd.date_range("2012-01-01", periods=5, name="B") - ) - expected = hash_array(c, categorize=False) - - c = pd.Categorical.from_codes([-1, 0], categories=[pd.Timestamp("2012-01-01")]) - result = hash_array(c, categorize=False) +def test_categorical_with_nan_consistency(unit): + dti = pd.date_range("2012-01-01", periods=5, name="B", unit=unit) + cat = pd.Categorical.from_codes([-1, 0, 1, 2, 3, 4], categories=dti) + expected = hash_array(cat, categorize=False) + + ts = pd.Timestamp("2012-01-01").as_unit(unit) + cat2 = pd.Categorical.from_codes([-1, 0], categories=[ts]) + result = hash_array(cat2, categorize=False) assert result[0] in expected assert result[1] in expected diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index 510a69a2ff3e4..b4a045cd26fe4 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -550,7 +550,7 @@ def test_expanding_cov_pairwise_diff_length(): df2a = DataFrame( [[5, 6], [2, 1]], index=[0, 2], columns=Index(["X", "Y"], name="foo") ) - # TODO: xref gh-15826 + # xref gh-15826 # .loc is not preserving the names result1 = df1.expanding().cov(df2, pairwise=True).loc[2] result2 = df1.expanding().cov(df2a, pairwise=True).loc[2] @@ -691,12 +691,3 @@ def test_numeric_only_corr_cov_series(kernel, use_arg, numeric_only, dtype): op2 = getattr(expanding2, kernel) expected = op2(*arg2, numeric_only=numeric_only) tm.assert_series_equal(result, expected) - - -def test_keyword_quantile_deprecated(): - # GH #52550 - ser = Series([1, 2, 3, 4]) - with tm.assert_produces_warning( - FutureWarning, match="the 'quantile' keyword is deprecated, use 'q' instead" - ): - ser.expanding().quantile(quantile=0.5) diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index 650eb911e410b..23b17c651f08d 100644 
--- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -67,6 +67,21 @@ def f(x, *args): ) tm.assert_series_equal(result, expected) + def test_numba_min_periods(self): + # GH 58868 + def last_row(x): + assert len(x) == 3 + return x[-1] + + df = DataFrame([[1, 2], [3, 4], [5, 6], [7, 8]]) + + result = df.rolling(3, method="table", min_periods=3).apply( + last_row, raw=True, engine="numba" + ) + + expected = DataFrame([[np.nan, np.nan], [np.nan, np.nan], [5, 6], [7, 8]]) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "data", [ @@ -304,7 +319,9 @@ def f(x): @td.skip_if_no("numba") def test_invalid_kwargs_nopython(): - with pytest.raises(NumbaUtilError, match="numba does not support kwargs with"): + with pytest.raises( + NumbaUtilError, match="numba does not support keyword-only arguments" + ): Series(range(1)).rolling(1).apply( lambda x: x, kwargs={"a": 1}, engine="numba", raw=True ) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 85821ed2cfb6f..fc8d7f69b8180 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -10,6 +10,7 @@ IS64, is_platform_arm, is_platform_power, + is_platform_riscv64, ) from pandas import ( @@ -1081,7 +1082,7 @@ def test_rolling_sem(frame_or_series): @pytest.mark.xfail( - is_platform_arm() or is_platform_power(), + is_platform_arm() or is_platform_power() or is_platform_riscv64(), reason="GH 38921", ) @pytest.mark.parametrize( diff --git a/pandas/tests/window/test_rolling_quantile.py b/pandas/tests/window/test_rolling_quantile.py index 1604d72d4f9b1..66713f1cfaa8d 100644 --- a/pandas/tests/window/test_rolling_quantile.py +++ b/pandas/tests/window/test_rolling_quantile.py @@ -173,12 +173,3 @@ def test_center_reindex_frame(frame, q): ) frame_rs = frame.rolling(window=25, center=True).quantile(q) tm.assert_frame_equal(frame_xp, frame_rs) - - -def test_keyword_quantile_deprecated(): - # GH #52550 - s = Series([1, 2, 3, 4]) - with tm.assert_produces_warning( - FutureWarning, match="the 'quantile' keyword is deprecated, use 'q' instead" - ): - s.rolling(2).quantile(quantile=0.4) diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py index 6cdd96996cea6..c4fec39594407 100644 --- a/pandas/util/_print_versions.py +++ b/pandas/util/_print_versions.py @@ -115,6 +115,11 @@ def show_versions(as_json: str | bool = False) -> None: Info will be written to that file in JSON format. * If True, outputs info in JSON format to the console. + See Also + -------- + get_option : Retrieve the value of the specified option. + set_option : Set the value of the specified option or options. 
+ Examples -------- >>> pd.show_versions() # doctest: +SKIP diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index d4a79cae61772..48684c4810d2a 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -39,6 +39,7 @@ def test_foo(): from pandas.compat import ( IS64, + WASM, is_platform_windows, ) from pandas.compat._optional import import_optional_dependency @@ -115,6 +116,10 @@ def skip_if_no(package: str, min_version: str | None = None) -> pytest.MarkDecor locale.getlocale()[0] != "en_US", reason=f"Set local {locale.getlocale()[0]} is not en_US", ) +skip_if_wasm = pytest.mark.skipif( + WASM, + reason="does not support wasm", +) def parametrize_fixture_doc(*args) -> Callable[[F], F]: diff --git a/pandas/util/_tester.py b/pandas/util/_tester.py index 494f306ec807d..c0e9756372f47 100644 --- a/pandas/util/_tester.py +++ b/pandas/util/_tester.py @@ -27,6 +27,10 @@ def test(extra_args: list[str] | None = None, run_doctests: bool = False) -> Non both doctests/regular tests, just append "--doctest-modules"/"--doctest-cython" to extra_args. + See Also + -------- + pytest.main : The main entry point for the pytest testing framework. + Examples -------- >>> pd.test() # doctest: +SKIP diff --git a/pyproject.toml b/pyproject.toml index 085c054f8241a..e7d7474134c3a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -722,5 +722,5 @@ exclude_lines = [ directory = "coverage_html_report" [tool.codespell] -ignore-words-list = "blocs, coo, hist, nd, sav, ser, recuse, nin, timere, expec, expecs, indext" +ignore-words-list = "blocs, coo, hist, nd, sav, ser, recuse, nin, timere, expec, expecs, indext, SME, NotIn, tructures, tru" ignore-regex = 'https://([\w/\.])+' diff --git a/requirements-dev.txt b/requirements-dev.txt index 6c5764bf589cc..f5da7f70ccdba 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -64,7 +64,6 @@ natsort numpydoc pydata-sphinx-theme==0.14 pytest-cython -docutils < 0.21 sphinx sphinx-design sphinx-copybutton diff --git a/web/pandas/pdeps/0001-purpose-and-guidelines.md b/web/pandas/pdeps/0001-purpose-and-guidelines.md index bb15b8f997b11..7f5f0326eba6c 100644 --- a/web/pandas/pdeps/0001-purpose-and-guidelines.md +++ b/web/pandas/pdeps/0001-purpose-and-guidelines.md @@ -285,3 +285,4 @@ hope can help clarify our meaning here: [51417]: https://github.com/pandas-dev/pandas/pull/51417 [28900]: https://github.com/pandas-dev/pandas/issues/28900 [35407]: https://github.com/pandas-dev/pandas/issues/35407 +[53576]: https://github.com/pandas-dev/pandas/pull/53576
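Taken together, the to_datetime and array_to_datetime test updates above encode one behavior change: parsed values now keep the coarsest resolution that can represent them (strings typically infer "s", datetime objects "us") instead of always producing nanosecond dtypes, so dates outside the old nanosecond bounds parse instead of raising. A minimal sketch of that behavior, assuming a pandas build from this branch; the expected dtypes are taken from the tests above, not from released documentation:

    from datetime import datetime

    import pandas as pd

    # Plain date strings now infer second resolution rather than "ns"
    print(pd.to_datetime(["2014-12-31", "2011-03-10"]).dtype)  # datetime64[s]

    # datetime objects keep microsecond resolution, so values outside the
    # nanosecond bounds no longer raise OutOfBoundsDatetime
    print(pd.to_datetime([datetime(1000, 1, 1)]).dtype)  # datetime64[us]

    # The same holds with an explicit format; the scalar result reports a unit
    ts = pd.to_datetime("99990101", format="%Y%m%d")
    print(ts, ts.unit)  # 9999-01-01 00:00:00 s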
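The new test_no_circular_reference cases pin down GH 41357: a registered accessor holds a reference to its parent object, and the parent must still be freed as soon as the user's last reference is dropped. The guarded pattern, sketched with a hypothetical "stats" accessor name:

    import weakref

    import pandas as pd

    @pd.api.extensions.register_series_accessor("stats")  # hypothetical name
    class StatsAccessor:
        def __init__(self, obj) -> None:
            self.obj = obj  # the accessor keeps a reference to its parent

    ser = pd.Series([1, 2, 3])
    ref = weakref.ref(ser)
    assert ser.stats.obj is ser  # accessing the accessor caches it on ser
    del ser
    assert ref() is None  # no accessor-induced cycle keeps the Series alive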
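The to_offset changes split the old lowercase-frequency deprecation (GH#54939) in two: "w" remains in its deprecation window, while the other lowercase aliases now raise. Roughly, per the updated tests:

    from pandas.tseries.frequencies import to_offset

    # Still deprecated rather than removed: warns, then resolves to a Week
    offset = to_offset("2w")  # FutureWarning: 'w' is deprecated ...
    print(offset)  # <2 * Weeks: weekday=6>

    # Aliases such as "2me" now fail outright
    try:
        to_offset("2me")
    except ValueError as err:
        print(err)  # Invalid frequency: 2me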
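Finally, test_numba_min_periods (GH 58868) exercises min_periods handling for method="table" rolling windows under the numba engine, where the UDF receives each whole window as a 2-D array. Usage mirroring the test (requires numba to be installed):

    import pandas as pd

    def last_row(x):
        # x is the full (window rows x columns) array when method="table"
        return x[-1]

    df = pd.DataFrame([[1, 2], [3, 4], [5, 6], [7, 8]])
    result = df.rolling(3, method="table", min_periods=3).apply(
        last_row, raw=True, engine="numba"
    )
    print(result)  # first two rows are NaN, then [5.0, 6.0] and [7.0, 8.0]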