Skip to content

Commit

Permalink
Update on "Expose Storage use_count API in Python"
Browse files Browse the repository at this point in the history
Needed by pytorch/torchtune#1936

In favor over #139109, as exposing an existing API is better than adding a new one (and this enables a more robust fix)




[ghstack-poisoned]
  • Loading branch information
janeyx99 committed Nov 1, 2024
2 parents 09a268b + 35501b2 commit fb463e3
Show file tree
Hide file tree
Showing 720 changed files with 17,883 additions and 6,812 deletions.
6 changes: 0 additions & 6 deletions .ci/docker/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -414,9 +414,6 @@ case "$image" in
DB=yes
VISION=yes
CONDA_CMAKE=yes
# snadampal: skipping sccache due to the following issue
# https://github.com/pytorch/pytorch/issues/121559
SKIP_SCCACHE_INSTALL=yes
# snadampal: skipping llvm src build install because the current version
# from pytorch/llvm:9.0.1 is x86 specific
SKIP_LLVM_SRC_BUILD_INSTALL=yes
Expand All @@ -429,9 +426,6 @@ case "$image" in
DB=yes
VISION=yes
CONDA_CMAKE=yes
# snadampal: skipping sccache due to the following issue
# https://github.com/pytorch/pytorch/issues/121559
SKIP_SCCACHE_INSTALL=yes
# snadampal: skipping llvm src build install because the current version
# from pytorch/llvm:9.0.1 is x86 specific
SKIP_LLVM_SRC_BUILD_INSTALL=yes
Expand Down
51 changes: 44 additions & 7 deletions .ci/docker/common/install_cache.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,12 @@ install_ubuntu() {
# Instead use lib and headers from OpenSSL1.1 installed in `install_openssl.sh``
apt-get install -y cargo
echo "Checking out sccache repo"
git clone https://github.com/pytorch/sccache
if [ -n "$CUDA_VERSION" ]; then
# TODO: Remove this
git clone https://github.com/pytorch/sccache
else
git clone https://github.com/mozilla/sccache -b v0.8.2
fi
cd sccache
echo "Building sccache"
cargo build --release
Expand All @@ -19,6 +24,10 @@ install_ubuntu() {
rm -rf sccache
apt-get remove -y cargo rustc
apt-get autoclean && apt-get clean

echo "Downloading old sccache binary from S3 repo for PCH builds"
curl --retry 3 https://s3.amazonaws.com/ossci-linux/sccache -o /opt/cache/bin/sccache-0.2.14a
chmod 755 /opt/cache/bin/sccache-0.2.14a
}

install_binary() {
Expand All @@ -36,18 +45,46 @@ if [ -n "$ROCM_VERSION" ]; then
curl --retry 3 http://repo.radeon.com/misc/.sccache_amd/sccache -o /opt/cache/bin/sccache
else
ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
# TODO: Install the pre-built binary from S3 as building from source
# https://github.com/pytorch/sccache has started failing mysteriously
# in which sccache server couldn't start with the following error:
# sccache: error: Invalid argument (os error 22)
install_binary
if [ -n "$CUDA_VERSION" ]; then
# TODO: Install the pre-built binary from S3 as building from source
# https://github.com/pytorch/sccache has started failing mysteriously
# in which sccache server couldn't start with the following error:
# sccache: error: Invalid argument (os error 22)
install_binary
else
install_ubuntu
fi
fi
chmod a+x /opt/cache/bin/sccache

function write_sccache_stub() {
# Unset LD_PRELOAD for ps because of asan + ps issues
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90589
printf "#!/bin/sh\nif [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then\n exec sccache $(which $1) \"\$@\"\nelse\n exec $(which $1) \"\$@\"\nfi" > "/opt/cache/bin/$1"
if [ $1 == "gcc" ]; then
# Do not call sccache recursively when dumping preprocessor argument
# For some reason it's very important for the first cached nvcc invocation
cat > "/opt/cache/bin/$1" <<EOF
#!/bin/sh
if [ "\$1" = "-E" ] || [ "\$2" = "-E" ]; then
exec $(which $1) "\$@"
elif [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then
exec sccache $(which $1) "\$@"
else
exec $(which $1) "\$@"
fi
EOF
else
cat > "/opt/cache/bin/$1" <<EOF
#!/bin/sh
if [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then
exec sccache $(which $1) "\$@"
else
exec $(which $1) "\$@"
fi
EOF
fi
chmod a+x "/opt/cache/bin/$1"
}

Expand Down
7 changes: 7 additions & 0 deletions .ci/docker/common/install_user.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,13 @@

set -ex

# Since version 24 the system ships with user 'ubuntu' that has id 1000
# We need a work-around to enable id 1000 usage for this script
if [[ $UBUNTU_VERSION == 24.04 ]]; then
# touch is used to disable harmless error message
touch /var/mail/ubuntu && chown ubuntu /var/mail/ubuntu && userdel -r ubuntu
fi

# Mirror jenkins user in container
# jenkins user as ec2-user should have the same user-id
echo "jenkins:x:1000:1000::/var/lib/jenkins:" >> /etc/passwd
Expand Down
3 changes: 1 addition & 2 deletions .ci/docker/requirements-ci.txt
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,7 @@ numba==0.55.2 ; python_version == "3.10"
#test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py,
#test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py,
#test_binary_ufuncs.py
numpy==1.21.2; python_version == "3.9"
numpy==1.22.4; python_version == "3.10"
numpy==1.22.4; python_version == "3.9" or python_version == "3.10"
numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
numpy==2.1.2; python_version >= "3.13"

Expand Down
4 changes: 1 addition & 3 deletions .ci/pytorch/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -398,8 +398,6 @@ if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]];
python tools/stats/export_test_times.py
fi

# snadampal: skipping it till sccache support added for aarch64
# https://github.com/pytorch/pytorch/issues/121559
if [[ "$BUILD_ENVIRONMENT" != *aarch64* && "$BUILD_ENVIRONMENT" != *s390x* ]]; then
if [[ "$BUILD_ENVIRONMENT" != *s390x* ]]; then
print_sccache_stats
fi
6 changes: 6 additions & 0 deletions .ci/pytorch/common-build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@ if [[ "$BUILD_ENVIRONMENT" != *win-* ]]; then
# Save the absolute path in case later we chdir (as occurs in the gpu perf test)
script_dir="$( cd "$(dirname "${BASH_SOURCE[0]}")" || exit ; pwd -P )"

if [[ "${BUILD_ENVIRONMENT}" == *-pch* ]]; then
# This is really weird, but newer sccache somehow produces broken binary
# see https://github.com/pytorch/pytorch/issues/139188
sudo mv /opt/cache/bin/sccache-0.2.14a /opt/cache/bin/sccache
fi

if which sccache > /dev/null; then
# Save sccache logs to file
sccache --stop-server > /dev/null 2>&1 || true
Expand Down
36 changes: 22 additions & 14 deletions .ci/pytorch/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -384,22 +384,30 @@ test_inductor_cpp_wrapper() {
# unit tests with cpp wrapper.
python test/run_test.py --include inductor/test_torchinductor.py --verbose

python benchmarks/dynamo/timm_models.py --device cuda --accuracy --amp \

# Run inductor benchmark tests with cpp wrapper.
# Skip benchmark tests if it's in rerun-disabled-mode.
if [[ "${PYTORCH_TEST_RERUN_DISABLED_TESTS}" == "1" ]]; then
echo "skip dynamo benchmark tests for rerun-disabled-test"
else
echo "run dynamo benchmark tests with cpp wrapper"
python benchmarks/dynamo/timm_models.py --device cuda --accuracy --amp \
--training --inductor --disable-cudagraphs --only vit_base_patch16_224 \
--output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv"
python benchmarks/dynamo/check_accuracy.py \
--actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/inductor_timm_training.csv"

python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
--bfloat16 --inference --inductor --only hf_T5 --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
--bfloat16 --inference --inductor --only llama --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
--bfloat16 --inference --inductor --only moco --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
python benchmarks/dynamo/check_accuracy.py \
--actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv"
python benchmarks/dynamo/check_accuracy.py \
--actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/inductor_timm_training.csv"

python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
--bfloat16 --inference --inductor --only hf_T5 --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
--bfloat16 --inference --inductor --only llama --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
--bfloat16 --inference --inductor --only moco --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
python benchmarks/dynamo/check_accuracy.py \
--actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv"
fi
}

# "Global" flags for inductor benchmarking controlled by TEST_CONFIG
Expand Down
3 changes: 2 additions & 1 deletion .ci/pytorch/win-test-helpers/build_pytorch.bat
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ if not errorlevel 0 goto fail

if "%USE_XPU%"=="1" (
:: Activate xpu environment - VS env is required for xpu
call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
call "C:\Program Files (x86)\Intel\oneAPI\compiler\latest\env\vars.bat"
call "C:\Program Files (x86)\Intel\oneAPI\ocloc\latest\env\vars.bat"
if errorlevel 1 exit /b 1
:: Reduce build time. Only have MTL self-hosted runner now
SET TORCH_XPU_ARCH_LIST=xe-lpg
Expand Down
4 changes: 4 additions & 0 deletions .github/actions/build-android/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,14 @@ runs:
PR_NUMBER: ${{ github.event.pull_request.number }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
SCCACHE_REGION: us-east-1
DOCKER_IMAGE: ${{ inputs.docker-image }}
MATRIX_ARCH: ${{ inputs.arch }}
run: |
# detached container should get cleaned up by teardown_ec2_linux
set -exo pipefail
# Fetch aws credential from IMDs
eval "$(python3 .github/scripts/get_aws_session_tokens.py)"
export container_name
container_name=$(docker run \
-e BUILD_ENVIRONMENT \
Expand All @@ -56,6 +59,7 @@ runs:
-e SHA1 \
-e BRANCH \
-e SCCACHE_BUCKET \
-e SCCACHE_REGION \
-e SKIP_SCCACHE_INITIALIZATION=1 \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
Expand Down
1 change: 1 addition & 0 deletions .github/pytorch-probot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ ciflow_push_tags:
- ciflow/binaries_libtorch
- ciflow/binaries_wheel
- ciflow/inductor
- ciflow/inductor-periodic
- ciflow/inductor-rocm
- ciflow/inductor-perf-compare
- ciflow/inductor-micro-benchmark
Expand Down
2 changes: 0 additions & 2 deletions .github/requirements/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@ The list of support files are as follows:
conda environment
* conda-env-macOS-ARM64. This is used by MacOS (m1, arm64) build and
test jobs to setup the conda environment
* conda-env-macOS-X64. This is use by MacOS (x86-64) build and test
jobs to setup the conda environment
* conda-env-Linux-X64. This is used by Linux buck build and test jobs
to setup the conda environment
* Pip:
Expand Down
4 changes: 2 additions & 2 deletions .github/requirements/conda-env-Linux-X64.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@ mkl-include=2022.1.0
ninja=1.10.2
numpy=1.23.3
pyyaml=6.0
setuptools=68.2.2
typing-extensions=4.9.0
setuptools=72.1.0
typing-extensions=4.11.0
2 changes: 1 addition & 1 deletion .github/requirements/conda-env-iOS.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ cmake=3.22.1
ninja=1.10.2
numpy=1.23.3
pyyaml=6.0
setuptools=68.2.2
setuptools=72.1.0
typing-extensions=4.11.0
4 changes: 2 additions & 2 deletions .github/requirements/conda-env-macOS-ARM64
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
numpy=1.22.3
pyyaml=6.0
setuptools=61.2.0
setuptools=72.1.0
cmake=3.22.*
typing-extensions=4.9.0
typing-extensions=4.11.0
dataclasses=0.8
pip=22.2.2
pillow=10.0.1
Expand Down
16 changes: 0 additions & 16 deletions .github/requirements/conda-env-macOS-X64

This file was deleted.

Loading

0 comments on commit fb463e3

Please sign in to comment.