Merge branch 'refs/heads/main' into mingyuanm/flux_controlnet_sharded_dict

# Conflicts:
#	nemo/collections/diffusion/data/diffusion_mock_datamodule.py
#	nemo/collections/diffusion/models/flux/model.py
#	nemo/collections/llm/peft/api.py
#	nemo/lightning/_strategy_lib.py
#	nemo/lightning/megatron_parallel.py
#	scripts/dit/dit_train.py
#	scripts/flux/flux_controlnet_infer.py
#	scripts/flux/flux_controlnet_training.py
#	scripts/flux/flux_infer.py
#	scripts/flux/flux_training.py
Victor49152 committed Jan 21, 2025
2 parents 510c16c + 066e4b4 commit abede80
Showing 142 changed files with 2,563 additions and 672 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/_test_template.yml
@@ -47,7 +47,7 @@ jobs:
steps:
- name: Docker system cleanup
run: |
-docker system prune -a --filter "until=48h" --force || true
+docker system prune -af --filter "until=24h" --force || true
- name: Docker pull image
run: |
97 changes: 87 additions & 10 deletions .github/workflows/cicd-main.yml
@@ -2937,7 +2937,7 @@ jobs:
with:
RUNNER: self-hosted-azure-gpus-2-h100
SCRIPT: |
-CUDA_DEVICE_MAX_CONNECTIONS=1 NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=1 python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
+CUDA_DEVICE_MAX_CONNECTIONS=1 python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.max_epochs=9999 \
@@ -2965,6 +2965,7 @@ jobs:
+model.tp_comm_overlap_ag=False \
+model.tp_comm_overlap_rs=False \
+model.tp_comm_overlap_disable_qkv=True \
++model.attention_backend="unfused" \
model.peft.peft_scheme="lora" \
model.peft.lora_tuning.adapter_dim=16 \
model.peft.lora_tuning.alpha=32 \
@@ -3622,7 +3623,7 @@ jobs:
with:
RUNNER: self-hosted-azure-gpus-1
SCRIPT: |
-TRANSFORMERS_OFFLINE=1 python tests/collections/vlm/hf/peft.py --model /home/TestData/vlm/qwen2-2b/ --max-steps 3 --disable-ckpt
+TRANSFORMERS_OFFLINE=1 python tests/collections/vlm/hf/peft_hf.py --model /home/TestData/vlm/qwen2-2b/ --max-steps 3
AFTER_SCRIPT: |
rm -rf nemo_experiments
@@ -3633,16 +3634,17 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
-TRANSFORMERS_OFFLINE=1 python tests/collections/vlm/hf/peft.py --model /home/TestData/vlm/qwen2-2b/ --max-steps 3 --disable-ckpt --strategy fsdp --devices 2
+TRANSFORMERS_OFFLINE=1 python tests/collections/vlm/hf/peft_hf.py --model /home/TestData/vlm/qwen2-2b/ --max-steps 3 --strategy fsdp --devices 2
AFTER_SCRIPT: |
rm -rf nemo_experiments
L2_VLM_HF_Transformer_PEFT_4bit:
needs: [ cicd-test-container-setup ]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_VLM_HF_Transformer_PEFT_4bit') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure-gpus-1
SCRIPT: |
-TRANSFORMERS_OFFLINE=1 python tests/collections/vlm/hf/peft.py --model /home/TestData/vlm/qwen2-2b/ --max-steps 3 --disable-ckpt --use-4bit
+TRANSFORMERS_OFFLINE=1 python tests/collections/vlm/hf/peft_hf.py --model /home/TestData/vlm/qwen2-2b/ --max-steps 3 --use-4bit
AFTER_SCRIPT: |
rm -rf nemo_experiments
@@ -3653,7 +3655,7 @@ jobs:
with:
RUNNER: self-hosted-azure-gpus-1
SCRIPT: |
-TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/peft.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10 --disable-ckpt
+TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/peft_hf.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10
AFTER_SCRIPT: |
rm -rf nemo_experiments
@@ -3675,7 +3677,7 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
-TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/peft.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10 --devices 2 --strategy ddp --disable-ckpt
+TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/peft_hf.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10 --devices 2 --strategy ddp --disable-ckpt
AFTER_SCRIPT: |
rm -rf nemo_experiments
@@ -4328,11 +4330,24 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
-python tests/collections/vlm/neva_train.py \
+python tests/collections/vlm/test_neva_train.py \
--devices=1 \
--max-steps=5 \
--experiment-dir=/tmp/nemo2_neva_results/${{ github.run_id }}
+L2_NeMo_2_NEVA_MOCK_PACKED_TRAINING:
+needs: [cicd-test-container-setup]
+uses: ./.github/workflows/_test_template.yml
+if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_NEVA_MOCK_PACKED_TRAINING') || needs.cicd-test-container-setup.outputs.all == 'true'
+with:
+RUNNER: self-hosted-azure
+SCRIPT: |
+python tests/collections/vlm/test_neva_train.py \
+--devices=1 \
+--max-steps=5 \
+--experiment-dir=/tmp/nemo2_neva_results/${{ github.run_id }} \
+--use_packed_sequence
L2_NeMo_2_MLLAMA_MOCK_TRAINING:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
@@ -4341,7 +4356,7 @@
RUNNER: self-hosted-azure
SCRIPT: |
TRANSFORMERS_OFFLINE=1 \
-python tests/collections/vlm/mllama_train.py \
+python tests/collections/vlm/test_mllama_train.py \
--devices=1 \
--max-steps=5 \
--experiment-dir=/tmp/nemo2_mllama_results/${{ github.run_id }}
@@ -4353,7 +4368,7 @@
with:
RUNNER: self-hosted-azure
SCRIPT: |
-NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python3 tests/collections/llm/megatron_mixtral_pretraining.py \
+python3 tests/collections/llm/megatron_mixtral_pretraining.py \
--experiment-dir=/tmp/mixtral_pretrain_results \
--data-path=/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document
@@ -4870,6 +4885,35 @@ jobs:
rm -rf /tmp/nemo2_ckpt
rm -rf /tmp/nemo2_ptq_engine
+L2_NeMo_2_Export_In_Framework:
+needs: [cicd-test-container-setup]
+uses: ./.github/workflows/_test_template.yml
+if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_Export_In_Framework') || needs.cicd-test-container-setup.outputs.all == 'true'
+with:
+RUNNER: self-hosted-azure
+SCRIPT: |
+python tests/collections/llm/test_hf_import.py \
+--hf_model /home/TestData/nlp/megatron_llama/llama-ci-hf \
+--output_path /tmp/nemo2_ckpt
+python tests/setup/data/create_sample_lambada.py \
+--output_file /tmp/lambada.json
+python tests/export/nemo_export.py \
+--model_name test \
+--model_type llama \
+--checkpoint_dir /tmp/nemo2_ckpt \
+--min_tps 1 \
+--in_framework True \
+--test_deployment True \
+--run_accuracy True \
+--test_data_path /tmp/lambada.json \
+--accuracy_threshold 0.0 \
+--debug
+AFTER_SCRIPT: |
+rm -rf /tmp/nemo2_ckpt /tmp/lambada.json
L2_NeMo_2_LLAVA_NEXT_MOCK_TRAINING:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
@@ -4885,6 +4929,36 @@
AFTER_SCRIPT: |
rm -rf /tmp/nemo2_llava_next_results
+L2_NeMo_2_VLLM_EXPORT:
+needs: [cicd-test-container-setup]
+uses: ./.github/workflows/_test_template.yml
+if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_VLLM_EXPORT') || needs.cicd-test-container-setup.outputs.all == 'true'
+with:
+RUNNER: self-hosted-azure
+SCRIPT: |
+python tests/setup/models/create_hf_model.py \
+--model_name_or_path /home/TestData/nlp/megatron_llama/llama-ci-hf \
+--output_dir /tmp/llama_head64 \
+--config_updates "{\"hidden_size\": 512, \"num_attention_heads\": 4, \"numx_hidden_layers\": 2, \"num_key_value_heads\": 4, \"intermediate_size\": 1024, \"head_dim\": 128, \"num_hidden_layers\": 2, \"torch_dtype\": \"float16\" }"
+python tests/collections/llm/test_hf_import.py --hf_model /tmp/llama_head64 --output_path /tmp/nemo2_ckpt
+/opt/venv/bin/python tests/export/nemo_export.py \
+--min_tps 1 \
+--max_tps 1 \
+--use_vllm True \
+--model_type llama \
+--max_output_len 128 \
+--test_deployment True \
+--model_name nemo2_ckpt \
+--model_dir /tmp/vllm_from_nemo2 \
+--checkpoint_dir /tmp/nemo2_ckpt
+AFTER_SCRIPT: |
+rm -rf /tmp/llama_head64
+rm -rf /tmp/nemo2_ckpt
+rm -rf /tmp/vllm_from_nemo2
Nemo_CICD_Test:
needs:
- pre-flight
@@ -5000,6 +5074,7 @@ jobs:
- Speech_Checkpoints_tests
- L2_Stable_Diffusion_Training
- L2_NeMo_2_NEVA_MOCK_TRAINING
+- L2_NeMo_2_NEVA_MOCK_PACKED_TRAINING
- L2_NeMo_2_MLLAMA_MOCK_TRAINING
- L2_NeMo_2_GPT_Pretraining_no_transformer_engine
- L2_NeMo_2_GPT_DDP_Param_Parity_check
@@ -5067,10 +5142,12 @@ jobs:
- L2_Megatron_GPT_Reranker
- L2_NeMo_2_NeMo_Mcore_Mixtral_bitexact
- L2_NeMo_2_PTQ_Llama2_FP8
+- L2_NeMo_2_Export_In_Framework
- L2_NeMo_2_jit_callback
- L2_NeMo_2_LLAVA_NEXT_MOCK_TRAINING
- L2_HF_Transformer_SFT_FSDP2_2gpu
- L2_HF_Transformer_SFT_2gpu_nemorun_fsdp2
+- L2_NeMo_2_VLLM_EXPORT
if: always()
runs-on: ubuntu-latest
steps:
83 changes: 33 additions & 50 deletions .github/workflows/import-test.yml
@@ -1,73 +1,56 @@
name: CI-Import-Check

on:
-push:
+pull_request:
+paths:
+- "**"

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

# Check https://hub.docker.com/r/pytorch/pytorch/tags for latest tags
jobs:

-test-asr-imports:
-runs-on: ubuntu-latest
-container:
-image: pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime
+test-imports:
+name: test-${{ matrix.collection }}-import-${{ matrix.os }}-py${{ matrix.python }}
+runs-on: ${{ matrix.os }}
+strategy:
+fail-fast: false
+matrix:
+os: [ubuntu-latest, macos-latest]
+collection:
+- asr
+# - nlp # Currently broken
+- tts
+python: ['3.10', '3.11', '3.12']
steps:
- name: Checkout repo
uses: actions/checkout@v2
-- name: Update base dependencies
-run: |
-apt-get update && apt-get install -y build-essential
-apt-get install -y libsndfile1 make
-- name: Install nemo dependencies
+- uses: actions/setup-python@v5
+with:
+python-version: '${{ matrix.python }}'
+- name: Build wheel
id: nemo-wheel
run: |
pip install Cython
# install test requirements
pip install -r requirements/requirements_test.txt
# Build nemo as a wheel
pip install build
-python -m build --no-isolation --wheel
+python -m build --wheel
# Preserve wheel location
DIST_FILE=$(find ./dist -name "*.whl" | head -n 1)
-echo "::set-output name=DIST_FILE::${DIST_FILE}"
-- name: Test ASR Domain Imports
-run: |
-# Install NeMo Domain
-pip install "${{ steps.nemo-wheel.outputs.DIST_FILE }}[asr]"
-# Run import checks
-python tests/core_ptl/check_imports.py --domain "asr"
-# Uninstall NeMo
-pip uninstall -y nemo_toolkit
-test-tts-imports:
-runs-on: ubuntu-latest
-container:
-image: pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime
-steps:
-- name: Checkout repo
-uses: actions/checkout@v2
-- name: Update base dependencies
+echo "DIST_FILE=${DIST_FILE}" | tee -a "$GITHUB_OUTPUT"
+- name: Install NeMo + test dependencies
run: |
-apt-get update && apt-get install -y build-essential
-apt-get install -y libsndfile1 make
-- name: Install nemo dependencies
-id: nemo-wheel
-run: |
-pip install Cython
-# install test requirements
-pip install -r requirements/requirements_test.txt
-# Build nemo as a wheel
-pip install build
-python -m build --no-isolation --wheel
-# Preserve wheel location
-DIST_FILE=$(find ./dist -name "*.whl" | head -n 1)
-echo "::set-output name=DIST_FILE::${DIST_FILE}"
-- name: Test TTS Domain Imports
-run: |
-# Install NeMo Domain
-pip install "${{ steps.nemo-wheel.outputs.DIST_FILE }}[tts]"
+pip install --no-cache-dir "${{ steps.nemo-wheel.outputs.DIST_FILE }}[test,${{ matrix.collection }}]"
+- name: Run ${{ matrix.collection }} checks
+run: |
# Run import checks
-python tests/core_ptl/check_imports.py --domain "tts"
-# Uninstall NeMo
-pip uninstall -y nemo_toolkit
+python tests/core_ptl/check_imports.py --domain "${{ matrix.collection }}"

4 changes: 2 additions & 2 deletions .github/workflows/mcore-tag-bump-bot.yml
@@ -13,8 +13,8 @@ jobs:
source-ref: main
build-arg: MCORE_TAG
dockerfile: Dockerfile.ci
-base-branch: main
+base-branch: weekly-bump
cicd-label: Run CICD
pr-reviewers: 'pablo-garay'
secrets:
-PAT: ${{ secrets.PAT }}
\ No newline at end of file
+PAT: ${{ secrets.PAT }}