Documentation for LUMI - fineweb ablation studies

Config and slurm script for Fineweb-edu ablation study replication
.gitignore
Vmjkom committed Sep 24, 2024
1 parent 238c50d commit bab1090
Showing 4 changed files with 108 additions and 36 deletions.
8 changes: 6 additions & 2 deletions .gitignore
@@ -168,5 +168,9 @@ logs/*.out
 hostfiles/*
 pythonuserbase
 core
-tools/data_preprocess-8035567.out
-tools/data_preprocess-8035567.err
+
+#Logs
+*.out
+*.err
+slurm_scripts/debug.sh
+configs/debug.yaml
81 changes: 69 additions & 12 deletions README.md
@@ -31,23 +31,80 @@ Nanotron is a library for pretraining transformer models.
- **Simplicity**: Nanotron is designed to be easy to use. It provides a simple and flexible API to pretrain models on custom datasets.
- **Performance**: Optimized for speed and scalability, Nanotron uses the latest techniques to train models faster and more efficiently.

## Installation

# LUMI
## Setup
You can do the following on a login node, as all of the GPU-related installations are already provided by the module/container we use.
```bash
# Requirements: Python>=3.10
git clone https://github.com/huggingface/nanotron
cd nanotron
module purge

# Get access to the csc provided modules
module use /appl/local/csc/modulefiles #Consider adding this to your .bashrc or .profile
module load pytorch/2.4 #As of 24.9.2024 this is the latest; previous versions probably won't work

#Right now we use a naughty virtual environment, but expect this to change to a fully containerized environment

python3 -m venv .venv --system-site-packages
source .venv/bin/activate
pip install --upgrade pip
pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
pip install -e .
pip install -e .[nanosets]
```
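As an optional sanity check (run inside the activated venv), the following should import the Torch build you just set up; note that login nodes have no GPUs, so the device count will be 0 there:
```bash
# Verify the venv resolves a working torch install
python3 -c "import torch; print(torch.__version__)"
# 0 on a login node; should be 8 on a compute node allocated with --gpus-per-node=mi250:8
python3 -c "import torch; print(torch.cuda.device_count())"
```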
## Data
If your dataset is available in Hugging Face format, you can set it in your .yaml config file like so:
```yaml
data_stages:
- data:
    dataset:
      dataset_overwrite_cache: false
      dataset_processing_num_proc_per_process: 7
      hf_dataset_config_name: null
      hf_dataset_or_datasets:
        roneneldan/TinyStories: 0.5
      hf_dataset_splits: train
      text_column_name: text
    num_loading_workers: 0
    seed: 42
  name: Stable Training Stage
  start_training_step: 1

```
### Preprocess
Larger datasets can be preprocessed with [`/tools/preprocess_data.py`](/tools/preprocess_data.py), a script that reads in and processes a large dataset in various ways, in a parallel fashion. This is done with the [`datatrove`](https://github.com/huggingface/datatrove) library.

These preprocessed datasets are called "nanosets" and are configured in the yaml file a little differently:
```yaml
data_stages:
- data:
    dataset:
      dataset_folder: /scratch/project_462000353/data/nanosets/fineweb-edu/350BT
    num_loading_workers: 7
    seed: 42
  name: Stable Training Stage
  start_training_step: 1
```

```bash
# Install dependencies if you want to use the example scripts
pip install datasets transformers
pip install triton "flash-attn>=2.5.0" --no-build-isolation
```
> [!NOTE]
> If you get `undefined symbol: ncclCommRegister` error you should install torch 2.1.2 instead: `pip install torch==2.1.2 --index-url https://download.pytorch.org/whl/cu121`
More info for these is in [`/tools/nanoset.md`](/tools/nanoset.md).
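
For orientation, a preprocessing run follows roughly the pattern below. This is a sketch, not a verified command line: the flag names follow [`/tools/nanoset.md`](/tools/nanoset.md) and may differ in your checkout, and the tokenizer, output folder and task count are placeholders, so check `python3 tools/preprocess_data.py --help` before running.
```bash
# Hypothetical sketch - verify the exact flags with: python3 tools/preprocess_data.py --help
python3 tools/preprocess_data.py \
    --tokenizer-name-or-path HuggingFaceFW/ablation-model-fineweb-edu \
    --output-folder /scratch/project_462000353/data/nanosets/my-dataset \
    --n-tasks 16 \
    hf --dataset HuggingFaceFW/fineweb-edu
```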

See all of the dataset-related configuration parameters in [`config.py`](/src/nanotron/config/config.py).

## Fineweb ablations
If you wish to do pretraining for a FineWeb-like ablation study, follow these steps:
1. Modify the [`llama_2B.yaml`](/configs/llama_2B.yaml) config file to point to your own datasets, checkpoint directories etc. The model parameters should be left untouched if you want to replicate the 1.82B Llama model Hugging Face used.
2. Modify the [`slurm_script`](/slurm_scripts/train.sh) to use your config file: ``export CONFIG=$DIR/configs/llama_2B.yaml``
3. Launch training:
```bash
sbatch ./slurm_scripts/train.sh

#Or for quick debugging launch an interactive session with salloc
#PARAMS: 2 nodes, 30 minutes run time, job name
./slurm_scripts/interactive.sh 2 00:30:00 debug-nanotron

#And then to launch after your resources have been allocated
./slurm_scripts/train.sh
```
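Once the job has been submitted, two handy commands (the log paths come from the `logs/latest.*` symlinks that `train.sh` creates):
```bash
# Show the state of your queued/running jobs
squeue --me
# Follow the most recent training logs
tail -f logs/latest.out logs/latest.err
```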
## TODO
- [ ] Implement [`lighteval`](/src/nanotron/config/lighteval_config.py) into the pretraining
- [ ] Others?
# End of LUMI-specific README
> [!TIP]
> We log to wandb automatically if it's installed. For that you can use `pip install wandb`. If you don't want to use wandb, you can run `wandb disabled`.
25 changes: 12 additions & 13 deletions configs/llama_2B.yaml
@@ -1,15 +1,14 @@
 checkpoints:
-checkpoint_interval: 50
+checkpoint_interval: 2000
 checkpoints_path: /scratch/project_462000353/villekom/checkpoints/nanotron
 checkpoints_path_is_shared_file_system: true
 resume_checkpoint_path: null
 save_initial_state: false
 data_stages:
 - data:
 dataset:
-#dataset_folder: /scratch/project_462000353/data/fineweb/nanosets
-hf_dataset_or_datasets: stas/openwebtext-10k
-num_loading_workers: 56
+dataset_folder: /scratch/project_462000353/data/nanosets/fineweb-edu/350BT
+num_loading_workers: 7
 seed: 42
 name: Stable Training Stage
 start_training_step: 1
@@ -23,9 +22,9 @@ general:
 step: null
 lighteval: null
 logging:
-iteration_step_info_interval: 1
+iteration_step_info_interval: 10
 log_level: info
-log_level_replica: info
+log_level_replica: warning
 model:
 ddp_bucket_cap_mb: 25
 dtype: bfloat16
@@ -67,10 +66,10 @@ optimizer:
 adam_eps: 1.0e-08
 name: adamW
 torch_adam_is_fused: true
-weight_decay: 0.01
+weight_decay: 0.1
 zero_stage: 0
 parallelism:
-dp: 16
+dp: 64 #This should equal the total number of GPUs when tp and pp == 1
 expert_parallel_size: 1
 pp: 1
 pp_engine: 1f1b
@@ -80,13 +79,13 @@ parallelism:
 profiler: null
 tokenizer:
 tokenizer_max_length: null
-tokenizer_name_or_path: HuggingFaceFW/ablation-model-fineweb-v1
+tokenizer_name_or_path: HuggingFaceFW/ablation-model-fineweb-edu
 tokenizer_revision: null
 tokens:
-batch_accumulation_per_replica: 1
+batch_accumulation_per_replica: 4
 limit_test_batches: 0
 limit_val_batches: 0
-micro_batch_size: 1
+micro_batch_size: 4
 sequence_length: 2048
-train_steps: 10
-val_check_interval: -1
+train_steps: 167000 #~350B tokens at the ~2M-token global batch implied by the settings above
+val_check_interval: 100
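
As a quick sanity check of the token budget these settings imply (tokens per optimizer step = dp × micro_batch_size × batch_accumulation_per_replica × sequence_length):
```bash
# 64 * 4 * 4 * 2048 tokens per optimizer step
python3 -c "print(64 * 4 * 4 * 2048)"                  # 2097152, i.e. ~2.1M tokens/step
# multiplied by 167000 steps
python3 -c "print(167000 * 64 * 4 * 4 * 2048 / 1e9)"   # ~350.2, i.e. ~350B tokens total
```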
30 changes: 21 additions & 9 deletions slurm_scripts/train.sh
@@ -1,14 +1,14 @@
 #!/bin/bash
-#SBATCH -J debug_nanotron_LLAMA2_7B_2N
+#SBATCH -J nanotron_LLAMA2_fw-edu-350BT_8N
 #SBATCH --cpus-per-task=7
-#SBATCH --nodes=2
+#SBATCH --nodes=8
 #SBATCH --ntasks-per-node=8
 #SBATCH --mem=480G
-#SBATCH --partition=dev-g
-#SBATCH --time=00:30:00
+#SBATCH --partition=standard-g
+#SBATCH --time=12:00:00
 #SBATCH --gpus-per-node=mi250:8
 #SBATCH --exclusive
-#SBATCH --account=project_462000615
+#SBATCH --account=project_462000353
 #SBATCH --output=logs/%x-%j.out
 #SBATCH --error=logs/%x-%j.err

@@ -18,27 +18,39 @@ set -eox pipefail
 ln -f -s $SLURM_JOB_NAME-$SLURM_JOB_ID.out logs/latest.out
 ln -f -s $SLURM_JOB_NAME-$SLURM_JOB_ID.err logs/latest.err
 
-DIR="/projappl/project_462000353/villekom/nanotron"
+DIR="/projappl/project_462000353/villekom/nanotron" #Change this
 
 #MODULES
 module purge
 ml use /appl/local/csc/modulefiles/
 ml pytorch/2.4
 
+#At the moment we use a venv built on top of the pytorch module. Expect this to change to a fully containerized setup.
 #Activate bin
 source $DIR/.venv/bin/activate
 export PYTHONPATH=/projappl/project_462000353/villekom/nanotron/.venv/lib/python3.10/site-packages
 echo "NGPUS" $SLURM_GPUS_ON_NODE
-export NCCL_IFNAME=hsn
-export CUDA_DEVICE_MAX_CONNECTIONS=1
 
+#NETWORK
+export NCCL_IFNAME=hsn #This makes sure RCCL selects the fastest network interface
+export CUDA_DEVICE_MAX_CONNECTIONS=1 #Needed for some of nanotron's parallelism techniques
+
+#DISTRIBUTED
 export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
 export MASTER_PORT=9999
 
+#LOGGING
+#export NANOTRON_BENCHMARK=1 #Logs throughput
+export PYTHONWARNINGS="ignore" #Get rid of verbosity
+
+#export HF_TOKEN="TOKEN HERE FROM HF"
 export CONFIG=$DIR/configs/llama_2B.yaml
 
+#Debugging
+#export NCCL_DEBUG=INFO
+
 #Masks for binding cpu cores to the right numa nodes and therefore to the right gpus
-#c=fe
+c=fe
 MYMASKS="0x${c}000000000000,0x${c}00000000000000,0x${c}0000,0x${c}000000,0x${c},0x${c}00,0x${c}00000000,0x${c}0000000000"
 
 echo "START: $(date)"
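
On the CPU mask: `0xfe` is `0b11111110`, i.e. 7 of the 8 cores in each L3 group with the group's first core left free, which matches `--cpus-per-task=7` above; the eight masks are ordered so each rank is pinned next to the NUMA node of its GPU. The launch line that consumes `$MYMASKS` is collapsed in this diff; on LUMI such masks are typically handed to `srun` via `--cpu-bind=mask_cpu`.
```bash
# Illustrative decode of the per-rank mask nibbles
python3 -c "print(bin(0xfe), bin(0xfe).count('1'))"   # 0b11111110 7
```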
