From cf91ad2711632c7b11abb33ef235a81dfb381189 Mon Sep 17 00:00:00 2001
From: Sebastian Raschka
Date: Thu, 19 Oct 2023 18:31:13 -0500
Subject: [PATCH] Wrap long command line arguments in redpajama docs (#655)

---
 tutorials/pretrain_redpajama.md | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/tutorials/pretrain_redpajama.md b/tutorials/pretrain_redpajama.md
index 700f5ec..3b03f6d 100644
--- a/tutorials/pretrain_redpajama.md
+++ b/tutorials/pretrain_redpajama.md
@@ -40,7 +40,8 @@ git clone https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T dat
 
 ```bash
 # The 1 billion token subset
-git clone https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T-Sample data/RedPajama-Data-1T-Sample
+git clone https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T-Sample \
+  data/RedPajama-Data-1T-Sample
 ```
 
 ## Prepare RedPajama for training
@@ -52,19 +53,28 @@ streaming dataset that comes with lit-gpt. You will need to have the tokenizer c
 
 ```bash
 pip install huggingface_hub sentencepiece
-python scripts/download.py --repo_id meta-llama/Llama-2-7b-chat-hf --access_token your_hf_token
+python scripts/download.py \
+  --repo_id meta-llama/Llama-2-7b-chat-hf \
+  --access_token your_hf_token
 ```
 
 Then, run
 
 ```bash
-python scripts/prepare_redpajama.py --source_path data/RedPajama-Data-1T --checkpoint_dir checkpoints/meta-llama/Llama-2-7b-hf/ --destination_path data/lit-redpajama
+python scripts/prepare_redpajama.py \
+  --source_path data/RedPajama-Data-1T \
+  --checkpoint_dir checkpoints/meta-llama/Llama-2-7b-hf/ \
+  --destination_path data/lit-redpajama
 ```
 
 or
 
 ```bash
-python scripts/prepare_redpajama.py --source_path data/RedPajama-Data-1T-Sample --checkpoint_dir checkpoints/meta-llama/Llama-2-7b-hf/ --destination_path data/lit-redpajama-sample --sample True
+python scripts/prepare_redpajama.py \
+  --source_path data/RedPajama-Data-1T-Sample \
+  --checkpoint_dir checkpoints/meta-llama/Llama-2-7b-hf/ \
+  --destination_path data/lit-redpajama-sample \
+  --sample True
 ```
 
 for the sample dataset.
@@ -78,13 +88,17 @@ The script will take a while to run, so time for :tea: (The 1B sample script tak
 Running the pretraining script with its default settings requires at least 4 GPUs with 40GB+ each (A100).
 
 ```bash
-python pretrain/redpajama.py --devices 4 --train_data_dir data/lit-redpajama
+python pretrain/redpajama.py \
+  --devices 4 \
+  --train_data_dir data/lit-redpajama
 ```
 
 For running on the sample dataset:
 
 ```bash
-python pretrain/redpajama.py --devices 4 --train_data_dir data/lit-redpajama-sample
+python pretrain/redpajama.py \
+  --devices 4 \
+  --train_data_dir data/lit-redpajama-sample
 ```
 
 The script will save checkpoints periodically to the folder `out/`.
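The Hugging Face dataset repositories cloned above store their files as Git LFS objects, so the `git clone` commands only retrieve real data when Git LFS is active. A minimal pre-flight check, assuming the `git-lfs` package is already installed on the machine:

```bash
# Enable the Git LFS hooks so the clone fetches the actual data
# files rather than tiny pointer stubs.
git lfs install

# After cloning, a rough sanity check that real data came down;
# a directory holding only LFS pointer stubs would be a few KB.
du -sh data/RedPajama-Data-1T-Sample
```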
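Once `prepare_redpajama.py` finishes, the destination directory should contain the tokenized shards that the streaming dataset reads during training. A quick way to confirm the preparation actually produced output; the exact shard names come from the prepare script, so treat this as a sketch:

```bash
# The prepared corpus is written as packed binary shards;
# an empty listing here means the preparation step failed.
ls -lh data/lit-redpajama-sample
```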
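Since the default pretraining configuration expects four GPUs with 40GB+ of memory each, it can save a failed launch to confirm what hardware is visible first. A standard check with NVIDIA tooling, assuming the NVIDIA driver is installed:

```bash
# List each visible GPU with its total memory; expect four
# entries reporting roughly 40 GB (40960 MiB) or more apiece.
nvidia-smi --query-gpu=name,memory.total --format=csv
```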