diff --git a/README.md b/README.md
index 1bd454d4..ece916a7 100644
--- a/README.md
+++ b/README.md
@@ -120,6 +120,47 @@ We provide generations for the models described in the paper for both OA and Vic
 
 Can you distinguish ChatGPT from Guanaco? Give it a try! You can access [the model response Colab here](https://colab.research.google.com/drive/1kK6xasHiav9nhiRUJjPMZb4fAED4qRHb?usp=sharing) comparing ChatGPT and Guanaco 65B on Vicuna prompts.
 
+## Checkpoint export (`export_hf_checkpoint.py`)
+
+This script merges the LoRA adapter weights back into the base model and
+exports the result in Hugging Face format.
+- Example:
+```bash
+$ cd /mnt/e/PycharmProjects/qlora
+$ export BASE_MODEL=huggyllama/llama-30b
+$ export LORA_MODEL=/mnt/e/PycharmProjects/qlora/output/guanaco-33b/checkpoint-1500/adapter_model
+$ export HF_CHECKPOINT=/mnt/e/PycharmProjects/qlora/output/guanaco-33b/hf
+
+$ python export_hf_checkpoint.py
+CUDA SETUP: CUDA runtime path found: /home/hzx/.conda/envs/qlora/lib/libcudart.so.11.0
+CUDA SETUP: Highest compute capability among GPUs detected: 8.9
+CUDA SETUP: Detected CUDA version 118
+CUDA SETUP: Loading binary /home/hzx/.local/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so...
+Loading checkpoint shards: 100%|█████████████████████████████████████| 7/7 [00:02<00:00,  3.28it/s]
+
+
+```
+- To use the exported checkpoint directly, copy the three tokenizer files `special_tokens_map.json`, `tokenizer.model`, and `tokenizer_config.json` from the `checkpoint-xxxx` directory into `/mnt/e/PycharmProjects/qlora/output/guanaco-33b/hf`.
+
+- Final results:
+```bash
+$ ls -lrt /mnt/e/PycharmProjects/qlora/output/guanaco-33b/hf
+total 63533869
+-rwxrwxrwx 1 hzx hzx        727 Jun  4 17:26 tokenizer_config.json
+-rwxrwxrwx 1 hzx hzx         96 Jun  4 17:26 special_tokens_map.json
+-rwxrwxrwx 1 hzx hzx     499723 Jun  4 17:26 tokenizer.model
+-rwxrwxrwx 1 hzx hzx        607 Jun  5 22:45 config.json
+-rwxrwxrwx 1 hzx hzx        137 Jun  5 22:45 generation_config.json
+-rwxrwxrwx 1 hzx hzx 9818324627 Jun  5 22:45 pytorch_model-00001-of-00007.bin
+-rwxrwxrwx 1 hzx hzx 9869497721 Jun  5 22:46 pytorch_model-00002-of-00007.bin
+-rwxrwxrwx 1 hzx hzx 9896734097 Jun  5 22:46 pytorch_model-00003-of-00007.bin
+-rwxrwxrwx 1 hzx hzx 9719524707 Jun  5 22:46 pytorch_model-00004-of-00007.bin
+-rwxrwxrwx 1 hzx hzx 9869470481 Jun  5 22:46 pytorch_model-00005-of-00007.bin
+-rwxrwxrwx 1 hzx hzx 9869470445 Jun  5 22:47 pytorch_model-00006-of-00007.bin
+-rwxrwxrwx 1 hzx hzx 6015086981 Jun  5 22:47 pytorch_model-00007-of-00007.bin
+-rwxrwxrwx 1 hzx hzx      50084 Jun  5 22:47 pytorch_model.bin.index.json
+```
+
 ## Evaluation
 We include scripts adapted from the FastChat repo to automatically evaluate model generations using GPT-4. We include script for comparisons relative to ChatGPT with scores out of 10 as well as "pairwise comparisons" with three class labeling (win, loose, or tie). These are found in the `eval` folder.
 
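For a quick sanity check of the exported checkpoint, something along the following lines should work. This is a minimal sketch, not part of the repo: the output path and the prompt format are illustrative assumptions taken from the example above, and `device_map="auto"` additionally requires the `accelerate` package.

```python
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

# Example output directory from the walkthrough above (adjust to your setup).
HF_CHECKPOINT = "/mnt/e/PycharmProjects/qlora/output/guanaco-33b/hf"

# Assumes the tokenizer files have been copied into HF_CHECKPOINT as noted above.
tokenizer = LlamaTokenizer.from_pretrained(HF_CHECKPOINT)
model = LlamaForCausalLM.from_pretrained(
    HF_CHECKPOINT,
    torch_dtype=torch.float16,
    device_map="auto",  # needs `accelerate`; use device_map={"": "cpu"} to stay on CPU
)

# Guanaco-style prompt format (an assumption; match the template used in training).
prompt = "### Human: Why is the sky blue?\n### Assistant:"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```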
diff --git a/export_hf_checkpoint.py b/export_hf_checkpoint.py
new file mode 100644
index 00000000..fe34061c
--- /dev/null
+++ b/export_hf_checkpoint.py
@@ -0,0 +1,74 @@
+import os
+import torch
+from peft import PeftModel
+from transformers import LlamaForCausalLM, LlamaTokenizer
+'''
+Merge a LoRA adapter back into its base LLaMA model and export the merged
+weights as a standard Hugging Face checkpoint.
+
+Example:
+    cd /mnt/e/PycharmProjects/qlora
+    export BASE_MODEL=/mnt/e/PycharmProjects/qlora/scripts/llama-30b
+    export LORA_MODEL=/mnt/e/PycharmProjects/qlora/output/guanaco-33b/checkpoint-1500/adapter_model
+    export HF_CHECKPOINT=/mnt/e/PycharmProjects/qlora/output/guanaco-33b/hf
+    python export_hf_checkpoint.py
+    ls -lrt /mnt/e/PycharmProjects/qlora/output/guanaco-33b/hf
+'''
+BASE_MODEL = os.environ.get("BASE_MODEL", "huggyllama/llama-7b")
+LORA_MODEL = os.environ.get("LORA_MODEL", "/mnt/e/PycharmProjects/qlora/output/guanaco-33b/checkpoint-1500/adapter_model")
+HF_CHECKPOINT = os.environ.get("HF_CHECKPOINT", "/mnt/e/PycharmProjects/qlora/output/guanaco-33b/hf")
+DEVICE = os.environ.get("DEVICE", "cpu")
+
+assert (
+    BASE_MODEL
+), "Please specify a value for the BASE_MODEL environment variable, e.g. `export BASE_MODEL=huggyllama/llama-7b`"  # noqa: E501
+
+tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)
+
+# Load the base model in fp16 on the chosen device (CPU by default).
+base_model = LlamaForCausalLM.from_pretrained(
+    BASE_MODEL,
+    load_in_8bit=False,
+    torch_dtype=torch.float16,
+    device_map={"": DEVICE},
+)
+
+# Snapshot one base weight so we can verify later that merging changed it.
+first_weight = base_model.model.layers[0].self_attn.q_proj.weight
+first_weight_old = first_weight.clone()
+
+# Attach the LoRA adapter on top of the base model.
+lora_model = PeftModel.from_pretrained(
+    base_model,
+    LORA_MODEL,
+    device_map={"": DEVICE},
+    torch_dtype=torch.float16,
+)
+
+lora_weight = lora_model.base_model.model.model.layers[0].self_attn.q_proj.weight
+
+# Loading the adapter alone must not modify the base weights.
+assert torch.allclose(first_weight_old, first_weight)
+
+# Fold the LoRA weights into the base weights and drop the adapter wrappers.
+lora_model = lora_model.merge_and_unload()
+
+lora_model.train(False)
+
+# After merging, the snapshotted weight must differ from its original value.
+assert not torch.allclose(first_weight_old, first_weight)
+
+lora_model_sd = lora_model.state_dict()
+
+# Strip the PEFT "base_model.model." prefix and drop any remaining LoRA tensors
+# so the keys match a plain LlamaForCausalLM state dict.
+deloreanized_sd = {
+    k.replace("base_model.model.", ""): v
+    for k, v in lora_model_sd.items()
+    if "lora" not in k
+}
+
+# Save the merged weights as a sharded Hugging Face checkpoint.
+LlamaForCausalLM.save_pretrained(
+    base_model, HF_CHECKPOINT, state_dict=deloreanized_sd, max_shard_size="9900MB"
+)
\ No newline at end of file
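One possible simplification to consider: with recent `peft` releases, `merge_and_unload()` already returns the unwrapped base `LlamaForCausalLM`, so the manual `base_model.model.` key stripping may be unnecessary, and saving the tokenizer alongside the weights would remove the need to copy the tokenizer files by hand. A hedged sketch under that assumption (not verified against the exact library versions this repo pins):

```python
import os

import torch
from peft import PeftModel
from transformers import LlamaForCausalLM, LlamaTokenizer

# Same environment variables as export_hf_checkpoint.py above.
BASE_MODEL = os.environ.get("BASE_MODEL", "huggyllama/llama-7b")
LORA_MODEL = os.environ["LORA_MODEL"]        # adapter_model directory
HF_CHECKPOINT = os.environ["HF_CHECKPOINT"]  # export target directory

base = LlamaForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.float16)
merged = PeftModel.from_pretrained(base, LORA_MODEL).merge_and_unload()

# merge_and_unload() is assumed to hand back the plain base model, so it can be
# saved directly as a sharded Hugging Face checkpoint, tokenizer included.
merged.save_pretrained(HF_CHECKPOINT, max_shard_size="9900MB")
LlamaTokenizer.from_pretrained(BASE_MODEL).save_pretrained(HF_CHECKPOINT)
```

If the output differs from what `export_hf_checkpoint.py` produces, prefer the script above, which is what the directory listing in the README was generated with.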