
Commit

Merge main
loadams committed Feb 7, 2025
2 parents 360712f + f4157be commit 73a7b40
Showing 38 changed files with 51,560 additions and 251 deletions.
4 changes: 2 additions & 2 deletions examples_deepspeed/finetune_hf_llama/README.md
@@ -10,9 +10,9 @@ The pre-trained weights can be found at [Hugging Face - LLAMA-7B](https://huggin

#### 1. Converting Hugging Face Model Weights to Megatron-Deepspeed Model
```bash
-bash examples_deepspeed/finetune_hf_llama/finetune_llama.sh convert
+bash examples_deepspeed/finetune_hf_llama/finetune_llama.sh convert_hf2mds
```
-This command writes the Hugging Face model weights into the Megatron-Deepspeed model and saves it. You can adjust the parallel configuration in the script.
+This command writes the Hugging Face model weights into the Megatron-Deepspeed model and saves it. You can adjust the parallel configuration in the script. ```convert_mds2hf``` can convert a Megatron-Deepspeed model into the Hugging Face format.
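For the reverse direction mentioned above, the same wrapper script can be invoked with the `convert_mds2hf` argument; a usage sketch based on the argument handling added to `finetune_llama.sh` in this commit:

```bash
bash examples_deepspeed/finetune_hf_llama/finetune_llama.sh convert_mds2hf
```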

#### 2. Fine-tuning Process
```bash
5 changes: 5 additions & 0 deletions examples_deepspeed/finetune_hf_llama/ds_config_empty.json
@@ -0,0 +1,5 @@
+{
+  "train_batch_size" : 256,
+  "train_micro_batch_size_per_gpu": 16,
+  "steps_per_print": 100
+}
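For reference, DeepSpeed expects `train_batch_size` to equal `train_micro_batch_size_per_gpu × gradient_accumulation_steps × data-parallel world size`. With the values above that implies 16 data-parallel ranks and no gradient accumulation; that ratio is an illustrative assumption, not something this config file states:

```bash
# Hypothetical sanity check of the DeepSpeed batch-size identity:
#   train_batch_size = micro_batch_per_gpu * grad_accum_steps * dp_world_size
# Assuming 16 GPUs (DP=16) and grad_accum_steps=1:
echo $((16 * 1 * 16))   # prints 256, matching "train_batch_size" above
```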
33 changes: 26 additions & 7 deletions examples_deepspeed/finetune_hf_llama/finetune_llama.sh
@@ -1,8 +1,8 @@
DS_CONFIG=./examples_deepspeed/finetune_hf_llama/ds_config.json
-DATASET_PATH=./alpaca_data.json
+DATASET_PATH=./examples_deepspeed/finetune_hf_llama/alpaca_data.json
# dataset link: https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json

-HF_LLAMA_PATH=/data/llama-7b/
+HF_LLAMA_PATH=/data/llama-2-7b-hf/
# weights link: https://huggingface.co/huggyllama/llama-7b

MICRO_BATCH_SIZE=16
@@ -43,12 +43,28 @@ cat <<EOT > $DS_CONFIG
}
EOT

if [ "$1" = "convert_hf2mds" ]; then
DS_CONFIG_PATH="./examples_deepspeed/finetune_hf_llama/ds_config_empty.json"
elif [ "$1" = "convert_mds2hf" ]; then
DS_CONFIG_PATH="./examples_deepspeed/finetune_hf_llama/ds_config_empty.json"
else
DS_CONFIG_PATH="./examples_deepspeed/finetune_hf_llama/ds_config.json"
fi

-covert_args="deepspeed tools/hf2megads_weight_converter.py \
+covert_hf2mds_args="deepspeed tools/hf2megads_weight_converter.py \
--hf-ckpt-num-shards 2 \
---origin-hf-ckpt-dir $HF_LLAMA_PATH \
+--hf-ckpt-dir $HF_LLAMA_PATH \
+--load-mode auto \
--save $MEGA_DS_LLAMA_PATH"

+covert_mds2hf_args="deepspeed tools/hf2megads_weight_converter.py \
+--hf-ckpt-num-shards 2 \
+--hf-ckpt-dir $HF_LLAMA_PATH \
+--load-mode auto \
+--to-hf-ckpt \
+--load $MEGA_DS_LLAMA_PATH \
+--save $HF_LLAMA_PATH'-hf-out' "

finetune_args="deepspeed finetune_llama.py \
--load $MEGA_DS_LLAMA_PATH"

@@ -60,6 +76,7 @@ comm_args="--tensor-model-parallel-size $TP \
--num-layers $NUM_LAYERS \
--hidden-size $HIDDEN_SIZE \
--num-attention-heads $NUM_HEADS \
+--finetune \
--ffn-hidden-size $FFN_HIDDEN_SIZE \
--attention-dropout 0 \
--hidden-dropout 0 \
@@ -88,7 +105,7 @@ comm_args="--tensor-model-parallel-size $TP \
--zero-stage 0 \
--tokenizer-type HFTokenizer \
--tokenizer-model $HF_LLAMA_PATH \
---deepspeed_config ./examples_deepspeed/finetune_hf_llama/ds_config.json \
+--deepspeed_config $DS_CONFIG_PATH \
--deepspeed \
--distributed-backend nccl \
--num-workers 0 \
@@ -98,8 +115,10 @@ comm_args="--tensor-model-parallel-size $TP \
--no-gradient-accumulation-fusion \
--repeated-dataloader"

if [ "$1" = "convert" ]; then
task_args="$covert_args"
if [ "$1" = "convert_hf2mds" ]; then
task_args="$covert_hf2mds_args"
elif [ "$1" = "convert_mds2hf" ]; then
task_args="$covert_mds2hf_args"
else
task_args="$finetune_args"
fi
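Taken together, the dispatch on `$1` above implies a three-step workflow. The following is a sketch, not part of the commit itself, and assumes the paths configured at the top of this script:

```bash
# 1. Convert the Hugging Face checkpoint into a Megatron-DeepSpeed checkpoint.
bash examples_deepspeed/finetune_hf_llama/finetune_llama.sh convert_hf2mds

# 2. Fine-tune; with no argument the script falls through to the finetune_args branch.
bash examples_deepspeed/finetune_hf_llama/finetune_llama.sh

# 3. Convert the fine-tuned checkpoint back to Hugging Face format.
#    Per the --save flag above, the result is written next to $HF_LLAMA_PATH with an '-hf-out' suffix.
bash examples_deepspeed/finetune_hf_llama/finetune_llama.sh convert_mds2hf
```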
