#!/bin/bash
# run-allm-punc.sh
# Multi-node training is also supported (see the launch sketch below)
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export NCCL_DEBUG=INFO
export NCCL_SOCKET_IFNAME=eth1
export NCCL_IB_GID_INDEX=3
export NCCL_IB_SL=3
export NCCL_NET_GDR_READ=1
export MASTER_ADDR="${CHIEF_IP:=localhost}"
export MASTER_PORT="${MASTER_PORT:=29500}"
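# A minimal multi-node launch sketch (an assumption about the surrounding
# launcher, not part of this script: each node runs this file, and the
# launcher exports CHIEF_IP and INDEX; 10.0.0.1 is a placeholder address):
#   node 0: CHIEF_IP=10.0.0.1 INDEX=0 bash run-allm-punc.sh
#   node 1: CHIEF_IP=10.0.0.1 INDEX=1 bash run-allm-punc.sh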
rootpath=./
export HF_HOME=$rootpath/hf_cache
export TRANSFORMERS_CACHE=$rootpath/hf_cache
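# Both cache variables point at the same directory; newer transformers
# releases read HF_HOME and treat TRANSFORMERS_CACHE as deprecated, so
# setting both keeps old and new versions consistent.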
train_path=$rootpath/jianhuipang/gogollm/newmodels/run_allms.py
# model_path=$rootpath/jianhuipang/opensourcellms/llama2/Llama-2-7b-hf
model_path=$rootpath/jianhuipang/opensourcellms/llama2/Llama-2-13b-hf
# deepspeedpath=$rootpath/jianhuipang/LLMs4MT/train/deepspeed_config_zero2.json
deepspeedpath=$rootpath/jianhuipang/llama2_sft/train/deepspeed_config_bf16.json
datanamepath=$rootpath/jianhuipang/datasets/RedPajama-Data-1T-Sample
echo $datanamepath
model_save=$rootpath/models/checkpoints_ct/punc/allm-juhao-13b
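# "juhao" is pinyin for 句号, the Chinese full stop, i.e. the "." passed
# to --anchor_symbols below.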
# mkdir -p is idempotent, so no explicit existence check is needed
mkdir -p "$model_save"
# --training_data_num_lines $size \
# HOST_NUM is the number of participating nodes; set it to 1 for a single-node run
HOST_NUM=4
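# Effective global batch per optimizer step:
# 1 (per device) x 16 (grad accum) x 8 (GPUs per node) x 4 (nodes)
# = 512 sequences of block_size 4096 tokens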
torchrun --nnodes $HOST_NUM --node_rank ${INDEX:=0} --nproc_per_node 8 \
--master_addr $MASTER_ADDR --master_port $MASTER_PORT \
${train_path} \
--deepspeed $deepspeedpath \
--anchor_symbols "." \
--model_name_or_path ${model_path} \
--dataset_name $datanamepath \
--preprocessing_num_workers 32 \
--dataloader_num_workers 8 \
--dataloader_pin_memory True \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 16 \
--num_train_epochs 1 \
--save_strategy "steps" \
--save_steps 500 \
--save_total_limit 1 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_steps 20 \
--lr_scheduler_type "constant_with_warmup" \
--logging_steps 1 \
--block_size 4096 \
--do_train \
--evaluation_strategy "no" \
--validation_split_percentage 1 \
--bf16 True \
--bf16_full_eval True \
--ddp_timeout 72000 \
--seed 34 \
--overwrite_output_dir \
--gradient_checkpointing True \
--output_dir ${model_save} \
2>&1 | tee -a ${model_save}/log.txt
# For large datasets, enable streaming and set max_steps explicitly
# (a streamed dataset has no known length, so the Trainer cannot derive
# the step count from num_train_epochs)
# --streaming \
# --max_steps 2500 \
# done