final push before closing 12gb training
Ubuntu committed Apr 1, 2022
1 parent c9f14a1 commit 6af1377
Showing 4 changed files with 98 additions and 110 deletions.
.gitignore: 4 changes (3 additions & 1 deletion)
@@ -9,6 +9,8 @@ __pycache__/
#Data
Data/
Preprocessing/size_count.txt
+checkpoints/
+*.zip

# Distribution / packaging
.Python
@@ -162,4 +164,4 @@ dataset/dfq/pdfs
dataset/fas_doddir_usaf/pdfs
dataset/fas_eprint/pdfs
dataset/military_newbie_manuals/pdfs
-**/.DS_Store
\ No newline at end of file
+**/.DS_Store
MLM_deepspeed.ipynb: 55 changes (50 additions & 5 deletions)
@@ -11,7 +11,6 @@
"import os, time, json, datetime, pytz\n",
"from tqdm.auto import tqdm\n",
"from typing import List, Union, Dict\n",
"from re_sent_splitter import split_into_sentences\n",
"from pathlib import Path\n",
"import pathlib\n",
"from multiprocessing import Pool\n",
@@ -28,7 +27,6 @@
"from transformers.deepspeed import HfDeepSpeedConfig\n",
"\n",
"#tokenizers and datasets\n",
"from datasets import load_dataset\n",
"from tokenizers import BertWordPieceTokenizer \n",
"from tokenizers.processors import TemplateProcessing\n",
"import tokenizers\n",
@@ -41,16 +39,43 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 19,
"id": "5d1901ab-dc83-4f62-8060-913b1a7c7371",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'2022-03-31_05:16:14'"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"str(datetime.datetime.now()).split('.')[0].replace(' ','_')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "3b42db8b-50b0-4f36-88eb-a52b4d3d8880",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"_CudaDeviceProperties(name='Tesla V100-PCIE-16GB', major=7, minor=0, total_memory=16160MB, multi_processor_count=80)\n",
"_CudaDeviceProperties(name='Tesla V100-PCIE-16GB', major=7, minor=0, total_memory=16160MB, multi_processor_count=80)\n"
"_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)\n",
"_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)\n",
"_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)\n",
"_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)\n",
"_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)\n",
"_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)\n",
"_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)\n",
"_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)\n"
]
}
],
@@ -59,6 +84,26 @@
" print(torch.cuda.get_device_properties(d))"
]
},
+{
+"cell_type": "code",
+"execution_count": 13,
+"id": "0eeb50ee-2dec-44af-a879-f956817fbb01",
+"metadata": {},
+"outputs": [],
+"source": [
+"inputs = torch.load('Data/encodings/encodings_395554_combined4Gb_2.txt.pt')"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 18,
+"id": "fd010bb2-c611-4bcd-8fce-ea2eeee5c8f5",
+"metadata": {},
+"outputs": [],
+"source": [
+"lengths = list(map(len, inputs['labels']))"
+]
+},
{
"cell_type": "markdown",
"id": "8c712fb9-5a4f-4dc8-907e-3a4abd1f2b57",
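A note on the new timestamp cell: str(datetime.datetime.now()).split('.')[0].replace(' ','_') truncates the microseconds and swaps the space for an underscore, giving a sortable run ID such as 2022-03-31_05:16:14 (the same expression MLM_Bert.py uses below to name checkpoint directories). A minimal sketch of the idea; the strftime variant is an equivalent alternative, not what the notebook uses:

import datetime

# Notebook's approach: '2022-03-31 05:16:14.123456' -> '2022-03-31_05:16:14'
stamp = str(datetime.datetime.now()).split('.')[0].replace(' ', '_')

# Equivalent result via explicit formatting
stamp_alt = datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S')

print(stamp)  # e.g. 2022-03-31_05:16:14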
Modeling/MLM_Bert.py: 37 changes (25 additions & 12 deletions)
@@ -26,8 +26,8 @@

logger = loguru.logger

-for gpu in nvgpu.gpu_info():
-    logger.info(gpu)
+# for gpu in nvgpu.gpu_info():
+#     logger.info(gpu)

local_rank = 0
device = (
@@ -36,10 +36,10 @@
else torch.device("cpu")
)

-# encodings_path = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Data/encodings/'
-# files = [f for f in os.listdir(encodings_path) if f.endswith('pt')]
-encodings_path = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Data/encodings/test/'
-files = [f for f in os.listdir(encodings_path) if not f.endswith('test')]
+encodings_path = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Data/encodings/'
+files = [f for f in os.listdir(encodings_path) if f.endswith('pt')]
+# encodings_path = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Data/encodings/test/'
+# files = [f for f in os.listdir(encodings_path) if f.endswith('pt')][:2]
logger.info(files)

############################################################
Expand All @@ -60,7 +60,7 @@ def __getitem__(self, i):
def assemble(file_path: str):
    encodings = torch.load(file_path)
    dataset = Dataset(encodings)
-    loader = torch.utils.data.DataLoader(d, batch_size=BATCH_SIZE, num_workers=6, pin_memory=True, shuffle=True)
+    loader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=6, pin_memory=True, shuffle=True)
    del encodings
    return loader

@@ -73,6 +73,7 @@ def assemble(file_path: str):
model = DataParallel(model)
model.to(device)
logger.info('Model creation completed')
+logger.info(model.device_ids)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

@@ -81,7 +82,7 @@ def assemble(file_path: str):
############################################################
model.train()

-epochs = 2
+epochs = 50
overall_steps = 0
mean_losses = []

@@ -91,12 +92,16 @@

        data_loader = assemble(os.path.join(encodings_path, file))
        num_batches = len(data_loader)
+        logger.info(f'Total batches for this load: {num_batches}')
+
        loss_check = floor(num_batches/10)
        steps = 0
        for batch in tqdm(data_loader, f'Epoch: {epoch}'):
            steps += 1
+
            # initialize calculated gradients (from prev step)
            optimizer.zero_grad()
+
            # pull all tensor batches required for training
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
@@ -113,16 +118,24 @@
            # update parameters
            optimizer.step()

-            if step % loss_check == 0:
+            if steps % loss_check == 0:
                logger.info(f'Loss: {loss.sum()}')
-                mean_losses.append(loss.sum())
+                steps_loss = loss.sum().detach().cpu()
+                mean_losses.append(steps_loss.numpy())
                with open('./checkpoints/run_12GB_losses.txt', 'a') as f:
-                    f.write(loss.sum())
+                    f.write(f'{steps_loss}')
                    f.write('\n')

        overall_steps += steps
+        logger.info('Deleting dataloader from memory')
        del data_loader

    logger.info(f'Average Loss for Epoch: {np.round(np.mean(mean_losses), 3)}')

+    with open('./checkpoints/run_12GB_epoch_losses.txt', 'a') as f:
+        f.write(f'{np.round(np.mean(mean_losses), 3)}')
+        f.write('\n')
+
    mean_losses = []
    timestamp = str(datetime.datetime.now()).split('.')[0].replace(' ','_')
-    model.module.save_pretrained(f'checkpoints/run_12GB_{timestamp}/model-trained-{epoch}-{overall_steps}.pt')
+    model.module.save_pretrained(f'checkpoints/run_12GB_{timestamp}/model-trained-{epoch}-{overall_steps}')
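Two of the fixes above are easy to miss. First, the old logging branch was doubly broken: step was never defined (the counter is steps), and file.write() accepts only strings, so f.write(loss.sum()) would raise a TypeError; appending live CUDA tensors to mean_losses also keeps the autograd graph and GPU memory alive, which the new detach().cpu() step avoids. Second, the '.pt' suffix was dropped from the checkpoint path because Hugging Face's save_pretrained() writes a directory of files (config plus weights), not a single tensor file. A self-contained sketch of the corrected logging pattern, with a random tensor standing in for the per-replica losses that DataParallel returns:

import numpy as np
import torch

mean_losses = []

# Stand-in for a DataParallel forward pass: one MLM loss per GPU replica.
loss = torch.rand(2, requires_grad=True)

# Detach from the graph and move to CPU before storing, so logged losses
# do not pin GPU memory or the computation graph.
steps_loss = loss.sum().detach().cpu()
mean_losses.append(steps_loss.numpy())

# write() needs a str, so format the tensor instead of passing it directly.
with open('losses.txt', 'a') as f:
    f.write(f'{steps_loss}')
    f.write('\n')

print(np.round(np.mean(mean_losses), 3))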
Modeling/loss_plotting.ipynb: 112 changes (20 additions & 92 deletions)

Large diffs are not rendered by default.
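The loss_plotting.ipynb diff is collapsed, but given the loss file the training script writes (one value per line in checkpoints/run_12GB_losses.txt), a plotting cell can stay small. The sketch below is an assumption about that notebook's job, not its actual code; the path and the tensor(...)-stripping are guesses about the file contents:

import matplotlib.pyplot as plt

losses = []
with open('checkpoints/run_12GB_losses.txt') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        # Accept either a bare float or the repr of a 0-dim tensor,
        # e.g. 'tensor(2.3456)', depending on how the value was formatted.
        losses.append(float(line.replace('tensor(', '').rstrip(')')))

plt.plot(losses)
plt.xlabel('logging step (every num_batches // 10 batches)')
plt.ylabel('summed MLM loss across GPUs')
plt.title('run_12GB training loss')
plt.show()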
