final push before closing 12gb training
Ubuntu committed Apr 1, 2022
1 parent c9f14a1 commit 6af1377
Showing 4 changed files with 98 additions and 110 deletions.
.gitignore: 4 changes (3 additions & 1 deletion)
@@ -9,6 +9,8 @@ __pycache__/
#Data
Data/
Preprocessing/size_count.txt
+checkpoints/
+*.zip

# Distribution / packaging
.Python
@@ -162,4 +164,4 @@ dataset/dfq/pdfs
dataset/fas_doddir_usaf/pdfs
dataset/fas_eprint/pdfs
dataset/military_newbie_manuals/pdfs
-**/.DS_Store
\ No newline at end of file
+**/.DS_Store
MLM_deepspeed.ipynb: 55 changes (50 additions & 5 deletions)
@@ -11,7 +11,6 @@
"import os, time, json, datetime, pytz\n",
"from tqdm.auto import tqdm\n",
"from typing import List, Union, Dict\n",
"from re_sent_splitter import split_into_sentences\n",
"from pathlib import Path\n",
"import pathlib\n",
"from multiprocessing import Pool\n",
@@ -28,7 +27,6 @@
"from transformers.deepspeed import HfDeepSpeedConfig\n",
"\n",
"#tokenizers and datasets\n",
"from datasets import load_dataset\n",
"from tokenizers import BertWordPieceTokenizer \n",
"from tokenizers.processors import TemplateProcessing\n",
"import tokenizers\n",
@@ -41,16 +39,43 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 19,
"id": "5d1901ab-dc83-4f62-8060-913b1a7c7371",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'2022-03-31_05:16:14'"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"str(datetime.datetime.now()).split('.')[0].replace(' ','_')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "3b42db8b-50b0-4f36-88eb-a52b4d3d8880",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"_CudaDeviceProperties(name='Tesla V100-PCIE-16GB', major=7, minor=0, total_memory=16160MB, multi_processor_count=80)\n",
"_CudaDeviceProperties(name='Tesla V100-PCIE-16GB', major=7, minor=0, total_memory=16160MB, multi_processor_count=80)\n"
"_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)\n",
"_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)\n",
"_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)\n",
"_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)\n",
"_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)\n",
"_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)\n",
"_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)\n",
"_CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32510MB, multi_processor_count=80)\n"
]
}
],
@@ -59,6 +84,26 @@
" print(torch.cuda.get_device_properties(d))"
]
},
+{
+"cell_type": "code",
+"execution_count": 13,
+"id": "0eeb50ee-2dec-44af-a879-f956817fbb01",
+"metadata": {},
+"outputs": [],
+"source": [
+"inputs = torch.load('Data/encodings/encodings_395554_combined4Gb_2.txt.pt')"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 18,
+"id": "fd010bb2-c611-4bcd-8fce-ea2eeee5c8f5",
+"metadata": {},
+"outputs": [],
+"source": [
+"lengths = list(map(len, inputs['labels']))"
+]
+},
{
"cell_type": "markdown",
"id": "8c712fb9-5a4f-4dc8-907e-3a4abd1f2b57",
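A note on the new timestamp cell: str(datetime.datetime.now()).split('.')[0].replace(' ','_') truncates the microseconds and swaps the space for an underscore, giving a sortable run ID such as 2022-03-31_05:16:14 (the same expression MLM_Bert.py uses below to name checkpoint directories). A minimal sketch of the idea; the strftime variant is an equivalent alternative, not what the notebook uses:

import datetime

# Notebook's approach: '2022-03-31 05:16:14.123456' -> '2022-03-31_05:16:14'
stamp = str(datetime.datetime.now()).split('.')[0].replace(' ', '_')

# Equivalent result via explicit formatting
stamp_alt = datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S')

print(stamp)  # e.g. 2022-03-31_05:16:14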
Modeling/MLM_Bert.py: 37 changes (25 additions & 12 deletions)
@@ -26,8 +26,8 @@

logger = loguru.logger

-for gpu in nvgpu.gpu_info():
-    logger.info(gpu)
+# for gpu in nvgpu.gpu_info():
+#     logger.info(gpu)

local_rank = 0
device = (
@@ -36,10 +36,10 @@
else torch.device("cpu")
)

-# encodings_path = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Data/encodings/'
-# files = [f for f in os.listdir(encodings_path) if f.endswith('pt')]
-encodings_path = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Data/encodings/test/'
-files = [f for f in os.listdir(encodings_path) if not f.endswith('test')]
+encodings_path = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Data/encodings/'
+files = [f for f in os.listdir(encodings_path) if f.endswith('pt')]
+# encodings_path = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Data/encodings/test/'
+# files = [f for f in os.listdir(encodings_path) if f.endswith('pt')][:2]
logger.info(files)

############################################################
Expand All @@ -60,7 +60,7 @@ def __getitem__(self, i):
def assemble(file_path: str):
    encodings = torch.load(file_path)
    dataset = Dataset(encodings)
-    loader = torch.utils.data.DataLoader(d, batch_size=BATCH_SIZE, num_workers=6, pin_memory=True, shuffle=True)
+    loader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=6, pin_memory=True, shuffle=True)
    del encodings
    return loader

@@ -73,6 +73,7 @@ def assemble(file_path: str):
model = DataParallel(model)
model.to(device)
logger.info('Model creation completed')
+logger.info(model.device_ids)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

@@ -81,7 +82,7 @@ def assemble(file_path: str):
############################################################
model.train()

-epochs = 2
+epochs = 50
overall_steps = 0
mean_losses = []

@@ -91,12 +92,16 @@

        data_loader = assemble(os.path.join(encodings_path, file))
        num_batches = len(data_loader)
+        logger.info(f'Total batches for this load: {num_batches}')
+
        loss_check = floor(num_batches/10)
        steps = 0
        for batch in tqdm(data_loader, f'Epoch: {epoch}'):
            steps += 1
+
            # initialize calculated gradients (from prev step)
            optimizer.zero_grad()
+
            # pull all tensor batches required for training
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
@@ -113,16 +118,24 @@
            # update parameters
            optimizer.step()

-            if step % loss_check == 0:
+            if steps % loss_check == 0:
                logger.info(f'Loss: {loss.sum()}')
-                mean_losses.append(loss.sum())
+                steps_loss = loss.sum().detach().cpu()
+                mean_losses.append(steps_loss.numpy())
                with open('./checkpoints/run_12GB_losses.txt', 'a') as f:
-                    f.write(loss.sum())
+                    f.write(f'{steps_loss}')
                    f.write('\n')

        overall_steps += steps
+        logger.info('Deleting dataloader from memory')
        del data_loader

    logger.info(f'Average Loss for Epoch: {np.round(np.mean(mean_losses), 3)}')

+    with open('./checkpoints/run_12GB_epoch_losses.txt', 'a') as f:
+        f.write(f'{np.round(np.mean(mean_losses), 3)}')
+        f.write('\n')
+
    mean_losses = []
    timestamp = str(datetime.datetime.now()).split('.')[0].replace(' ','_')
-    model.module.save_pretrained(f'checkpoints/run_12GB_{timestamp}/model-trained-{epoch}-{overall_steps}.pt')
+    model.module.save_pretrained(f'checkpoints/run_12GB_{timestamp}/model-trained-{epoch}-{overall_steps}')
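Two of the fixes above are easy to miss. First, the old logging branch was doubly broken: step was never defined (the counter is steps), and file.write() accepts only strings, so f.write(loss.sum()) would raise a TypeError; appending live CUDA tensors to mean_losses also keeps the autograd graph and GPU memory alive, which the new detach().cpu() step avoids. Second, the '.pt' suffix was dropped from the checkpoint path because Hugging Face's save_pretrained() writes a directory of files (config plus weights), not a single tensor file. A self-contained sketch of the corrected logging pattern, with a random tensor standing in for the per-replica losses that DataParallel returns:

import numpy as np
import torch

mean_losses = []

# Stand-in for a DataParallel forward pass: one MLM loss per GPU replica.
loss = torch.rand(2, requires_grad=True)

# Detach from the graph and move to CPU before storing, so logged losses
# do not pin GPU memory or the computation graph.
steps_loss = loss.sum().detach().cpu()
mean_losses.append(steps_loss.numpy())

# write() needs a str, so format the tensor instead of passing it directly.
with open('losses.txt', 'a') as f:
    f.write(f'{steps_loss}')
    f.write('\n')

print(np.round(np.mean(mean_losses), 3))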
Modeling/loss_plotting.ipynb: 112 changes (20 additions & 92 deletions)

Large diffs are not rendered by default.
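The loss_plotting.ipynb diff is collapsed, but given the loss file the training script writes (one value per line in checkpoints/run_12GB_losses.txt), a plotting cell can stay small. The sketch below is an assumption about that notebook's job, not its actual code; the path and the tensor(...)-stripping are guesses about the file contents:

import matplotlib.pyplot as plt

losses = []
with open('checkpoints/run_12GB_losses.txt') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        # Accept either a bare float or the repr of a 0-dim tensor,
        # e.g. 'tensor(2.3456)', depending on how the value was formatted.
        losses.append(float(line.replace('tensor(', '').rstrip(')')))

plt.plot(losses)
plt.xlabel('logging step (every num_batches // 10 batches)')
plt.ylabel('summed MLM loss across GPUs')
plt.title('run_12GB training loss')
plt.show()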
