lib logger - WIP
josemarcosrf committed Dec 14, 2020
1 parent 813a066 commit ce93c13
Showing 26 changed files with 211 additions and 98 deletions.
1 change: 1 addition & 0 deletions requirements.txt
@@ -5,3 +5,4 @@ numpy
scikit-learn
scipy
nltk
coloredlogs
42 changes: 41 additions & 1 deletion sentence_transformers/LoggingHandler.py
@@ -13,4 +13,44 @@ def emit(self, record):
except (KeyboardInterrupt, SystemExit):
raise
except:
            self.handleError(record)


def install_logger(
    given_logger, level=logging.WARNING, fmt="%(levelname)s:%(name)s:%(message)s"
):
    """Configures the given logger: format, logging level, style, etc."""
    import coloredlogs

    def add_notice_log_level():
        """Creates a new 'notice' logging level."""
        # inspired by:
        # https://stackoverflow.com/questions/2183233/how-to-add-a-custom-loglevel-to-pythons-logging-facility
        NOTICE_LEVEL_NUM = 25
        logging.addLevelName(NOTICE_LEVEL_NUM, "NOTICE")

        def notice(self, message, *args, **kws):
            if self.isEnabledFor(NOTICE_LEVEL_NUM):
                self._log(NOTICE_LEVEL_NUM, message, args, **kws)

        logging.Logger.notice = notice

    # Add an extra logging level above INFO and below WARNING
    add_notice_log_level()

    # More style info at:
    # https://coloredlogs.readthedocs.io/en/latest/api.html
    field_styles = coloredlogs.DEFAULT_FIELD_STYLES.copy()
    field_styles["asctime"] = {}
    level_styles = coloredlogs.DEFAULT_LEVEL_STYLES.copy()
    level_styles["debug"] = {"color": "white", "faint": True}
    level_styles["notice"] = {"color": "cyan", "bold": True}

    coloredlogs.install(
        logger=given_logger,
        level=level,
        use_chroot=False,
        fmt=fmt,
        level_styles=level_styles,
        field_styles=field_styles,
    )
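
A minimal usage sketch (not part of this commit) of the new install_logger helper and the NOTICE level it registers; the logger name "my_app" is hypothetical:

    import logging

    from sentence_transformers.LoggingHandler import install_logger

    log = logging.getLogger("my_app")
    install_logger(log, level=logging.DEBUG)

    log.debug("faint white, per the level_styles set above")
    log.notice("bold cyan, via the patched-in NOTICE level (25)")
    log.warning("default coloredlogs styling")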
30 changes: 16 additions & 14 deletions sentence_transformers/SentenceTransformer.py
@@ -25,6 +25,8 @@
from .models import Transformer, Pooling
from . import __version__

logger = logging.getLogger(__name__)

class SentenceTransformer(nn.Sequential):
"""
    Loads or creates a SentenceTransformer model that can be used to map sentences / text to embeddings.
@@ -35,17 +37,17 @@ class SentenceTransformer(nn.Sequential):
"""
def __init__(self, model_name_or_path: str = None, modules: Iterable[nn.Module] = None, device: str = None):
if model_name_or_path is not None and model_name_or_path != "":
logging.info("Load pretrained SentenceTransformer: {}".format(model_name_or_path))
logger.info("Load pretrained SentenceTransformer: {}".format(model_name_or_path))
model_path = model_name_or_path

if not os.path.isdir(model_path) and not model_path.startswith('http://') and not model_path.startswith('https://'):
logging.info("Did not find folder {}".format(model_path))
logger.info("Did not find folder {}".format(model_path))

if '\\' in model_path or model_path.count('/') > 1:
raise AttributeError("Path {} not found".format(model_path))

model_path = __DOWNLOAD_SERVER__ + model_path + '.zip'
logging.info("Try to download model from server: {}".format(model_path))
logger.info("Try to download model from server: {}".format(model_path))

if model_path.startswith('http://') or model_path.startswith('https://'):
model_url = model_path
@@ -64,7 +66,7 @@ def __init__(self, model_name_or_path: str = None, modules: Iterable[nn.Module]
if not os.path.exists(model_path) or not os.listdir(model_path):
if model_url[-1] == "/":
model_url = model_url[:-1]
logging.info("Downloading sentence transformer model from {} and saving it at {}".format(model_url, model_path))
logger.info("Downloading sentence transformer model from {} and saving it at {}".format(model_url, model_path))

model_path_tmp = model_path.rstrip("/").rstrip("\\")+"_part"
try:
@@ -77,8 +79,8 @@ def __init__(self, model_name_or_path: str = None, modules: Iterable[nn.Module]
except requests.exceptions.HTTPError as e:
shutil.rmtree(model_path_tmp)
if e.response.status_code == 404:
logging.warning('SentenceTransformer-Model {} not found. Try to create it from scratch'.format(model_url))
logging.warning('Try to create Transformer Model {} with mean pooling'.format(model_name_or_path))
logger.warning('SentenceTransformer-Model {} not found. Try to create it from scratch'.format(model_url))
logger.warning('Try to create Transformer Model {} with mean pooling'.format(model_name_or_path))

model_path = None
transformer_model = Transformer(model_name_or_path)
@@ -94,13 +96,13 @@ def __init__(self, model_name_or_path: str = None, modules: Iterable[nn.Module]

#### Load from disk
if model_path is not None:
logging.info("Load SentenceTransformer from folder: {}".format(model_path))
logger.info("Load SentenceTransformer from folder: {}".format(model_path))

if os.path.exists(os.path.join(model_path, 'config.json')):
with open(os.path.join(model_path, 'config.json')) as fIn:
config = json.load(fIn)
if config['__version__'] > __version__:
logging.warning("You try to use a model that was created with version {}, however, your version is {}. This might cause unexpected behavior or errors. In that case, try to update to the latest version.\n\n\n".format(config['__version__'], __version__))
logger.warning("You try to use a model that was created with version {}, however, your version is {}. This might cause unexpected behavior or errors. In that case, try to update to the latest version.\n\n\n".format(config['__version__'], __version__))

with open(os.path.join(model_path, 'modules.json')) as fIn:
contained_modules = json.load(fIn)
@@ -118,7 +120,7 @@ def __init__(self, model_name_or_path: str = None, modules: Iterable[nn.Module]
super().__init__(modules)
if device is None:
device = "cuda" if torch.cuda.is_available() else "cpu"
logging.info("Use pytorch device: {}".format(device))
logger.info("Use pytorch device: {}".format(device))

self._target_device = torch.device(device)

@@ -222,10 +224,10 @@ def start_multi_process_pool(self, target_devices: List[str] = None, encode_batc
if torch.cuda.is_available():
target_devices = ['cuda:{}'.format(i) for i in range(torch.cuda.device_count())]
else:
logging.info("CUDA is not available. Start 4 CPU worker")
logger.info("CUDA is not available. Start 4 CPU worker")
target_devices = ['cpu']*4

logging.info("Start multi-process pool on devices: {}".format(', '.join(map(str, target_devices))))
logger.info("Start multi-process pool on devices: {}".format(', '.join(map(str, target_devices))))

ctx = mp.get_context('spawn')
input_queue = ctx.Queue()
@@ -272,7 +274,7 @@ def encode_multi_process(self, sentences: List[str], pool: Dict[str, object], is
if chunk_size is None:
chunk_size = min(math.ceil(len(sentences) / len(pool["processes"]) / 10), 5000)

logging.info("Chunk data into packages of size {}".format(chunk_size))
logger.info("Chunk data into packages of size {}".format(chunk_size))

input_queue = pool['input']
last_chunk_id = 0
@@ -350,7 +352,7 @@ def save(self, path):

os.makedirs(path, exist_ok=True)

logging.info("Save model to {}".format(path))
logger.info("Save model to {}".format(path))
contained_modules = []

for idx, name in enumerate(self._modules):
@@ -560,7 +562,7 @@ def fit(self,
try:
data = next(data_iterator)
except StopIteration:
#logging.info("Restart data_iterator")
#logger.info("Restart data_iterator")
data_iterator = iter(dataloaders[train_idx])
data_iterators[train_idx] = data_iterator
data = next(data_iterator)
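
The pattern in this file — a module-level logger = logging.getLogger(__name__) replacing direct logging.* calls — makes each module's logger a child of the sentence_transformers package logger, so configuring the package logger once (see __init__.py below) affects all of them. A small sketch of that inheritance, relying only on stdlib behavior:

    import logging

    pkg = logging.getLogger("sentence_transformers")
    child = logging.getLogger("sentence_transformers.SentenceTransformer")

    pkg.setLevel(logging.INFO)
    # the child defines no level of its own, so it defers to its parent
    assert child.getEffectiveLevel() == logging.INFO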
8 changes: 7 additions & 1 deletion sentence_transformers/__init__.py
@@ -2,15 +2,21 @@

__DOWNLOAD_SERVER__ = 'https://sbert.net/models/'

import os
import logging

from .datasets import SentencesDataset, SentenceLabelDataset, ParallelSentencesDataset
from .LoggingHandler import LoggingHandler
from .LoggingHandler import install_logger
from .SentenceTransformer import SentenceTransformer
from .readers import InputExample
from .cross_encoder.CrossEncoder import CrossEncoder


logger = logging.getLogger(__name__)


# configure the library logger from which all other loggers will inherit
install_logger(logger, level=os.environ.get("ST_LOG_LEVEL", logging.WARNING))
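
Because install_logger runs at import time, verbosity can be chosen through the environment before the first import. Passing a level name string such as "DEBUG" is assumed to work because coloredlogs resolves string level names; this diff does not test that. A sketch:

    import os

    # must be set before sentence_transformers is first imported
    os.environ["ST_LOG_LEVEL"] = "DEBUG"

    import sentence_transformers  # install_logger picks the level up here

After import, the package logger can also be adjusted directly, e.g. logging.getLogger("sentence_transformers").setLevel(logging.ERROR).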



7 changes: 5 additions & 2 deletions sentence_transformers/cross_encoder/CrossEncoder.py
@@ -14,6 +14,9 @@
from ..evaluation import SentenceEvaluator


logger = logging.getLogger(__name__)


class CrossEncoder():
def __init__(self, model_name:str, num_labels:int = None, max_length:int = None, device:str = None, use_fast_tokenizer:bool = None):
"""
@@ -52,7 +55,7 @@ def __init__(self, model_name:str, num_labels:int = None, max_length:int = None,

if device is None:
device = "cuda" if torch.cuda.is_available() else "cpu"
logging.info("Use pytorch device: {}".format(device))
logger.info("Use pytorch device: {}".format(device))

self._target_device = torch.device(device)

@@ -298,7 +301,7 @@ def save(self, path):
if path is None:
return

logging.info("Save model to {}".format(path))
logger.info("Save model to {}".format(path))
self.model.save_pretrained(path)
self.tokenizer.save_pretrained(path)

sentence_transformers/cross_encoder/evaluation/CEBinaryClassificationEvaluator.py
@@ -8,6 +8,9 @@
from ... import InputExample
from ...evaluation import BinaryClassificationEvaluator


logger = logging.getLogger(__name__)

class CEBinaryClassificationEvaluator:
"""
This evaluator can be used with the CrossEncoder class. Given sentence pairs and binary labels (0 and 1),
@@ -44,18 +47,18 @@ def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int =
else:
out_txt = ":"

logging.info("CEBinaryClassificationEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt)
logger.info("CEBinaryClassificationEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt)
pred_scores = model.predict(self.sentence_pairs, convert_to_numpy=True, show_progress_bar=False)

acc, acc_threshold = BinaryClassificationEvaluator.find_best_acc_and_threshold(pred_scores, self.labels, True)
f1, precision, recall, f1_threshold = BinaryClassificationEvaluator.find_best_f1_and_threshold(pred_scores, self.labels, True)
ap = average_precision_score(self.labels, pred_scores)

logging.info("Accuracy: {:.2f}\t(Threshold: {:.4f})".format(acc * 100, acc_threshold))
logging.info("F1: {:.2f}\t(Threshold: {:.4f})".format(f1 * 100, f1_threshold))
logging.info("Precision: {:.2f}".format(precision * 100))
logging.info("Recall: {:.2f}".format(recall * 100))
logging.info("Average Precision: {:.2f}\n".format(ap * 100))
logger.info("Accuracy: {:.2f}\t(Threshold: {:.4f})".format(acc * 100, acc_threshold))
logger.info("F1: {:.2f}\t(Threshold: {:.4f})".format(f1 * 100, f1_threshold))
logger.info("Precision: {:.2f}".format(precision * 100))
logger.info("Recall: {:.2f}".format(recall * 100))
logger.info("Average Precision: {:.2f}\n".format(ap * 100))

if output_path is not None:
csv_path = os.path.join(output_path, self.csv_file)
@@ -68,4 +71,4 @@ def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int =
writer.writerow([epoch, steps, acc, acc_threshold, f1, f1_threshold, precision, recall, ap])


        return ap
sentence_transformers/cross_encoder/evaluation/CECorrelationEvaluator.py
@@ -5,6 +5,9 @@
import csv
from ... import InputExample


logger = logging.getLogger(__name__)

class CECorrelationEvaluator:
"""
This evaluator can be used with the CrossEncoder class. Given sentence pairs and continuous scores,
@@ -38,14 +41,14 @@ def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int =
else:
out_txt = ":"

logging.info("CECorrelationEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt)
logger.info("CECorrelationEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt)
pred_scores = model.predict(self.sentence_pairs, convert_to_numpy=True, show_progress_bar=False)


eval_pearson, _ = pearsonr(self.scores, pred_scores)
eval_spearman, _ = spearmanr(self.scores, pred_scores)

logging.info("Correlation:\tPearson: {:.4f}\tSpearman: {:.4f}".format(eval_pearson, eval_spearman))
logger.info("Correlation:\tPearson: {:.4f}\tSpearman: {:.4f}".format(eval_pearson, eval_spearman))

if output_path is not None:
csv_path = os.path.join(output_path, self.csv_file)
@@ -57,4 +60,4 @@ def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int =

writer.writerow([epoch, steps, eval_pearson, eval_spearman])

        return eval_spearman
sentence_transformers/cross_encoder/evaluation/CESoftmaxAccuracyEvaluator.py
@@ -5,6 +5,9 @@
from ... import InputExample
import numpy as np


logger = logging.getLogger(__name__)

class CESoftmaxAccuracyEvaluator:
"""
This evaluator can be used with the CrossEncoder class.
@@ -39,15 +42,15 @@ def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int =
else:
out_txt = ":"

logging.info("CESoftmaxAccuracyEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt)
logger.info("CESoftmaxAccuracyEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt)
pred_scores = model.predict(self.sentence_pairs, convert_to_numpy=True, show_progress_bar=False)
pred_labels = np.argmax(pred_scores, axis=1)

assert len(pred_labels) == len(self.labels)

acc = np.sum(pred_labels == self.labels) / len(self.labels)

logging.info("Accuracy: {:.2f}".format(acc*100))
logger.info("Accuracy: {:.2f}".format(acc*100))

if output_path is not None:
csv_path = os.path.join(output_path, self.csv_file)
@@ -59,4 +62,4 @@ def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int =

writer.writerow([epoch, steps, acc])

        return acc
5 changes: 4 additions & 1 deletion sentence_transformers/datasets/ParallelSentencesDataset.py
@@ -6,6 +6,9 @@
from typing import List
import random


logger = logging.getLogger(__name__)

class ParallelSentencesDataset(Dataset):
"""
    This dataset reader can be used to read in parallel sentences, i.e., it reads a file with tab-separated sentences with the same
@@ -54,7 +57,7 @@ def load_data(self, filepath: str, weight: int = 100, max_sentences: int = None,
:return:
"""

logging.info("Load "+filepath)
logger.info("Load "+filepath)
parallel_sentences = []

with gzip.open(filepath, 'rt', encoding='utf8') if filepath.endswith('.gz') else open(filepath, encoding='utf8') as fIn:
(diff for the remaining changed files not shown)
