diff --git a/README.md b/README.md index 88672c4..6769d33 100644 --- a/README.md +++ b/README.md @@ -133,7 +133,7 @@ Ground truth flows are labeled using the netflow labeler. so each flow has a lab # Limitations -* the labels in ground truth zeek dir have to be 'Malicious' or 'Benign' only. if any other label is present this tool will consider it "Benign" +* the labels in ground truth zeek dir have to be 'Malicious' or 'Benign' only. if any other label is present this tool will completely discard the flow. * ground truth dirs can either be json or tab separated zeek dir or conn.log file * all paths given as parameters to this tool must be absolute paths. diff --git a/parsers/ground_truth.py b/parsers/ground_truth.py index e04de5c..40effe3 100644 --- a/parsers/ground_truth.py +++ b/parsers/ground_truth.py @@ -4,9 +4,9 @@ import utils.timestamp_handler from typing import ( Tuple, - Dict, List, Optional, + Union, ) from re import findall from parsers.config import ConfigurationParser @@ -161,15 +161,13 @@ def extract_label_from_line(self, line:str) -> str: :return: malicious, benign or unknown """ pattern = r"Malicious[\s\t]+" - matches = findall(pattern, line) - if matches: + if findall(pattern, line): return 'malicious' pattern = r"Benign[\s\t]+" - matches = findall(pattern, line) - if matches: + if findall(pattern, line): return 'benign' - + return 'unknown' def update_labels_ctr(self, label: str): @@ -199,7 +197,7 @@ def handle_zeek_json(self, line:str) -> Tuple[str,str,str,str]: if not aid: return False - label = line.get('label', '') + label = line.get('label', '') self.update_labels_ctr(label) return label, aid, line['ts'], line['id.orig_h'] @@ -225,37 +223,50 @@ def handle_zeek_tabs(self, line:str) -> Optional[Tuple[str,str,str,str]]: # spaces so we can't use python's split() # using regex split, split line when you encounter more than 2 spaces # in a row - line = line.split('\t') if '\t' in line else split(r'\s{2,}', line) + line: List[str] = line.split('\t') 
if ( + '\t' in line) \ + else split(r'\s{2,'r'}', line) aid = self.handle_getting_aid(line) if not aid: return return label, aid, line[0], line[2] - - def extract_fields(self, line: str) -> Optional[dict]: + + def extract_fields(self, line: str) -> Tuple[Union[bool,dict], str]: """ extracts the label and community id from the given line - uses zeek_file_type to extract fields based on the type of the given zeek dir + uses zeek_file_type to extract fields based on the type of the given + zeek dir + completely ignores gt flows that have labels other than benign or + malicious :param line: line as read from the zeek log file - :return: returns a flow dict with {'aid': ..., 'label':...} + :return: + If it managed to extract the flow, returns the + extracted flow dict and no errors + If not, returns False and the error """ if self.zeek_file_type == 'json': - #TODO this wasn't tested before ok? flow = self.handle_zeek_json(line) elif self.zeek_file_type == 'tab-separated': flow = self.handle_zeek_tabs(line) - + + if not flow: + return False, "Invalid flow" + try: + if flow[0] == "unknown": + return False, f"Unsupported flow label '{flow[0]}'" + return { 'label': flow[0], 'aid': flow[1], 'timestamp': flow[2], 'srcip': flow[3], - } - except (IndexError, TypeError): + }, "" + except (IndexError, TypeError) as e: # one of the above 2 methods failed to parse the given line - return + return False, f"Problem extracting flow: {line} .. 
{e}" def register_timewindow(self, ts) -> dict: @@ -364,7 +375,10 @@ def label_tw(self, flow: dict, tw_registration_stats: dict): def parse_file(self, filename: str): """ - extracts the label and community id from each flow and stores them in the db + extracts the label and community id from each flow and stores them + in the db + Completely ignores flows that don't have benign or malicious in + their labels, e.g. background flows :param filename: the name of the zeek logfile without the path, for example conn.log this can be the file given to this tool using -gtf or 1 file @@ -380,11 +394,10 @@ def parse_file(self, filename: str): if line.startswith('#'): continue - flow = self.extract_fields(line) + flow, err = self.extract_fields(line) if not flow: - self.log(f"Problem extracting flow " - f"from line number {line_number}: ", - line, + self.log(f"{err}. Skipping flow at line", + line_number, error=True) continue @@ -498,7 +511,8 @@ def parse(self): os._exit(0) except Exception as e: self.log("An error occurred: ", e, error=True) - self.log("",f"{traceback.format_exc()}", error=True) + self.log("",f"{traceback.format_exc()}", + error=True) os._exit(1)