"""
Parse SSH authentication logs (auth.log style) to determine whether there is
any intrusion attempt, and provide means to summarize the parsed entries.
"""
import pandas as pd
import numpy as np
# import transport
import datetime
import io
import json
import re
from datetime import datetime   # NOTE: intentionally rebinds `datetime` to the class

# Syslog timestamp, e.g. "Jan  5 10:07:33" or "Mar 12 23:45:09".
# Groups: month, day, hour, minute, second.
# - the day is space padded by syslog, hence `\s+` and one or two digits
# - every time component captures BOTH digits (the quantifier is inside the group)
_date = r"(^[A-Z][a-z]{2})\s+([0-9]{1,2}) ([0-9]{2}):([0-9]{2}):([0-9]{2})"
_ip = r"\d+\.\d+\.\d+\.\d+"
_regex = {
    # successful password logins
    'login': {
        'pattern': f'{_date} .*Accepted password for ([a-z]+) from ({_ip})',
        'columns': ['month', 'day', 'hour', 'minute', 'second', 'user', 'ip'],
    },
    # attempts against accounts that do not exist
    'attacks': {
        'pattern': f'{_date} .*Invalid user ([a-z,0-9]+) from ({_ip})',
        'columns': ['month', 'day', 'hour', 'minute', 'second', 'user', 'ip'],
    },
    # -- accounts at risk (failed passwords against existing accounts)
    'risk': {
        'pattern': f'{_date} .*Failed password for ([a-z,0-9]+) from ({_ip})',
        'columns': ['month', 'day', 'hour', 'minute', 'second', 'user', 'ip'],
    },
}
_map = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
        'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}


def risk(_content, _id='user'):
    """
    Summarize parsed log entries per value of `_id`.

    :_content   list of records as produced by `read` (each record needs a
                'date' entry and an entry named after `_id`)
    :_id        name of the field to group by ('user' or 'ip')
    :return     pandas.DataFrame with the columns start_date, end_date, count
                and `_id` (in that order), one row per distinct `_id` value,
                sorted by `_id`. The dates are rendered as strings.
                An empty `_content` yields an empty frame with these columns.
    """
    _columns = ['start_date', 'end_date', 'count', _id]
    _df = pd.DataFrame(_content)
    if _df.empty:
        return pd.DataFrame(columns=_columns)
    #
    # Named aggregation instead of groupby.apply: apply would need to read the
    # grouping column inside each group, which pandas has deprecated
    #
    _summary = (
        _df.groupby(_id)['date']
        .agg(start_date='min', end_date='max', count='size')
        .reset_index()
    )
    _summary = _summary[_columns]
    _summary['start_date'] = _summary['start_date'].astype(str)
    _summary['end_date'] = _summary['end_date'].astype(str)
    return _summary


def attacks(_content):
    """
    This function will compute counts associated with a given set of ip addresses.
    If behind a load balancer the IP can be ignored and the counts will reflect break-in attempts

    :_content   list of records as produced by `read`
    :return     see `risk`, grouped by 'ip'
    """
    return risk(_content, 'ip')


def login(_content):
    """
    Summarize entries per user account

    :_content   list of records as produced by `read`
    :return     see `risk`, grouped by 'user'
    """
    return risk(_content, 'user')


def read(**_args):
    """
    Parse an auth.log file into records grouped by the kind of event found

    :path   path of the auth.log file to load
    :year   year to assign to the entries (syslog timestamps carry no year),
            defaults to the current year
    :return dictionary keyed by the names in `_regex` ('login', 'attacks',
            'risk'); each value is a list of dictionaries with the integer
            fields month, day, hour, minute, second, the strings user and ip,
            and a `date` datetime. A key is only present if at least one line
            matched the associated pattern.
    """
    _year = _args['year'] if 'year' in _args else datetime.now().year
    _path = _args['path']
    # compile once, the patterns are applied to every line of the file
    _compiled = [(_id, re.compile(_spec['pattern']), _spec['columns'])
                 for _id, _spec in _regex.items()]
    r = {}
    #
    # errors='replace' because user names in failed attempts are supplied by
    # the remote party and may contain bytes that cannot be decoded
    #
    with open(_path, errors='replace') as f:
        for line in f:
            for _id, _pattern, _columns in _compiled:
                _out = _pattern.search(line)
                if not _out:
                    continue
                _records = r.setdefault(_id, [])
                try:
                    _object = dict(zip(_columns, _out.groups()))
                    _object['month'] = _map.get(_object['month'], _object['month'])
                    for field in ('day', 'month', 'hour', 'minute', 'second'):
                        _object[field] = int(_object[field])
                    _object['date'] = datetime(
                        year=_year, month=_object['month'], day=_object['day'],
                        hour=_object['hour'], minute=_object['minute'],
                        second=_object['second'])
                except (TypeError, ValueError) as e:
                    # best effort: an entry with an unusable date is reported and skipped
                    print(e)
                    continue
                _records.append(_object)
    #
    # At this point we have essential information formatted
    # Summarizing this information will serve as a means to compress it
    #
    return r