# smart-top/smart/logger/__init__.py

"""
This module parses authentication logs (e.g. auth.log) to help detect intrusion attempts and provides helpers to summarize the parsed entries
"""

import pandas as pd
import numpy as np
# import transport
import datetime
import io
import json
import re
from datetime import datetime

#
# Syslog-style timestamp found at the start of a line, e.g. "Jan 12 10:45:07".
# Five capture groups: month abbreviation, day, hour, minute, second.
#   - the day may be a single digit padded with an extra space ("Jan  5 ..."), hence " +" and {1,2}
#   - every quantifier sits INSIDE its group; "([0-9]){2}" would capture only the last digit
#
_date = r"(^[A-Z][a-z]{2}) +([0-9]{1,2}) ([0-9]{2}):([0-9]{2}):([0-9]{2})"
# Dotted-quad IPv4 address (octet ranges are not validated)
_ip = r"\d+\.\d+\.\d+\.\d+"
#
# One entry per kind of event extracted from the log :
#   pattern     regular expression applied to each line
#   columns     names given to the captured groups, in order
#
_regex = {
    'login':{'pattern':f'{_date} .*Accepted password for ([a-z]+) from ({_ip})', 'columns':['month','day','hour','minute','second','user','ip']},
    'attacks':{'pattern':f'{_date} .*Invalid user ([a-z,0-9]+) from ({_ip})','columns':['month','day','hour','minute','second','user','ip']},
    'risk':{'pattern':f'{_date} .*Failed password for ([a-z,0-9]+) from ({_ip})','columns':['month','day','hour','minute','second','user','ip']} #-- accounts at risk

}
# Month abbreviation (as captured by _date) to month number
_map = {'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6,'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12}
def risk (_content,_id='user'):
    """
    Summarize parsed log entries per value of a given field (by default the user account).
    The number of entries per value in the covered time frame is the indicator of risk, e.g. failed password attempts per account.

    :_content   list of records (dictionaries), each must carry a 'date' and the field named by _id
    :_id        name of the field to group by
    :return     data-frame with one row per distinct value of _id and the columns start_date, end_date (first/last date as strings), count (number of entries) and _id
                An empty input yields an empty data-frame with these same columns
    """
    _df  = pd.DataFrame(_content)
    if _df.empty :
        #
        # Nothing to group, there would not even be a column named after _id
        return pd.DataFrame(columns=['start_date','end_date','count',_id])
    #
    # Reducing the date column per group directly avoids reading the grouping column from within a groupby.apply
    #
    _dates = _df.groupby(_id)['date']
    _out = pd.DataFrame({
        'start_date':_dates.min().astype(str),
        'end_date':_dates.max().astype(str),
        'count':_dates.size()
    })
    # The group keys are kept as the last column and the index is made positional
    _out[_id] = _out.index
    return _out.reset_index(drop=True)
def attacks (_content):
    """
    Summarize entries per source ip address, i.e. delegate to risk with 'ip' as the grouping field.
    Behind a load balancer the ip can be ignored, the counts will then reflect break-in attempts

    :_content   list of parsed records, handed over to risk as-is
    """
    return risk(_content,_id='ip')
def login(_content):
    """
    Summarize entries per user account, i.e. delegate to risk with 'user' as the grouping field

    :_content   list of parsed records, handed over to risk as-is
    """
    return risk(_content,_id='user')
def read (**_args):
    """
    Read a log file and extract the entries matching the patterns declared in _regex

    :path   path of the auth.log files to load (required)
    :year   year given to every extracted entry, defaults to the current year (the timestamp pattern captures no year)
    :return dictionary keyed by pattern identifier (keys of _regex), each value is the list of matching entries
            An entry holds the captured columns (month, day, hour, minute, second as integers) and a 'date' (datetime)
            A pattern that never matched has no key in the result
    """
    _year = _args['year'] if 'year' in _args else datetime.now().year
    _path = _args['path']
    r = {}
    #
    # The file is streamed line by line and is closed even if reading fails
    with open(_path) as f :
        for line in f :
            for _id,_spec in _regex.items() :
                _out = re.search(_spec['pattern'],line)
                if not _out :
                    continue
                #
                # The key exists as soon as a line matched, even if the entry is discarded below
                r.setdefault(_id,[])
                try:
                    _object = dict(zip(_spec['columns'],_out.groups()))
                    # An unknown month abbreviation is left as-is, it fails the integer conversion and the entry is skipped
                    _object['month'] = _map.get(_object['month'],_object['month'])
                    for field in ['day','month','hour','minute','second'] :
                        _object[field] = int (_object[field])
                    _object['date'] = datetime ( year=_year,month=_object['month'], day=_object['day'], hour=_object['hour'],minute=_object['minute'],second=_object['second'])
                    r[_id].append(_object)
                except (KeyError,TypeError,ValueError) as e:
                    #
                    # Best effort: an entry that can not be converted (e.g. an impossible date) is reported and skipped
                    print(e)
    #
    # At this point we have essential information formatted
    #   Summarizing this information will serve as a means to compress it
    #
    return r