smart-top/smart/logger/__init__.py

"""
This module parses authentication logs (e.g. auth.log) to detect possible intrusion attempts and provides helpers to summarize the parsed entries.
"""

import pandas as pd
import numpy as np
import transport
import datetime
import io
import json
import re
from datetime import datetime

#
# Syslog-style timestamp prefix, e.g. "Jan  5 10:42:07" or "Mar 15 23:59:08".
# Captures, in order: month name, day, hour, minute, second.
#   - the day may be space-padded to a single digit, hence " +" and {1,2}
#   - every {2} quantifier sits INSIDE its group so that both digits are captured
#     (a quantifier placed after the group keeps only the last repetition)
#
_date  = r"(^[A-Z][a-z]{2}) +([0-9]{1,2}) ([0-9]{2}):([0-9]{2}):([0-9]{2})"
# Dotted-quad IPv4 address (octet ranges are not validated)
_ip = r"\d+\.\d+\.\d+\.\d+"
# Account name as logged by sshd: any run of non-blank characters, so names holding
# digits, upper-case letters, '_', '-' or '.' are no longer silently skipped
_user = r"\S+"
# One entry per kind of event; 'columns' names the capture groups of 'pattern', in order
_regex = {
    'login':{'pattern':f'{_date} .*Accepted password for ({_user}) from ({_ip})', 'columns':['month','day','hour','minute','second','user','ip']},
    'attacks':{'pattern':f'{_date} .*Invalid user ({_user}) from ({_ip})','columns':['month','day','hour','minute','second','user','ip']},
    'risk':{'pattern':f'{_date} .*Failed password for ({_user}) from ({_ip})','columns':['month','day','hour','minute','second','user','ip']} #-- accounts at risk

}
# Month abbreviation (as found in the timestamp) to month number
_map = {'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6,'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12}
def risk (_content,_id='user'):
    """
    Summarize parsed log entries per account (or per any other field).

    For every distinct value of the field `_id`, this reports the earliest entry date,
    the latest entry date and the number of entries. A high count of failed password
    attempts within a short time frame is what indicates an account at risk.

    :_content   list of dicts, each holding at least the field `_id` and a 'date' field
                (e.g. one of the lists returned by read)
    :_id        name of the field to group by (default 'user')
    :return     DataFrame with the columns start_date, end_date (both as strings), count and user.
                NOTE: the grouping value is always stored in the 'user' column, even when
                `_id` is something else (e.g. 'ip'); this is kept so existing callers are unaffected.
                An empty `_content` yields an empty DataFrame with these same columns.
    """
    _columns = ['start_date','end_date','count','user']
    _df  = pd.DataFrame(_content)
    if _df.empty :
        # nothing to group: grouping a column-less frame would raise a KeyError
        return pd.DataFrame(columns=_columns)
    # named aggregation computes the three figures per group without a python-level apply
    _g = _df.groupby(_id).agg(
        start_date=('date','min'),
        end_date=('date','max'),
        count=('date','size'))
    _summary = _g.reset_index(drop=True)
    _summary['user'] = _g.index
    # dates are returned as text, presumably so the result serializes easily - TODO confirm
    _summary.start_date = _summary.start_date.astype(str)
    _summary.end_date = _summary.end_date.astype(str)
    return _summary[_columns]
def attacks (_content):
    """
    Summarize entries per source ip address (first date, last date, number of entries).
    When the host sits behind a load balancer the ip itself carries little meaning, the
    counts then simply reflect the number of break-in attempts.

    :_content   list of parsed entries, each with an 'ip' and a 'date' field
    :return     whatever risk returns when grouping on 'ip'
    """
    _summary = risk(_content,_id='ip')
    return _summary
def login(_content):
    """
    Summarize entries per user account (first date, last date, number of entries).

    :_content   list of parsed entries, each with a 'user' and a 'date' field
    :return     whatever risk returns when grouping on 'user'
    """
    _summary = risk(_content,_id='user')
    return _summary
def read (**_args):
    """
    Parse an authentication log and collect the recognized entries per kind of event.

    Every line is tested against each pattern of _regex; a matching line yields a dict
    with the captured fields (month, day, hour, minute, second as integers, user and ip
    as strings) plus a 'date' field holding the corresponding datetime.

    :path   path of the auth.log file to load (required)
    :year   year assigned to every entry, because the log timestamps carry none;
            defaults to the current year
    :return dict mapping a key of _regex to the list of entries found for it; a key is
            present only if at least one line matched its pattern
    """
    _year = _args['year'] if 'year' in _args else datetime.now().year
    _path = _args['path']
    r = {}
    #
    # The file is streamed line by line and closed even if an error occurs.
    # errors='replace' : the log holds attacker-supplied text, a stray undecodable byte
    # should not prevent the rest of the file from being read
    #
    with open(_path,errors='replace') as f :
        for line in f :
            for _id,_spec in _regex.items() :
                _out = re.search(_spec['pattern'],line)
                if not _out :
                    continue
                _object = dict(zip(_spec['columns'],_out.groups()))
                # the key is registered as soon as a line matches, even if the entry is rejected below
                r.setdefault(_id,[])
                try:
                    _month = _object['month']
                    if _month in _map :
                        _object['month'] = _map[ _month ]
                    for field in ['day','month','hour','minute','second'] :
                        _object[field] = int (_object[field])
                    _object['date'] = datetime ( year=_year,month=_object['month'], day=_object['day'], hour=_object['hour'],minute=_object['minute'],second=_object['second'])
                    r[_id].append(_object)
                except (ValueError,TypeError,OverflowError) as e:
                    #
                    # best effort: an unknown month name, an impossible date (e.g. Feb 30) or
                    # an unusable year is reported and the entry is skipped
                    print(e)
    #
    # At this point we have essential information formatted
    #   Summarizing this information will serve as a means to compress it
    #
    return r