"""
Retrieve summary information about a folder: number of entries, size on disk
and a hash of its content.
"""
import subprocess
import sys
import re
import os
import pandas as pd
import io
import datetime
import glob

# A human-readable size as printed by ``du -h``: a number (the decimal
# separator may be '.' or ',') optionally followed by a unit suffix.
_SIZE_PATTERN = re.compile(r'^\s*([0-9]+(?:[.,][0-9]+)?)\s*([A-Za-z]*)\s*$')

# First letter of the unit suffix -> label reported to the caller.
# A missing suffix is reported as plain bytes.
_UNIT_LABELS = {
    '': 'B',
    'B': 'B',
    'K': 'KB',
    'M': 'MB',
    'G': 'GB',
    'T': 'TB',
    'P': 'PB',
    'E': 'EB',
}

# Shell commands run by ``read``; ``:path`` is replaced with the caller's path.
# NOTE(review): ``sort -z`` splits records on NUL bytes while ``md5sum`` emits
# newline-terminated lines, so the hash presumably follows the order in which
# ``find`` lists files. Left unchanged so previously stored hashes stay
# comparable -- confirm the intent before altering it.
_COMMANDS = {
    "size": "du -sh :path",
    "content": "find :path -type f -exec md5sum {} + | sort -z|md5sum",
}


class Util:
    """Parsers for the raw text produced by the shell commands in ``read``."""

    def size(self, stream):
        """
        Parse a human-readable size such as ``4.0K`` or ``1.5G``.

        :param stream: the size column of a ``du -sh`` output line
        :return: ``{"size": <float>, "units": <label>}`` where the label is one
                 of B, KB, MB, GB, TB, PB, EB
        :raises ValueError: if the text is empty, is not a number followed by
                 an optional unit, or carries an unknown unit suffix
        """
        match = _SIZE_PATTERN.match(stream or '')
        if match is None:
            raise ValueError("unable to parse a size from %r" % (stream,))
        number, suffix = match.groups()
        # Only the first letter matters: 'K', 'k' and 'KiB' all mean kilo.
        letter = suffix[:1].upper()
        if letter not in _UNIT_LABELS:
            raise ValueError("unknown size unit in %r" % (stream,))
        # Accept a decimal comma in case the command ran under such a locale.
        value = float(number.replace(',', '.'))
        return {"size": value, "units": _UNIT_LABELS[letter]}

    def content(self, stream):
        """
        Extract the hash from an ``md5sum`` output line.

        :param stream: a line whose first space-separated token is the hash
        :return: ``{"content": <hash>}``
        """
        return {"content": stream.split(' ')[0].strip()}


def _first_line(command):
    """Run ``command`` through the shell and return the first line of stdout."""
    completed = subprocess.run(
        command, shell=True, stdout=subprocess.PIPE, encoding='utf-8')
    return completed.stdout.split('\n')[0]


def read(**args):
    """
    Collect size, content hash and entry count for a folder.

    The path is substituted unquoted into shell commands, which is what lets
    it contain shell wildcard patterns. When a pattern expands to several
    entries only the first line of the ``du`` output is used for the size.
    NOTE: because of that substitution the path must come from a trusted
    source; anything in it is interpreted by the shell.

    :param path: (keyword, required) folder to inspect
    :return: a single-row ``pandas.DataFrame`` with the columns size, units,
             content, path, files, name, node, date and time
    :raises KeyError: if ``path`` is not supplied
    :raises ValueError: if the ``du`` output cannot be parsed, e.g. because
             the path does not exist
    """
    path = args['path']
    util = Util()

    r = {}
    size_line = _first_line(_COMMANDS['size'].replace(':path', path))
    # du prints "<size>\t<path>"; only the size column is of interest.
    r.update(util.size(size_line.split('\t')[0]))
    hash_line = _first_line(_COMMANDS['content'].replace(':path', path))
    r.update(util.content(hash_line))

    root = path if path.endswith('/') else path + os.sep
    # Take a single timestamp so date and time cannot straddle midnight.
    now = datetime.datetime.now()

    r['path'] = path
    # Counts every path matched by the recursive pattern, consumed lazily.
    r['files'] = sum(1 for _ in glob.iglob(root + '**/**', recursive=True))
    # Drop trailing separators first so "/data/folder/" is named "folder".
    r['name'] = path.rstrip(os.sep).split(os.sep)[-1]
    r['node'] = os.uname()[1]
    r['date'] = now.strftime('%m-%d-%Y')
    r['time'] = now.strftime('%H:%M:%S')
    return pd.DataFrame([r])