I have recently found myself employed in a part-time position as a developer/admin for our University's Open Source lab. One of my tasks was to write a script to analyse the NFS-mounted home directories of our 3000+ users. This info is going to be used to design a daily/weekly automated cleanup policy, which isn't my job.
Anyways, here's the code for v0.1.1 (I fixed the sorting problem v0.1 had). It'll get cleaner and easier to use, but for now it does the job and I can move on to tasks with a higher priority than "fiddle and prettify".
#!/usr/bin/python
"""Summarise 'du' output for a set of home directories.

The script reads a file of 'du' output, builds a directory tree from it,
aggregates the sub-directories of every home directory by name (how many
homes contain a directory of that name and their combined size), lumps
entries below the configured thresholds into '*others' nodes and prints
the result as an indented tree.
"""

# Some constants:
inFileName = "duhomes.txt"  # Where we're reading the data from.
                            # This should be output from 'du'.
basePath = "/home/export/"  # Anything at the beginning of the path to strip.
                            # The resulting path must not have a leading
                            # '/', as this would confuse the tree generator.

# Display parameters:
sortBy = 'size'
sortOrder = 'descending'
minsize = 1024
mincount = 10
size = 102400
count = 25
depth = 2


# Grab the file structure into something a little easier to work with.
# A tree structure works well, and a dictionary is well suited to building
# one.
def buildTree(fileName):
    """Parse the 'du' output in *fileName* into a nested dictionary tree.

    Each line is expected to hold an integer size, whitespace, then a path.
    The module-level ``basePath`` prefix is stripped from every path.

    Returns a node ``{'children': {name: node, ...}}``; every node that had
    its own line in the input also carries a ``'size'`` key. The line whose
    path is ``basePath`` itself (with or without the trailing '/') sets the
    size of the root node.

    Raises ValueError if a size field is not an integer, and whatever
    ``open`` raises if the file cannot be read.
    """
    duTree = {'children': {}}
    # 'du /home/export' reports the root without the trailing slash, so
    # recognise that spelling as the top of the tree as well.
    rootPath = basePath.rstrip("/")

    def addNode(nodeSize, path):
        if path == "":
            # Special case: top of tree
            duTree['size'] = nodeSize
            return
        currentNode = duTree
        for currentDir in path.split("/"):
            currentNode = currentNode['children'].setdefault(
                currentDir, {'children': {}})
        currentNode['size'] = nodeSize

    with open(fileName) as inFile:
        for line in inFile:
            # Split on the first run of whitespace only, so that directory
            # names containing spaces survive intact.
            fields = line.rstrip("\r\n").split(None, 1)
            if len(fields) < 2:
                continue  # blank or malformed line
            sizeText, path = fields
            if path == rootPath:
                path = ""
            elif path.startswith(basePath):
                # Strip the prefix only; never touch later occurrences.
                path = path[len(basePath):]
            addNode(int(sizeText), path)
    return duTree


def walkHomes(tree):
    """Aggregate the contents of every home directory by directory name.

    *tree* is a tree as returned by ``buildTree``; its first level is taken
    to be the home directories. For each relative path below a home, the
    result records ``'count'`` (number of homes containing that path) and
    ``'size'`` (sum of the sizes), in the same nested
    ``{'children': {...}}`` layout.
    """
    dirStats = {'children': {}}

    def recWH(dirName, dirInfo, parentStats):
        currentDir = parentStats['children'].setdefault(
            dirName, {'count': 0, 'size': 0, 'children': {}})
        currentDir['count'] += 1
        # Nodes that were only created implicitly (no line of their own in
        # the input) have no size; count them as 0 rather than crash.
        currentDir['size'] += dirInfo.get('size', 0)
        for sdName, sdStuff in dirInfo['children'].items():
            recWH(sdName, sdStuff, currentDir)

    for homedir in tree['children'].values():
        for dirName, dirInfo in homedir['children'].items():
            recWH(dirName, dirInfo, dirStats)
    return dirStats


def displayDirTree(dirTree, depth=None, sortBy='size', sortOrder='descending'):
    """Print *dirTree* as an indented listing, one directory per line.

    Each line reads ``name (count, size)`` and is indented by one space per
    level. Siblings are ordered by the *sortBy* key of their node (ties
    broken by name); any *sortOrder* other than ``'ascending'`` gives
    descending order. If *depth* is not None, nodes at level *depth* are
    still printed but their children are not.
    """
    descending = sortOrder != 'ascending'

    def sortDict(adict):
        return sorted(adict.items(),
                      key=lambda item: (item[1][sortBy], item[0]),
                      reverse=descending)

    def recDDT(level, name, subtree):
        # Single parenthesised argument: prints identically on Python 2 and 3.
        print("%s%s (%s, %s)" % (" " * level, name,
                                 subtree['count'], subtree['size']))
        if depth is not None and level >= depth:
            return
        for dirName, data in sortDict(subtree['children']):
            recDDT(level + 1, dirName, data)

    for dirName, data in sortDict(dirTree['children']):
        recDDT(0, dirName, data)


def filterOutput(dirTree, mincount=1, minsize=0, count=20, size=1024):
    """Return a copy of *dirTree* with small entries lumped together.

    An entry is lumped when its size is below *size* or its count is below
    *count*, or when it is below both *minsize* and *mincount*. Lumped
    entries are added into a sibling node named ``'*others'`` and their
    children are dropped; all other entries are copied and filtered
    recursively. *dirTree* itself is not modified.
    """
    def recFO(dirName, dirInfo, dirSubtree):
        # NOTE(review): whenever minsize <= size the first test is implied
        # by the second, so minsize/mincount have no effect. The 'and'/'or'
        # may have been swapped - confirm the intended policy before
        # changing; behaviour is kept as originally written.
        lump = ((dirInfo['size'] < minsize and dirInfo['count'] < mincount)
                or dirInfo['size'] < size
                or dirInfo['count'] < count)
        siblings = dirSubtree['children']
        if lump:
            others = siblings.setdefault(
                '*others', {'count': 0, 'size': 0, 'children': {}})
            others['size'] += dirInfo['size']
            others['count'] += dirInfo['count']
            return
        kept = {'count': dirInfo['count'], 'size': dirInfo['size'],
                'children': {}}
        siblings[dirName] = kept
        for name, info in dirInfo['children'].items():
            recFO(name, info, kept)

    filteredTree = {'children': {}}
    for name, info in dirTree['children'].items():
        recFO(name, info, filteredTree)
    return filteredTree


def main():
    """Run the analysis with the module-level settings and print the tree."""
    duTree = buildTree(inFileName)
    dirTree = walkHomes(duTree)
    filteredTree = filterOutput(dirTree, mincount=mincount, minsize=minsize,
                                count=count, size=size)
    displayDirTree(filteredTree, depth=depth, sortBy=sortBy,
                   sortOrder=sortOrder)


if __name__ == "__main__":
    main()