#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""Find the biggest files.

:Copyright: 2008 Jochen Kupperschmidt
:Date: 30-Jul-2008
:License: MIT
"""

import fnmatch
from heapq import heapify, heapreplace
from itertools import islice
from optparse import OptionParser
import os


def get_files_info(path, pattern):
    """Yield the size and name of each matching file below ``path``.

    ``path``
        The directory to walk recursively.

    ``pattern``
        A shell-style wildcard pattern that is matched against the bare
        file name (not the full path), e.g. ``*.txt``.

    Yields ``(size, filename)`` tuples where ``filename`` is the file name
    joined onto the directory it was found in.

    Only the non-directory entries reported by ``os.walk()`` are examined,
    so directories themselves are never yielded.  Entries whose size cannot
    be determined (e.g. broken symlinks or files removed while scanning)
    are skipped instead of aborting the scan.
    """
    # Same convention as ``glob``: names starting with a dot only match if
    # the pattern itself starts with a dot.
    match_hidden = pattern.startswith('.')

    for root, dirs, files in os.walk(path):
        # Match against the names only, so that wildcard characters in
        # directory names are never interpreted as part of the pattern.
        for name in fnmatch.filter(files, pattern):
            if name.startswith('.') and not match_hidden:
                continue
            filename = os.path.join(root, name)
            try:
                size = os.path.getsize(filename)
            except OSError:
                continue
            yield size, filename


def identify_biggest_files(files, limit):
    """Determine the biggest files.

    ``files``
        An iterable of ``(size, filename)`` tuples.

    ``limit``
        The maximum number of files to keep on the heap.  A lower value
        might result in slightly less memory usage.

    Return a list of at most ``limit`` tuples, biggest first.  The list is
    empty if ``limit`` is less than one or ``files`` yields nothing.
    """
    if limit < 1:
        return []

    # Obtain a single iterator so that the items consumed for the initial
    # heap are not visited again below when ``files`` is a re-iterable
    # container such as a list.
    files = iter(files)

    # Create the initial heap.
    biggest = list(islice(files, limit))
    heapify(biggest)

    # Process remaining items; ``biggest[0]`` is the smallest kept so far.
    for file_tuple in files:
        if file_tuple > biggest[0]:
            heapreplace(biggest, file_tuple)

    # Sort and return the heap items.
    biggest.sort(reverse=True)
    return biggest


def main():
    """Parse the command line and print the biggest files found."""
    # Utilize an option/argument parser.
    parser = OptionParser(usage='%prog [options] <path>')
    parser.add_option('-m', '--max-files', dest='max_files', type='int',
        default=10, help='maximum number of files to show (default: 10)')
    parser.add_option('-p', '--pattern', dest='pattern', default='*',
        help='a pattern to narrow down the search, e.g. "*.txt"\n'
            ' NOTE: The pattern might need to be escaped, possibly using'
            ' quotes or backslashes, depending on your shell.')
    opts, args = parser.parse_args()
    if len(args) != 1:
        parser.print_help()
        parser.exit()

    # Reject input that cannot produce a meaningful listing.
    if opts.max_files < 1:
        parser.error('the maximum number of files must be at least 1')
    if not os.path.isdir(args[0]):
        parser.error('not a directory: %s' % args[0])

    files = get_files_info(args[0], opts.pattern)
    biggest_files = identify_biggest_files(files, opts.max_files)

    # Display biggest files.
    if biggest_files:
        # The first entry has the largest size, so its number of digits
        # is the column width needed to right-align all sizes.
        tmpl = ' %%%dd %%s' % len(str(biggest_files[0][0]))
        for file_tuple in biggest_files:
            # Parenthesized single-argument form works on Python 2 and 3.
            print(tmpl % file_tuple)
    else:
        print('No files were found.')


if __name__ == '__main__':
    main()