# Find the biggest files

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""Find the biggest files.

:Copyright: 2008 Jochen Kupperschmidt
:Date: 30-Jul-2008
:License: MIT
"""

from glob import iglob
from heapq import heapify, heapreplace, nlargest
from itertools import islice
from optparse import OptionParser
import os


def get_files_info(path, pattern):
    """Yield the size and name of each file along the path.

    ``path``
        The directory to walk recursively.
    ``pattern``
        A glob pattern (e.g. ``*.txt``) that is applied inside every
        directory found below ``path``.

    Yields ``(size, filename)`` tuples for regular files only (symlinks
    that resolve to a regular file are included).  Directories matching
    the pattern are ignored, and so are entries whose size cannot be
    determined, e.g. because they vanished while walking the tree.
    """
    for root, dirs, files in os.walk(path):
        # Only ``pattern`` is meant to be a pattern: wrap glob meta
        # characters of the directory name itself in brackets so they are
        # matched literally.  The drive part is kept verbatim.
        drive, tail = os.path.splitdrive(root)
        literal_root = drive + ''.join(
            '[%s]' % char if char in '*?[' else char for char in tail)

        for filename in iglob(os.path.join(literal_root, pattern)):
            # The pattern may match sub-directories as well; skip them.
            if not os.path.isfile(filename):
                continue
            try:
                size = os.path.getsize(filename)
            except OSError:
                # Removed or became unreadable in the meantime; a single
                # bad entry should not abort the whole scan.
                continue
            yield size, filename

def identify_biggest_files(files, limit):
    """Determine the biggest files.

    ``files``
        An iterable of ``(size, filename)`` tuples.  Both one-shot
        iterators (e.g. generators) and re-iterable sequences (e.g. lists)
        are accepted.
    ``limit``
        The maximum number of files to return.  A lower value might
        result in slightly less memory usage.  A value of zero or less
        produces an empty list.

    Returns a list of at most ``limit`` tuples, sorted from biggest to
    smallest.  Files of equal size are ordered by name, descending, as
    the tuples are compared as a whole.
    """
    # ``nlargest`` iterates over ``files`` exactly once, so items are never
    # counted twice, and it copes with non-positive limits on its own.
    return nlargest(limit, files)

def main():
    """Parse the command line, search the given path and print the result.

    Exactly one positional argument, the directory to search, is expected;
    otherwise the help text is shown.  Invalid values (a ``--max-files``
    below one, or a path that is not a directory) are reported through the
    option parser's error handling.
    """
    # Utilize an option/argument parser.
    parser = OptionParser(usage='%prog [options] <path>')
    parser.add_option('-m', '--max-files', dest='max_files',
        type='int', default=10,
        help='maximum number of files to show (default: 10)')
    parser.add_option('-p', '--pattern', dest='pattern',
        default='*', help='a pattern to narrow down the search, e.g. "*.txt"\n'
            ' NOTE: The pattern might need to be escaped, possibly using'
            ' quotes or backslashes, depending on your shell.')
    opts, args = parser.parse_args()
    if len(args) != 1:
        parser.print_help()
        parser.exit()

    # Reject input that could only produce a crash or a misleading
    # "No files were found." message further down.
    if opts.max_files < 1:
        parser.error('--max-files must be at least 1')
    if not os.path.isdir(args[0]):
        parser.error('not a directory: %s' % args[0])

    files = get_files_info(args[0], opts.pattern)
    biggest_files = identify_biggest_files(files, opts.max_files)

    # Display biggest files.
    if biggest_files:
        # The list is sorted biggest first, so the first size is the widest
        # one and determines the width of the right-aligned size column.
        tmpl = ' %%%dd  %%s' % len(str(biggest_files[0][0]))
        for file_tuple in biggest_files:
            # Single-argument ``print(...)`` works on Python 2 and 3 alike.
            print(tmpl % file_tuple)
    else:
        print('No files were found.')

# Run the command line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()