Leechr - a (not yet) multi-thread mass downloader (using FTP, so far)

leechr-preview.png

Console output of a download in progress.

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Leechr
======

A (not yet) multi-thread mass downloader (using FTP, so far).

Python 2.5 or later is required.

Future ideas
------------

- Implement recursive directory downloads.  (Should be easy, but has to be
  configurable.)
- Implement resuming of downloads.
- Use threads for multiple downloads in parallel.  (This is already prepared
  and might even work, but would probably wreck the visualization; a new GUI
  interface is required.)
- Expand to other methods besides FTP; most important is HTTP.  (Compiling the
  list of files to be downloaded requires some rethinking, probably scraping a
  website for all URLs [of a certain pattern] in the case of HTTP.)

Unresolved issues
-----------------

- ``KeyboardInterrupt`` handling doesn't really work.
- Sometimes the connection hangs for unexplained reasons when trying to
  retrieve the first file.
- What about symlinks?

:Copyright: 2007-2008 Jochen Kupperschmidt
:Date: 11-Apr-2007
:License: GNU General Public License, Version 2
"""

from __future__ import with_statement
import ftplib
from itertools import cycle
import os
import Queue
import socket
import sys
from sys import argv, exit
import threading
import urlparse


class FileList(list):
    """A list of (filename, size) tuples.

    Instances are filled by passing ``parse_line`` as the per-line callback
    of a directory listing; lines are expected in the common Unix ``ls -l``
    layout (nine whitespace-separated columns, size in the fifth, name in
    the ninth).
    """

    def parse_line(self, line):
        """Parse a directory listing line and save a tuple (filename, size).

        Directories are skipped, and so is anything that does not look like
        a file entry (blank lines, summary lines such as ``total 42``,
        entries whose size column is not a number), so that one odd line
        does not abort the whole listing.
        """
        # Skip blank lines; indexing them below would fail.
        if not line.strip():
            return
        # Skip directories.
        if line[0] == 'd':
            return
        # Limit the number of splits so file names containing whitespace
        # stay in one piece.
        columns = line.split(None, 8)
        if len(columns) < 9:
            return
        try:
            size = int(columns[4])
        except ValueError:
            # E.g. device nodes, which list "major, minor" instead of a size.
            return
        self.append((columns[8], size))


class FileDownload(file):
    """A file that is to be, currently is or has been downloaded.

    Behaves like a regular ``file`` object, but additionally keeps track of
    how much data has been received and renders a one-line progress display
    (size, percentage, spinner) on standard output.
    """

    def __init__(self, name, target_size, *args, **kwargs):
        """Open the local file and print the first part of its status line.

        ``name`` and any further arguments are passed on to ``file``;
        ``target_size`` is the expected total size in bytes.
        """
        file.__init__(self, name, *args, **kwargs)
        self.target_size = int(target_size)
        self.size = 0
        self.percent = 0
        # Length of the status text printed last, i.e. how many characters
        # have to be erased before printing the next one.
        self.last_status_len = 0
        self.spin = cycle(r'-\|/')
        self.display('\n * %-52s  ' % os.path.basename(self.name))

    def retr_block(self, block):
        """Account for a received block, refresh the display and store it."""
        self.size += float(len(block))
        if self.target_size > 0:
            self.percent = (self.size / self.target_size) * 100
        else:
            # The announced size was zero, yet data arrived; avoid dividing
            # by zero and report the transfer as complete.
            self.percent = 100.0
        self.update_display()
        self.write(block)

    def update_display(self, spinner=True):
        """Update the display to show the current status."""
        # Move the cursor back over the previous status text.
        self.display('\b' * self.last_status_len)
        spinner = self.spin.next() if spinner else ' '
        status = '%7.2f MB (%5.1f%%) %c' % (
            self.size / 1024 / 1024, self.percent, spinner)
        self.last_status_len = len(status)
        self.display(status)

    def close(self):
        """Print the final status (without spinner) and close the file."""
        self.update_display(spinner=False)
        self.display('\n')
        file.close(self)

    def display(self, s):
        """Write ``s`` to standard output and make it visible immediately."""
        sys.stdout.write(s)
        # The status text contains no newline, so without an explicit flush
        # it could sit in the output buffer instead of showing progress.
        sys.stdout.flush()


class DownloadSession(ftplib.FTP):
    """A FTP mass download session."""

    def __init__(self, *args, **kwargs):
        ftplib.FTP.__init__(self, *args, **kwargs)
        self.queue = Queue.Queue()

    def start(self, path, target_dir, offset=0, num_threads=1):
        """Start the actual download."""
        self.cwd(path)
        self.target_dir = target_dir
        print 'Starting download from ftp://%s:%d%s' % (
            self.host, self.port, path)
        files = self.get_file_list()[offset:]

        # Start downloader threads.
        for i in range(num_threads):
            t = threading.Thread(target=self.process_queue)
            t.setDaemon(True)
            t.start()

        # Fill queue.
        map(self.queue.put, files)
        # Block until all tasks are done.
        self.queue.join()
        print 'Done.'

    def get_file_list(self):
        """Retrieve a list of available files with meta data."""
        files = FileList()
        self.retrlines('LIST', files.parse_line)
        return files

    def process_queue(self):
        """Retrieve all files waiting in the queue."""
        while True:
            try:
                filename, size = self.queue.get()
            except Queue.Empty:
                break
            self.retr_file(filename, size)
            self.queue.task_done()

    def retr_file(self, filename, size):
        """Retrieve a file block-wise and save it to a local file."""
        target = os.path.join(self.target_dir, filename)
        with FileDownload(target, size, 'wb') as f:
            self.retrbinary('RETR ' + filename, f.retr_block)


def main():
    if len(argv) not in (2, 3, 4):
        print 'usage: %s <ftp:// URL> <target directory> [offset]' % argv[0]
        exit(2)

    scheme, host, path = urlparse.urlsplit(argv[1])[:3]
    if scheme != 'ftp':
        print 'URL has to start with ftp://'
        exit(2)

    target_dir = argv[2]
    if not os.path.exists(target_dir):
        print 'Target directory "%s" does not exist.' % target_dir
        exit(1)

    offset = int(argv[3]) if (len(argv) == 4) else 0
    try:
        ds = DownloadSession(host)
        ds.login()  # anonymous
        ds.start(path, target_dir, offset)
        ds.quit()
    except socket.error, e:
        print '\nSocket error:', e[1]
    except KeyboardInterrupt:
        print '\nCtrl-C pressed, aborting.'

# Run the downloader only when executed as a script, not when imported.
if __name__ == '__main__':
    main()