Count hits to matching URLs in Apache access logs

#!/usr/bin/ruby

# Skim through Apache access log files and count the number of hits to URLs
# matching a given pattern.
#
# Copyright (c) 2008 Jochen Kupperschmidt <http://homework.nwsnet.de/>
# Version: 07-Aug-2008
# Released under the terms of the MIT License.

require 'zlib'


# Pattern applied to every log line; adjust it to the URLs of interest.  Hits
# are tallied per distinct value of the capture group, so with this default
# each GET request is counted under its "/media/videos/YYYY-MM-DD" prefix.
regex = /GET (\/media\/videos\/\d{4}-\d{2}-\d{2})/
# TODO: It might be nice to be able to pass this expression as a command line
#       argument.


# Counts, across one or more access log files, how often each URL matching a
# given pattern was requested.
class HitCounter

  def initialize(regex)
    # regex is matched against every log line; the text captured by its last
    # matched group becomes the key under which that hit is counted.
    @regex = regex
    # Unknown keys start at zero so they can be incremented right away.
    @hits = Hash.new(0)
  end

  def parse_file(fname)
    # Read a file, line by line.  If the file name ends in '.gz', treat it as
    # gzip-compressed and decompress it on the fly.  Count the occurrences of
    # each matched URL separately.
    puts "Parsing #{fname} ..."
    reader = fname =~ /\.gz\z/ ? Zlib::GzipReader : File
    # The block form closes the handle as soon as the file has been read (or
    # an error/interrupt occurs) instead of leaking one descriptor per file.
    reader.open(fname) do |f|
      f.each_line do |line|
        # $+ is the last group that took part in the most recent match.
        @hits[$+] += 1 if line =~ @regex
      end
    end
  end

  def show_results
    # Display each matched URL together with the number of times it was hit.
    puts "\nResults:"
    @hits.each { |url, count| puts "%s %8d hits" % [url, count] }
  end

  def self.batch(regex, fnames)
    # A shortcut class method to parse multiple files and display the result.
    # When no file names are passed in, every 'access.log*' file in the
    # current directory is analyzed; if there are none either, the process
    # exits with status 2.
    fnames = Dir['access.log*'] if fnames.empty?
    if fnames.empty?
      STDERR.puts "No files given."
      exit 2
    end
    puts "Going to analyze #{fnames.length} files ..."
    counter = new(regex)
    begin
      fnames.each { |fname| counter.parse_file(fname) }
      counter.show_results
    rescue Interrupt
      # Ctrl-C while parsing: stop quietly without printing partial results.
      puts "\nAborted."
    end
  end

end


# Analyze the log files named on the command line (batch falls back to a
# default set of files when none are given).
HitCounter.batch(regex, ARGV)