class Hawler

Attributes

brute[RW]
debug[RW]
depth[RW]
force[RW]
headers[RW]
help[RW]
peek[RW]
proxy[RW]
proxyport[RW]
recurse[RW]
sleep[RW]
types[RW]
verbose[RW]

Public Class Methods

new(uri, block) click to toggle source
# File lib/hawler.rb, line 69
# Create a crawler rooted at +uri+.
#
# uri::   starting location; "http://" is prepended unless the value already
#         begins with an http:// or https:// scheme
# block:: callable stored in @block for later use by the crawl
#
# All tunables start at conservative defaults: no recursion, no verbose or
# debug output, no depth limit, no delay between requests, no forcing,
# brute-forcing or peeking, no extra headers and no proxy.
#
# Side effect: installs process-wide handlers for INT, USR1 and USR2.
def initialize(uri, block)
  # \A anchors at the start of the string (^ would also match after a newline).
  unless uri =~ %r{\Ahttps?://}
    uri = "http://#{uri}"
  end

  @uri = uri
  @block = block
  @links = {}

  @recurse = false
  @verbose = false
  @debug = false
  @depth = nil
  @sleep = 0
  @done = false
  @force = false
  @brute = false
  @peek = false
  # Content types considered likely to contain links; only the keys matter.
  @types = { "text/html" => 1, "text/xml" => 1, "application/xml" => 1 }
  @headers = {}
  @proxy = nil
  @proxyport = nil

  # Register some signal handlers: halt on ctrl-c, toggle verbose on SIGUSR1
  # and toggle debug on SIGUSR2.  Ruby invokes a trap handler with the signal
  # number, so each handler must accept one argument; a zero-arity lambda
  # raises ArgumentError when the signal is delivered.
  Signal.trap("INT") do |_signo|
    @done = true
    puts "Terminating -- ctrl-c"
  end
  Signal.trap("USR1") do |_signo|
    @verbose = !@verbose
    puts "Enabling verbose mode" if @verbose
  end
  Signal.trap("USR2") do |_signo|
    @debug = !@debug
    puts "Enabling debug mode" if @debug
  end
end

Public Instance Methods

generate_hawlee(link, hawlee) click to toggle source

Simple helper to create a new Hawlee

# File lib/hawler.rb, line 64
# Build the Hawlee for a newly found +link+.
#
# link::   the location to queue
# hawlee:: the Hawlee on which +link+ was found; its URI becomes the new
#          entry's referer and the new entry sits one level deeper
#
# Returns the new Hawlee after emitting a debug message.
def generate_hawlee(link, hawlee)
  print_debug("Queuing #{link} for processing")
  found_on = hawlee.uri
  next_depth = hawlee.depth + 1
  Hawlee.new(link, found_on, next_depth)
end
start() click to toggle source

Start the Hawler.

# File lib/hawler.rb, line 100
# Begin crawling from the URI given at construction time.
#
# When recursion is disabled the depth limit is forced to 0.  The starting
# URI is replaced by whatever HawlerHelper.valid_uri returns for it; if that
# is nil or false the process exits with status 1.  Otherwise returns the
# result of hawl.
def start
  @depth = 0 unless @recurse

  @uri = HawlerHelper.valid_uri(@uri)
  exit(1) unless @uri

  hawl(@uri)
end

Private Instance Methods

do_once(uri, referer, what, block) click to toggle source

For every URI, perform the action named by what at most once; performing it consists of executing block.

# File lib/hawler.rb, line 112
# Run +block+ for +uri+ unless the action named +what+ was already performed
# on it.
#
# uri::     key into @links; a Hawlee is created for it on first sight
# referer:: referer recorded on a newly created Hawlee and used in messages
# what::    action name; the Hawlee is asked "<what>?" to see whether it was
#           already done and sent "<what>" to record it
# block::   callable executed when the action has not been done yet
#
# Returns the block's result when it runs.  When the action is skipped the
# return value is whatever print_debug returns.
def do_once(uri, referer, what, block)
  @links[uri] ||= Hawlee.new(uri, referer, 0)
  entry = @links[uri]

  if entry.send("#{what}?")
    return print_debug("Skipping #{uri} (referer #{referer}) -- '#{what}' already called")
  end

  print_verbose("Calling #{what} on #{uri} (referer #{referer})")
  entry.send("#{what}")
  block.call
end
hawl(uri) click to toggle source
# File lib/hawler.rb, line 126
# Crawl outward from +uri+.
#
# Queued links are processed first-in, first-out so the traversal is close to
# a breadth-first search.  For each queued Hawlee the loop:
#
# 1. skips it when HawlerHelper.offsite? flags it, unless @force is set;
# 2. when @peek is set, issues a HEAD request first and continues only if the
#    response carries no Content-Type or one whose media type is a key of
#    @types (redirects found here are queued and the current URI is dropped);
# 3. issues a GET and queues the target of any redirect;
# 4. on a Net::HTTPSuccess response, queues every harvested link and, when
#    @brute is set, the brute-forced candidates as well;
# 5. passes the URI, its referer and the response (nil when nothing was
#    fetched) to @block.
#
# Every step goes through do_once, so each action runs at most once per URI.
# URIs deeper than @depth (when set) are neither peeked at nor fetched.  The
# loop stops early once @done is set and sleeps @sleep seconds between URIs.
def hawl(uri)
  # An array rather than a set: insertion order is what gives the
  # breadth-first behaviour.
  links_to_process = [Hawlee.new(uri, nil, 0)]

  # True (after logging) when +hawlee+ lies beyond the configured depth limit.
  too_deep = lambda do |hawlee|
    over = @depth && hawlee.depth > @depth
    print_debug("Max recursion depth of #{@depth} at #{hawlee.uri}") if over
    over
  end

  # Queue the target of a redirect response; returns true when one was queued.
  queue_redirect = lambda do |resp, hawlee|
    return false unless resp.is_a?(Net::HTTPRedirection)
    location = resp['location']
    return false unless HawlerHelper.valid_uri(location)

    # A relative Location is relative to the URI that was requested, not to
    # the URI the crawl started from.  Fall back to the starting URI only if
    # the requested one cannot serve as a base.
    target = begin
      URI.join(hawlee.uri.to_s, location)
    rescue URI::Error
      uri.merge(location)
    end
    links_to_process << generate_hawlee(target, hawlee)
    true
  end

  until links_to_process.empty?
    cur_hawlee = links_to_process.shift

    if HawlerHelper.offsite?(uri, cur_hawlee.uri) && !@force
      print_debug("Skipping offsite URI #{cur_hawlee}")
      next
    end

    if @peek
      head_check = lambda do
        return false if too_deep.call(cur_hawlee)

        peek_response = HawlerHelper.head(cur_hawlee.uri, cur_hawlee.referer, @headers, @proxy, @proxyport)
        return false if peek_response.nil?
        return false if queue_redirect.call(peek_response, cur_hawlee)

        # Only pass this URI on for retrieval if its Content-Type is one
        # that is likely to have links in it.
        return true unless peek_response.key?("Content-Type")

        # Drop parameters such as "; charset=..." and any whitespace that
        # preceded the semicolon.
        media_type = peek_response["Content-Type"].to_s.split(";", 2).first.to_s.strip
        @types[media_type] ? true : false
      end

      next unless do_once(cur_hawlee.uri, cur_hawlee.referer, :head, head_check)
    end

    response = nil
    fetch = lambda do
      unless too_deep.call(cur_hawlee)
        response = HawlerHelper.get(cur_hawlee.uri, cur_hawlee.referer, @headers, @proxy, @proxyport)
        queue_redirect.call(response, cur_hawlee) unless response.nil?
      end
    end
    do_once(cur_hawlee.uri, cur_hawlee.referer, :get, fetch)

    if response.is_a?(Net::HTTPSuccess)
      harvest = lambda do
        HawlerHelper.harvest(cur_hawlee.uri, response.body).each do |l|
          links_to_process << generate_hawlee(l, cur_hawlee)
        end

        if @brute
          HawlerHelper.brute_from_uri(cur_hawlee.uri).each do |b|
            links_to_process << generate_hawlee(b, cur_hawlee)
          end

          # NOTE(review): unlike brute_from_uri above, the block is handed to
          # brute_from_data itself rather than to .each on its result --
          # confirm that the helper yields each candidate.
          HawlerHelper.brute_from_data(cur_hawlee.uri, response.body) do |b|
            links_to_process << generate_hawlee(b, cur_hawlee)
          end
        end
      end
      do_once(cur_hawlee.uri, cur_hawlee.referer, :harvest, harvest)
    end

    do_once(cur_hawlee.uri, cur_hawlee.referer, :analyze, lambda { @block.call(cur_hawlee.uri, cur_hawlee.referer, response) })

    break if @done
    Kernel.sleep(@sleep) if @sleep
  end
end
print_debug(msg) click to toggle source

Print debug messages if so desired

print_verbose(msg) click to toggle source

Print verbose messages if so desired