# webfetcher.rb
# 
# Author:       Niklas Frykholm (niklas@kagi.com)
# Version:      0.5.5
# Date:         $Date: 2002/01/16 01:16:12 $
#
# LEFT TO DO
#
# BUGS
#
# FIXED IN THIS VERSION
#
# FOR THE FUTURE
#
#   Pages protected by cookies and stupid stuff like that
#   https?
#   ftp?

require 'net/http'
require 'ftools'

## A module that facilitates fetching documents (images and HTML pages) from the
## Internet using the HTTP protocol. It makes it easy to download all the images
## on a page or an entire tree of documents rooted at a certain point.
##
##      require 'webfetcher'
##     
##      include WebFetcher
##
##      book = Page.url('http://www.rubycentral.com/book/') 
##      pages = book.recurse(10).save('pickaxe')
module WebFetcher

    ## This exception is raised if a link containing an unhandled scheme
    ## such as @tt mailto or @tt ftp is encountered. This exception is
    ## not propagated by methods such as @tt extract and @tt recurse.
    ## Instead, it is captured internally and ignored.
    class UnhandledSchemeError < ArgumentError
        ## The scheme of the offending url, and the url itself.
        attr_reader :scheme, :url

        # Remembers the scheme and the url and derives the exception
        # message from the url.
        def initialize(scheme, url)
            @scheme = scheme
            @url = url
            super("unhandled scheme #{@url}")
        end
    end ## UnhandledSchemeError

    ## This exception is raised when a document cannot be reached.
    ## It is propagated to the exception handler of the class.
    class UnreachableDocumentError < RuntimeError
        ## The url that we tried to reach, and the response the server
        ## returned for it.
        attr_reader :url, :resp

        # The message is built from the url plus the code and msg of the
        # response.
        def initialize(url, resp)
            @url = url
            @resp = resp
            code, msg = @resp.code, @resp.msg
            super("unreachable document #{url} #{code} #{msg}")
        end
    end ## UnreachableDocumentError

    # A parser for extracting tags from a HTML page (not a full blown SGML
    # parser).
    class TagParser
        # Matches anything that looks like a tag. The single capture is the
        # text between the angle brackets.
        TAG_PATTERN = /<([^<>]*)>/

        # Prepares to parse the tags in the specified string. Every tag is
        # parsed right away and stored as a [name, attributes] pair.
        def initialize(page)
            @tags = []
            page.scan(TAG_PATTERN) {|tag,|
                @tags << self.class.parse_tag(tag)
            }
        end

        # Extracts every tag of the specified type ('a', 'img', etc).
        # If match is nil, every tag is yielded.
        # For each tag, (tag, attr) is yielded.
        # Tag names and attribute keys are converted to lowercase.
        def each(match = nil)
            @tags.each {|name, attr| yield(name, attr) if !match || name == match }
        end

        # Convenience form: parses content and iterates over its tags in
        # one call. Arguments are as for the instance method each.
        def TagParser.each(content, match = nil, &p)
            TagParser.new(content).each(match, &p)
        end

        # Parse a tag into a name and a hash of attributes.
        # Names and attribute keys are converted to lowercase.
        # A tag without any name (the text of "<>" or "< >") yields the
        # empty string as name and no attributes.
        def TagParser.parse_tag(tag)
            name, attstr = tag.split(' ', 2)
            # split returns nothing for an empty or blank tag; without this
            # the downcase at the end would fail on nil.
            name ||= ''
            attr = {}
            while attstr && !attstr[/\A\s*\Z/]
                if attstr[/\A\s*(\w+)\s*=\s*'([^']*)'(.*)/m] #'
                    # attr='value' quoted attribute
                    key, val, attstr = $1, $2, $3
                elsif attstr[/\A\s*(\w+)\s*=\s*"([^"]*)"(.*)/m] #"
                    # attr="value" quoted attribute
                    key, val, attstr = $1, $2, $3
                elsif attstr[/\A\s*(\w+)\s*=\s*(\S+)(.*)/m]
                    # attr = value
                    key, val, attstr = $1, $2, $3
                else
                    # attr (no value): the lowercased key doubles as value,
                    # which make_tag recognizes and writes back without "=".
                    key, attstr = attstr.split(' ', 2)
                    val = key.downcase
                end
                attr[key.downcase] = val
            end
            [name.downcase, attr]
        end

        # Translate tag attributes. Tag name and attributes are yielded and
        # the return value specifies new attributes. Return nil to leave
        # unchanged.
        def TagParser.translate(page)
            page.gsub(TAG_PATTERN) {
                tag = $1
                name, attr = parse_tag(tag)
                res = yield(name, attr)
                "<" + (res ? make_tag(name, res) : tag) + ">"
            }
        end

        # Builds a tag (not including <>) from tag name and a hash of
        # attributes.
        #
        # An attribute whose value equals its key is written without a
        # value. Values consisting of word characters only are written
        # unquoted. Values containing double quotes but no single quotes
        # are wrapped in single quotes. Everything else is wrapped in
        # double quotes, with embedded double quotes written as &quot;
        # (HTML has no backslash escapes).
        def TagParser.make_tag(name, attr={})
            tag = "#{name}"
            attr.each {|key, val|
                tag << " #{key}"
                if key != val
                    tag << "="
                    if val[/\A\w+\z/]
                        tag << val
                    elsif val['"'] && !val['\'']
                        tag << "'#{val}'"
                    else
                        val = val.gsub(/"/, '&quot;')
                        tag << "\"#{val}\""
                    end
                end
            }
            tag
        end
    end
    
    ## This class represents a downloadable HTTP document. 
    class Page
        ## The name of the host where the document is located.
        ##
        ##      Page.url('http://a.a.a/x/x.html').host   >> 'a.a.a'
        attr_reader :host
        
        ## The port on the host where the document is located.
        ##
        ##      Page.url('http://a.a.a:1200/x/x.html').port   >> 1200
        attr_reader :port

        ## The path of this document, including the leading slash.
        ##
        ##      Page.url('http://a.a.a:1200/x/x.html').path   >> '/x/x.html'
        attr_reader :path

        ## If the document was derived from a #-link, this is the part
        ## after the #.
        attr_reader :name

        ## If the document was derived from a tag, this is the tag name
        ## (i.e., @tt 'img' for an image tag). (The tag is always in lower case.)
        ## If the document was not generated from a link, tag is @tt nil.
        attr_reader :tag

        ## If the document was generated from a tag, this returns a hash
        ## with the attributes of the tag. Hash keys are always lower case.
        ## If the document was not generated from a tag, @tt attr is @tt {}.
        attr_reader :attr

        ## The name of the proxy to use for downloading documents or @tt nil
        ## if you do not want to use a proxy.
        attr_accessor :proxy_host
        
        ## The port of the proxy that should be used to download documents.
        attr_accessor :proxy_port

        ## Sets the progress monitor (a Proc object) for this document.
        ## The progress monitor is called when the page is downloaded, with
        ## the page as argument.
        ##
        ##      p.progress_monitor = proc {|p| puts "Downloading #{p}"}
        ##
        ## If you do not specify a progress monitor a default progress
        ## monitor is used which prints @ttline "GET <i>page_name</i>"
        ## to @tt $stderr.
        ## To disable this, set @tt progress_monitor to @tt nil.
        ##
        ## Note that the @tt progress_monitor property is inherited by the
        ## pages created through @link extract or @link link, so the same
        ## @tt progress_monitor will monitor the download of those pages too.
        # def progress_monitor= proc
        attr_writer :progress_monitor
        
        ## Sets the error handler for this document. The error handler is
        ## called if an exception occurs during a lengthy operation,
        ## such as @link recurse. This can happen, for example, if you
        ## get a network error. In that case, raising an exception is not
        ## always a good idea, since it can abort a very lengthy download.
        ## So instead of raising an exception, the program calls the error
        ## handler (a @tt Proc) with the exception.
        ##
        ##       p.error_handler = proc {|x| raise unless x.class==RuntimeError}
        ##
        ## The default error handler prints the error to @tt $stderr, but does
        ## not abort the download. If you want to change this behavior, you
        ## must set @tt error_handler.
        ## Note that the error handler is inherited when you do @link link or 
        ## @link extract.
        # def error_handler= proc
        attr_writer :error_handler

        # Replaces the cookie store of this page (an array of cookie
        # strings). The link method uses this writer to hand a copy of the
        # parent page's cookies to every page it creates.
        attr_writer :cookies


        # Specifies which attributes of which tags should be searched for URLs
        URLATTR = {'a' => 'href', 'img' => 'src', 'layer' => 'src',
            'bgsound' => 'src', 'area' => 'href', 'embed' => 'src',
            'body' => 'background', 'frame' => 'src', 'script' => 'src',
            'applet' => 'codebase', 'link' => 'href'}

        # A list with type arguments: every tag name known to URLATTR plus
        # the two "everything" keywords.
        TYPES = URLATTR.keys << :all_types << :all

        # A list with location arguments
        LOCATIONS = [:external, :server, :dir, :subdir, :all_locations, :all]

        # Maps mime types to file endings
        MIME = {'application/mac-binhex40' => 'hqx', 'application/msword' => 'doc',
                'application/pdf' => 'pdf', 'application/postscript' => 'ps',
                'application/rtf' => 'rtf', 'application/x-dvi' => 'dvi',
                'application/x-latex' => 'tex', 'application/x-stuffit' => 'sit',
                'application/x-tex' => 'tex', 'application/troff' => 'troff',
                'application/x-zip' => 'zip', 'application/x-gzip' => 'gz',
                'application/x-tar' => 'tar', 'application/x-excel' => 'xls',
                'application/x-compress' => 'Z', 'audio/basic' => 'au',
                'audio/midi' => 'midi', 'audio/x-aiff' => 'aiff',
                'audio/x-wav' => 'wav', 'image/gif' => 'gif', 'image/jpeg' => 'jpeg',
                'image/png' => 'png', 'image/tiff' => 'tiff', 'text/html' => 'html',
                'text/plain' => 'txt', 'video/mpeg' => 'mpeg', 'video/mov' => 'mov',
                'video/x-msvideo' => 'avi', 'audio/x-mpeg' => 'mp3'}

        ## Creates a new page at the specified @var host, @var path and @var port.
        ## @var proxy_host and @var proxy_port specify the location of the proxy.
        ## Use @tt nil for @var proxy_host if you do not want to use a proxy.
        ## @var tag and @var attr are used to set the @link tag and @link attr
        ## attributes. @var name sets the name attribute.
        ##
        ## If a block is given it is used as @link progress_monitor=, otherwise
        ## the default progress monitor is used.
        ##
        ##      p = Page.new('www.acc.umu.se', '/')
        ##
        ## Most of the time it is probably simpler to use @link Page.url.
        def initialize(host, path='/', port=80, proxy_host=nil,
                proxy_port=8080, tag = nil, attr={}, name=nil, &progress_monitor)
            # Where the document lives
            @host = host
            @path = path
            @port = port

            # Proxy to go through (no proxy when the host is nil)
            @proxy_host = proxy_host
            @proxy_port = proxy_port

            # The tag this page was derived from, if any
            @tag = tag
            @attr = attr
            @name = name

            # Callbacks: the block, when given, replaces the default monitor
            if progress_monitor
                @progress_monitor = progress_monitor
            else
                @progress_monitor = proc {|x| $stderr.puts "GET #{x}"}
            end
            @error_handler = proc {|ex| $stderr.puts ex}

            # Nothing has been downloaded yet
            @content = nil
            @resp = nil
            @cookies = []
        end

        # Parse an url into [scheme, host, port, path, name]
        def self.parse_url(url, _scheme='http', _host=nil, _port=80, _path='/')
            # Returns [scheme, host, port, path] or, when the url carries a
            # fragment, [scheme, host, port, path, name].
            # For schemes other than http and ftp, [scheme, rest_of_url] is
            # returned. A http/ftp url without "//" cannot be parsed and
            # yields nil.
            # _scheme, _host, _port and _path describe the page the url
            # was found on and are used to resolve relative urls.
            if (url.split(/\//)[0] || "")[':']
                # scheme url
                scheme, schemepart = url.split(':', 2)
                return [scheme, schemepart] if !%w(http ftp).include?(scheme)
                # The host part ends at the first "/", "?" or "#", so that
                # "http://host#frag" and "http://host?q" do not end up with
                # the fragment or query glued to the host name.
                if url[/\/\/([^\/?#]*)(.*)/]
                    hostport, rest = $1, $2
                    host, port = hostport.split(/:/, 2)
                    # "host:" (empty port) means the default port
                    port = (port && port != "") ? port.to_i : 80
                    # Always produce a path that starts with "/"
                    rest = "/" + rest unless rest[0] == ?/
                    return [scheme, host, port, *(rest.split('#'))]
                end
                return nil
            elsif url[0] == ?/
                # absolute url
                return [_scheme, _host, _port, *(url.split '#')]
            elsif url[0] == ?#
                # local url
                return [_scheme, _host, _port, _path, url[1..-1]]
            elsif url[0] == ??
                # relative url to the name of current file
                path = (_path.split(/\?/)[0] || '') + url
                return [_scheme, _host, _port, *(path.split '#')]
            else
                # relative url: resolve against the directory of the base
                # path. The query string is dropped first, since it may
                # contain slashes that are not directory separators.
                base = _path.split(/\?/)[0] || ''
                _dirs = File.dirname(base).split(/\//)
                while url[0..2] == '../' do
                    _dirs.pop
                    url = url[3..-1]
                end
                path = _dirs.join('/') + '/' + url
                return [_scheme, _host, _port, *(path.split '#')]
            end
        end

        ## Creates a new page from a URL. The remaining arguments are as for
        ## @link new.
        ##
        ## Raises @link UnhandledSchemeError if the link is a scheme
        ## not handled by this program (such as mailto).
        ##
        ##      p = Page.url('http://www.acc.umu.se/')
        def Page.url(url, proxy_host = nil, proxy_port = 8080,
            tag=nil, attr={}, &progress_monitor)
            # Take the url apart; anything that is not plain http is refused.
            scheme, host, port, path, name = parse_url(url)
            raise UnhandledSchemeError.new(scheme, url) unless scheme == 'http'

            new(host, path, port, proxy_host, proxy_port, tag, attr, name,
                &progress_monitor)
        end

        ## Creates a new page from a link URL on the current page. The URL
        ## can be either an absolute or a relative URL. The @link progress_monitor=
        ## and @link error_handler= of the current page are inherited by the
        ## new page. @var tag and @var attr sets the @link tag and @link attr
        ## attributes of the new page.
        ##
        ## If the link is not a http-link, @link UnhandledSchemeError is
        ## raised.
        ##
        ##      page = current_page.link('../index.html')
        def link(url, tag=nil, attr={})
            # Resolve the url against this page. true_path may download
            # this page in order to learn its real path.
            scheme, host, port, the_path, name =
                self.class.parse_url(url, 'http', @host, @port, true_path)
            raise UnhandledSchemeError.new(scheme, url) unless scheme == 'http'

            # The new page shares proxy settings and progress monitor ...
            child = self.class.new(host, the_path, port, @proxy_host,
                @proxy_port, tag, attr, name, &@progress_monitor)
            # ... as well as the error handler and a private copy of the
            # cookies collected so far.
            child.error_handler = @error_handler
            child.cookies = @cookies.dup
            child
        end

        ## Returns the url of this page.
        ## If @var with_name is true the @link name part is included in 
        ## the url.
        def url(with_name = nil)
            # The default port 80 is left out of the url.
            port_part = (@port == 80) ? "" : ":#{@port}"
            # The name is appended only on request and only if there is one.
            anchor = (with_name && @name) ? '#' + @name : ''
            "http://#{@host}" + port_part + @path + anchor
        end

        # Internal, parses a set-cookie header and adds it to the cookie
        # store.
        def set_cookie(text)
            # Only the part before the first ";" is stored; the rest of
            # the header is dropped.
            first_part = text.split(/;/).first
            @cookies << first_part
        end

        # Returns a cookie string containing all the set cookies
        def cookie_string
            # All stored cookies glued together with ";" in between
            stored = @cookies
            stored.join(';')
        end

        # Fetch page to local cache
        def fetch
            # Already downloaded: nothing to do.
            return if @content
            @progress_monitor.call(self) if @progress_monitor

            # Pick the download routine that matches the net/http API in
            # use. The respond_to? test keeps this working when Net::HTTP
            # does not provide is_version_1_2? at all.
            if Net::HTTP.respond_to?(:is_version_1_2?) && Net::HTTP.is_version_1_2?
                fetch_1_2
            else
                fetch_1_1
            end
        end

	def fetch_1_2
	    # Downloads the page with the version 1.2 net/http API, storing the
	    # response in @resp and its body in @content.
	    # Raises UnreachableDocumentError for any final status other
	    # than 200.
	    #
	    # The 1.times block exists only so that "redo" can restart the
	    # request after a redirect.
	    # NOTE(review): nothing limits the number of redirects followed,
	    # so two urls redirecting to each other would loop forever.
	    1.times {
		Net::HTTP.start(@host, @port, @proxy_host, @proxy_port) {|http|
		    @resp = http.get(@path, {'Cookie' => cookie_string})
		    @content = @resp.body

		    # A path ending in '/' names a directory. Complete it with
		    # the file name from the content-location header if the
		    # server sent one, otherwise assume "index.html".
		    if @path[-1] == ?/ && @resp['content-location']
			@path << File.basename(@resp['content-location'])
		    elsif @path[-1] == ?/
			@path << "index.html"
		    end
		}
		# Remember a cookie handed out by the server
		if @resp['set-cookie']
		    set_cookie(@resp['set-cookie'])
		end

		if (@resp.code.to_i==301 || @resp.code.to_i==302) && @resp['location']
		    # Redirect: resolve the new location relative to this
		    # page, take over its address and request again.
		    redir = link(@resp['location'])
		    @host, @port, @path = redir.host, redir.port, redir.path
		    redo
		elsif @resp.code.to_i!=200
		    raise UnreachableDocumentError.new(url, resp)
		end
	    }
	end
	private :fetch_1_2

	def fetch_1_1
	    # Downloads the page when Net::HTTP does not report version 1.2
	    # (see fetch). Here the response arrives either as the result of
	    # get or inside a Net::ProtoRetriableError.
	    # NOTE(review): presumably the old API returns a [response, body]
	    # pair from get and raises ProtoRetriableError for redirects --
	    # confirm against the net/http version this is meant for.
	    # NOTE(review): nothing limits the number of "retry" rounds.
	    begin
		Net::HTTP.start(@host, @port, @proxy_host, @proxy_port) {|http|
		    # @body is assigned here but the content is taken from
		    # @resp.body on the next line.
		    @resp, @body = http.get(@path, {'Cookie' => cookie_string})
		    @content = @resp.body

		    # A path ending in '/' names a directory. Complete it with
		    # the file name from the content-location header if the
		    # server sent one, otherwise assume "index.html".
		    if @path[-1] == ?/ && @resp['content-location']
			@path << File.basename(@resp['content-location'])
		    elsif @path[-1] == ?/
			@path << "index.html"
		    end
		}
	    rescue Net::ProtoRetriableError => err
                # The response is read through err.response, falling back
                # to err.data if asking for response raises a NameError.
                begin
                    @resp = err.response
                rescue NameError
                    @resp = err.data
                end
		if @resp['set-cookie']
		    set_cookie(@resp['set-cookie'])
		end
		if @resp['location']
		    # Redirect: take over the address of the new location and
		    # run the begin block again.
		    redir = link(@resp['location'])
		    @host, @port, @path = redir.host, redir.port, redir.path
		    retry
		else
		    raise UnreachableDocumentError.new(url, resp)
		end
	    end
	    # Successful download: remember a cookie handed out by the server
	    if @resp['set-cookie']
		set_cookie(@resp['set-cookie'])
	    end
	end
	private :fetch_1_1

        ## Returns the content of the document. If the document has not been
        ## downloaded yet, a download is initiated.
        def content
            # Serve from the local cache when possible, download otherwise.
            return @content if @content
            fetch
            @content
        end

        ## Returns http response header retrieved when the document was fetched.
        ## If the document has not been fetched yet a download is initiated.
        def resp
            # The response is known only after a download.
            return @resp if @resp
            fetch
            @resp
        end

        ## Returns @tt true if the current page is an HTML page.
        ##
        ## WebFetcher tries to determine the type of the page by looking
        ## at the link (to see if it ends in @tt ".html", etc). If the result
        ## of this is inconclusive, this method will fetch the page to
        ## get a @tt content-type header.
        def html?
            # Pages derived from an image tag are never HTML.
            return false if @tag == 'img'
            # Cheap test first: the file name extension (to_s guards
            # against a path without any component).
            ext = path().split(/\./)[-1].to_s.downcase
            return false if %w(gif jpeg jpg bmp png).include?(ext)
            return true if %w(htm html shtm shtml).include?(ext)
            # Inconclusive: ask the server (this may trigger a download).
            # Only the media type counts; parameters such as
            # "; charset=ISO-8859-1" and letter case are ignored, and a
            # missing header means "not HTML".
            ctype = resp['content-type'].to_s
            ctype.split(';')[0].to_s.strip.downcase == 'text/html'
        end

        ## Returns @tt true if this page is an image.
        ##
        ## This method looks at the extension, the @link tag attribute
        ## and (if the document has been fetched) the @tt content-type. Unlike
        ## @link html?, it never initiates a download.
        def image?
            if @resp
                # Fetched: trust the content-type. A response without that
                # header is treated as "not an image" instead of failing.
                @resp['content-type'].to_s =~ /\Aimage/i ? true : false
            else
                # Not fetched: judge from the tag and the extension only,
                # so that no download is started.
                return true if @tag == 'img'
                ext = path().split(/\./)[-1].to_s.downcase
                %w(gif jpeg jpg bmp png tif tiff eps).include?(ext)
            end
        end

        ## Returns the true path of the document.
        ##
        ## Sometimes the path of a document cannot be determined from its
        ## URL. If the URL points to a directory, the document could
        ## be a document in that directory named @tt "index.html", @tt "index.htm",
        ## @tt "index.php" or something else.
        ##
        ## The only way of determining the true path of the document is to
        ## download it and check the @tt content-location header. The @link
        ## path method does not do that, since it is a potentially costly
        ## operation.
        ##
        ## Sometimes, however, you need to know the true path of a document.
        ## This method checks the URL and if it looks like it points to
        ## a directory, it downloads the document to get a @tt content-location
        ## header. It then returns the true path.
        ## (Note that some webservers do not send these headers even
        ## though they ought to, so this method is not guaranteed to return
        ## a correct result.)
        ##
        ## After you have called @tt true_path once, the result is cached, so
        ## subsequent calls to @link path and @link url will use the true
        ## path.
        def true_path
            # The path is taken at face value when the page has been
            # fetched already, when its base name contains a "." or when
            # the path contains a "?". Otherwise the page is fetched,
            # which may complete the path.
            settled = @resp || File.basename(@path)['.'] || @path['?']
            fetch unless settled
            @path
        end

        ## Returns the directory part of @link path.
        ##
        ##      Page.url('http://a.a.a/x/x.html').dirname   >> '/x'
        def dirname
            # File.split yields [directory, file name]; keep the directory.
            directory, _file = File.split(@path)
            directory
        end

        ## Returns the file name part of @link path.
        ##
        ##      Page.url('http://a.a.a/x/x.html').basename   >> 'x.html'
        def basename
            # File.split yields [directory, file name]; keep the file name.
            _directory, file = File.split(@path)
            file
        end
        
        ## Returns a suitable file name extension for storing this object.
        ##
        ## For fetched documents, the mime-type is used, otherwise the
        ## extension is extracted from the URL.
        ## If the URL looks like a link to a CGI-script the document is
        ## fetched to determine the mime-type.
        ##
        ##      Page.url('http://a.a.a/x.gif').ext   >> 'gif'
        def ext
            # Paths with a query or ending in .cgi/.asp say nothing about
            # the type of the result, so those pages are fetched first.
            scripted = @path['?'] || @path[/\.cgi$/] || @path[/\.asp$/]
            fetch if scripted

            # Fetched documents: go by the mime type, parameters stripped.
            if @resp
                mime = resp['content-type']
                mime = mime.split(';')[0] if mime
                known = MIME[mime]
                return known if known
            end

            # Otherwise use the last dot-separated piece of the file name,
            # provided there is one and it is shorter than 6 characters.
            pieces = basename.split(/\./)
            return pieces[-1] if pieces.size > 1 && pieces[-1].size < 6
            'html'
        end

        ## Returns a set of pages extracted from the links on this page.
        ##
        ## @var options determines how the links are extracted. There are
        ## two types of options, options specifying which types of links
        ## should be extracted (anchors, images, etc), and options that
        ## specify where the documents we are interested in are located.
        ##
        ## A type option is either the special name @tt :all_types which
        ## specifies that all types of links should be extracted or the
        ## name of a tag (a @tt String) which links should be extracted. You
        ## can specify as many type options as you want. Tags currently
        ## supported by this module are: @tt a, @tt img, @tt layer, 
        ## @tt bgsound, @tt area, @tt embed, @tt body, @tt frame, @tt script,
        ## @tt applet and @tt link.
        ##
        ## The location option can be @tt :external, @tt :server, @tt :subdir,
        ## @tt :dir, @tt :all_locations or any combination of these.
        ## @tt :external denotes links to other servers.
        ## @tt :server links anywhere on the current server
        ## @tt :subdir links to the current directory and all its subdirectories.
        ## @tt :dir    links to the current directory only.
        ## @tt :all_locations specify that all locations should be extracted.
        ##
        ## If you include a Page in the @var options, all location arguments,
        ## such as :external, :server, etc, will be relative to that page
        ## instead of the current page.
        ##
        ## The special keyword @tt :all is the same as specifying @tt
        ## :all_locations and @tt :all_types.
        ##
        ## If you do not specify a type, @tt :all_types is assumed. If you do
        ## not specify a location @tt :server is assumed.
        ##
        ## The extracted links are returned as a @link PageCollection
        ##
        ##      images = page.extract('img', :subdir)
        def extract(*options)
            # Only HTML pages have links; everything else yields an empty
            # collection.
            return PageCollection.new(&@error_handler) unless html?
            # Fill in the defaults: all tag types, links on this server.
            options << :all_types if (options & TYPES).size == 0
            options << :server    if (options & LOCATIONS).size == 0
            # A Page among the options becomes the reference for the
            # location tests (the last one wins); default is this page.
            ref_page = nil
            options.each {|x| ref_page = x if x.kind_of?(Page)}
            ref_page = self unless ref_page

            # extract links of right type
            queue = []
            TagParser.each(content) {|tag, attr|
                next unless URLATTR[tag]
                next unless (options & [:all, :all_types, tag]).size > 0
                url = attr[URLATTR[tag]]
                # Skip tags without a url and links within the same page
                next if !url || url[0] == ?#

                urls = [url]
                urls = find_script_links(url) if url[/\Ajavascript:/]
                urls.each {|u|
                    # Errors are reported to the error handler so that one
                    # bad link does not abort the extraction. Unhandled
                    # schemes are skipped silently, Interrupt and
                    # SystemExit are passed on.
                    # NOTE(review): TimeoutError is listed before
                    # Interrupt, presumably because it may be a subclass of
                    # Interrupt in the targeted Ruby version -- confirm.
                    begin
                        queue << link(u, tag, attr)
                    rescue TimeoutError
                        @error_handler.call($!)
                    rescue Interrupt, SystemExit
                        raise
                    rescue UnhandledSchemeError
                    rescue Exception
                        @error_handler.call($!)
                    end
                }
            }

            # Add links at right location
            pc = PageCollection.new(&@error_handler)
            queue.each {|p|
                # Collect every location keyword that applies to p, then
                # keep p if the caller asked for at least one of them.
                loc = [:all, :all_locations]
                if p.host != ref_page.host
                    loc << :external
                else
                    loc << :server
                    loc << :dir if ref_page.dirname == p.dirname
                    # :subdir is a prefix test on the path string.
                    loc << :subdir if \
                        p.path[/\A#{Regexp.escape(ref_page.dirname)}/]
                end
                pc << p if (loc & options).size > 0
            }
            pc.uniq!
            pc
        end

        # Some pesky sites put links in javascript, horrible practice.
        # We try to extract something that looks like a link from the script.
        # (I'm not sure about the success rate of this method.)
        def find_script_links(href)
            link_exts = %w(html htm shtml cgi gif jpg jpeg png)
            # Every quoted string in the script is a candidate ...
            candidates = href.scan(/['"]([^'"]*)['"]/).map {|captures| captures[0]}
            # ... and is kept if it starts like a http url or an absolute
            # path, or ends in an extension we know.
            candidates.select {|cand|
                cand[/^http:/] || cand[0] == ?/ ||
                    link_exts.include?(cand.split(/\./)[-1])
            }
        end
        private :find_script_links

        ## Returns a @link PageCollection with all the
        ## documents linked from this page.
        ##
        ## This is equivalent to calling
        ##  @ttline extract('a', 'area', :all_locations)
        def links
            # Anchors and image map areas, wherever they point
            wanted = ['a', 'area', :all_locations]
            extract(*wanted)
        end

        ## Returns a @link PageCollection with all the images
        ## on this page.
        ##
        ## This is equivalent to calling
        ## @ttline extract('img', :all_locations)
        def images
            # Image tags, wherever they point
            wanted = ['img', :all_locations]
            extract(*wanted)
        end

        ## Returns a @link PageCollection consisting of
        ## this page and all the documents directly visible on it: frames,
        ## stylesheets, images, etc.
        def rich_page
            # Everything shown directly on this page, then the page itself
            visible = %w(img bgsound embed body applet link)
            pc = extract(:all_locations, *visible)
            pc << self

            # Layers and frames are pages of their own: add them together
            # with everything visible on them.
            extract(:all_locations, 'layer', 'frame').rich_page.each do |sub|
                pc << sub
            end

            pc.uniq!
            pc
        end

        ## Returns a @link PageCollection consisting of this page
        ## and all the pages reachable from it by recursively following the links.
        ##
        ## @var level specifies how deep the recursion should nest. 
        ## If @var level is 0 only this page is returned. If @var level
        ## is 1, this page and all links on it are returned, and so on.
        ## If you specify @tt nil for recursion, it will visit every
        ## linked page. You should never use @tt nil and the @tt :external
        ## option simultaneously. If you do, the program will most likely
        ## try to download the entire web.
        ##
        ## @var options are the same as for @link extract, but the default
        ## options are different. The default type option is @tt :all_types
        ## and the default location option is @tt :subdir. The current
        ## page is used as reference page for all location arguments.
        ##
        ## If you call the method with a block, progress updates will be
        ## sent to the block. @var page is the page that the recursion
        ## currently is looking at. @var i is the index of that page, it
        ## starts at 0 and increases as the recursion progresses. 
        ## @var total is the total number of documents in the recursion
        ## queue. When @var i reaches @var total the recursion will stop,
        ## but note that @var total will increase as the recursion 
        ## progresses.
        ##
        ##      pc = page.recurse(10) {|p,i,t| puts "#{p} (#{i} of #{t})"}
        # def recurse(level = nil, *options) {|page, i, total| ...}
        def recurse(level = nil, *options, &progress)
            # Fill in the defaults, then put this page first in the
            # options so that extract uses it as reference page for all
            # location tests.
            options << :all_types if (options & TYPES).size == 0
            options << :subdir if (options & LOCATIONS).size == 0
            options.unshift(self)

            pc = PageCollection.new(&@error_handler)
            # Work queue of [page, remaining depth]; a depth of nil means
            # unlimited.
            queue = [[self, level]]
            seen = {}                # pages already added to pc
            queued = {self => true}  # pages that have ever been queued
            i = 0
            while queue.size > 0
                page, level = queue.shift
                # Estimate of the total: pages handled so far plus queued
                # pages that may still be expanded.
                total = i + (queue.find_all {|p,l| !l || l>0}.size)
                progress.call(page, i, total) if progress
                begin
                    # Check before and after true path to speed up.
                    # (true_path may download the page and change its
                    # path.) Note that "next" also skips the increment of
                    # i at the end of the loop body.
                    next if seen[page]
                    page.true_path
                    next if seen[page]
                    pc << page
                    seen[page] = true
                    # Follow the links unless the depth is used up
                    if !level || level > 0
                        page.extract(*options).each {|link|
                            next if queued[link]
                            queue << [link, level ? level-1 : nil]
                            queued[link] = true
                        }
                    end
                # Errors are reported to the error handler so that one bad
                # page does not abort the recursion. Interrupt and
                # SystemExit are passed on.
                # NOTE(review): TimeoutError is listed before Interrupt,
                # presumably because it may be a subclass of Interrupt in
                # the targeted Ruby version -- confirm.
		rescue TimeoutError
		    @error_handler.call($!)
                rescue Interrupt, SystemExit
                    raise
                rescue UnhandledSchemeError
                rescue Exception
                    @error_handler.call($!)
                end
                i += 1
            end
            pc.uniq!
            pc
        end

        # Return a full path suitable for storing this page in a tree
        # structure, with one directory for the host, etc
        def tree_path
            # Give the page the chance to complete a directory path first
            true_path
            file = basename
            file = 'index.html' if file == ''
            # host/directory-without-leading-slash/file
            local = File.join(@host, dirname[1..-1], file)
            # Sanity check
            raise "Suspicious path #{local}" if local[0] == ?/ || local['..']
            local
        end

        ## Saves this document to the specified @var path.
        def save(path)
            # Create the target directory, then write the content in
            # binary mode.
            target_dir = File.dirname(path)
            File.makedirs(target_dir)
            File.open(path, 'wb') do |out|
                out.write(content)
            end
        end

        ## Saves this document to the specified @var path, using 
        ## @var urlmap to translate the URLs in the document to
        ## local links. The @var urlmap should be a hash associating URLs 
        ## with the paths where they are stored locally.
        ##
        ## If @var absolutize is true, all the relative links in the
        ## file that point to documents not in @var urlmap are converted
        ## to absolute links. This ensures that all links work as previously,
        ## even though the file is stored locally.
        ##
        ## You can use this method to convert the links on the
        ## documents you are downloading so that they point to the
        ## downloaded documents on your hard drive instead of to
        ## external servers. But it is probably easier to use
        ## @link PageCollection#save.
        def save_translated(path, urlmap = {}, absolutize = true)
            # Rewrite the links before anything touches the disk
            rewritten = translate(content, path, urlmap, absolutize)
            target_dir = File.dirname(path)
            File.makedirs(target_dir)
            File.open(path, 'wb') do |out|
                out.write(rewritten)
            end
        end
        
        ## Returns the page content as a string with translated URLs. 
        ## Arguments are the same as for save_translated.
        def content_translated(urlmap = {}, absolutize = true)
            # NOTE(review): `path` is not a parameter here, so it resolves to
            # this object's own `path` method, whereas save_translated passes
            # the local save path -- confirm that this is intended.
            translate(content, path, urlmap, absolutize)
        end

        # translates the links in a page
        def translate(content, path, urlmap = {}, absolutize = true)
            TagParser.translate(content) do |tag, attr|
                # Which attribute (if any) of this tag holds a URL?
                urlattr = URLATTR[tag]
                href = urlattr && attr[urlattr]
                url = href && translate_href(href, path, urlmap, absolutize)
                # Hand back the modified attributes, or nil when there is
                # nothing to change for this tag.
                if url
                    attr[urlattr] = url
                    attr
                end
            end
        end

        # translate a single href attribute, returns nil to leave 
        # unchanged
        def translate_href(href, path, urlmap = {}, absolutize = true)
            begin
                page = link(href)
                # An href containing ':' is never absolutized. Use the
                # short-circuit && (not &) so that any truthy value of
                # absolutize works, not just true/false/nil.
                translate_page_ref(page, path, urlmap,
                    absolutize && !href[':'])
            rescue TimeoutError
                # Listed before Interrupt so that timeouts are reported
                # rather than re-raised.
                @error_handler.call($!)
                return nil
            rescue Interrupt, SystemExit
                raise
            rescue UnhandledSchemeError
                # Links with unhandled schemes are silently left unchanged
                return nil
            rescue Exception
                @error_handler.call($!)
                return nil
            end
        end

        # Translate a link to a Page object
        def translate_page_ref(page, path, urlmap = {}, absolutize = true)
            base = page.url
            # Look the URL up as given, then with each of a few suffixes
            # appended, and link to the first one found in urlmap.
            ['', '/', '/index.html', '/index.htm', '/index.php'].each do |suffix|
                local = urlmap[base + suffix]
                next unless local
                rel = self.class.relative_path(path, local)
                # Keep the anchor, if the link had one
                return rel + (page.name ? '#' + page.name : '')
            end
            # Not in urlmap: either fall back to page.url(true) or signal
            # "leave unchanged" with nil.
            return nil unless absolutize
            page.url(true)
        end

        # Finds a relative path leading from the file @var from to the file @var to
        def Page.relative_path(from, to)
            return '' if from == to
            src_parts = from.split(File::SEPARATOR)
            dst_parts = to.split(File::SEPARATOR)
            # Strip the leading components the two paths share, but always
            # leave at least the last component of each.
            until src_parts.size <= 1 || dst_parts.size <= 1 ||
                    src_parts[0] != dst_parts[0]
                src_parts.shift
                dst_parts.shift
            end
            # One '..' for every directory still left on the source side
            climb = ['..'] * (src_parts.size - 1)
            File.join(climb + dst_parts)
        end

        ## Returns a string representation of this document.
        ## Equivalent to @link url.
        def to_s
            # Delegates to url, so the string form of a page is its URL.
            url
        end

        ## Two pages are considered equal if they have the same URL.
        ## (This definition is also used for hashing.)
        def ==(o)
            # An object that cannot report a URL (nil, a String, ...) is never
            # equal to a page; comparing against one must not raise.
            o.respond_to?(:url) && url == o.url
        end

        def eql?(o)
            # Stricter than ==: the other object must be of the very same class
            return false unless self.class == o.class
            url.eql?(o.url)
        end
        # Hash on the URL, so that pages with equal URLs get equal hash values.
        def hash; url.hash; end

        ## Returns the largest image on this page. This method does not
        ## actually download the images to check their sizes, it only
        ## looks at the height and width attributes. It returns @tt nil if there
        ## are no images with specified height and width.
        def largest_image
            winner = nil
            winner_area = 0
            images.each do |img|
                # A missing attribute converts to 0, giving an area of 0
                area = img.attr['height'].to_i * img.attr['width'].to_i
                # Strictly greater: on a tie the earlier image is kept
                next unless area > winner_area
                winner, winner_area = img, area
            end
            winner
        end
    end ## Page
  
    ## This class is used to represent a collection of pages. It is basically
    ## an @tt Array of pages with some added convenience methods.
    ##
    ## There is currently some debate about what you should do when you create
    ## a subclass of @tt Array. Should you reimplement all or some of the methods
    ## in @tt Array, so that they return your new subtype instead of an @tt Array.
    ## See <a href="http://www.rubygarden.com/article.php?sid=104">RCR #38</a>
    ## on RubyGarden.
    ##
    ## I do no such reimplementations in this class. I count that Matz will
    ## make the right decision in which methods to change and which not to
    ## and think it is best to follow whatever standard he sets, unless there are
    ## special reasons not to.
    ##
    ## If you want to do array operations, you must convert the result back
    ## to a PageCollection manually.
    ##
    ##     pages = PageCollection.new(page1.links + page2.images)
    class PageCollection < Array
        ## The error handler works as @link Page#error_handler=.
        attr_writer :error_handler

        ## Creates a new page collection. @var pages are the pages in the
        ## collection. If a block is given it is used as @link error_handler=.
        ## Without a block, errors are printed to @tt $stderr.
        def initialize(pages = [], &error_handler)
            super(0)
            pages.each {|p| self << p}
            @error_handler = error_handler || proc {|ex| $stderr.puts ex}
        end

        ## Creates a page collection from an array of urls.
        ## @var proxy and @var proxy_port specify the proxy to use if any.
        def PageCollection.urls(urls, proxy = nil, proxy_port = 8080)
            PageCollection.new(urls.collect{|x|
                Page.url(x, proxy, proxy_port)})
        end
        
        ## Creates a new page collection by calling @link Page#extract on 
        ## each document in this collection.
        # def extract(*options)

        ## Creates a new page collection by calling @link Page#rich_page on 
        ## each document in this collection.
        # def rich_page

        ## Creates a new page collection by calling @link Page#images on 
        ## each document in this collection.
        # def images

        ## Creates a new page collection by calling @link Page#largest_image on 
        ## each document in this collection. Documents for which the call
        ## returns @tt nil contribute nothing to the new collection.
        # def largest_image

        ## Creates a new page collection by calling @link Page#links on 
        ## each document in this collection.
        # def links

        ## Creates a new page collection by calling @link Page#recurse on 
        ## each document in this collection.
        # def recurse(level=nil, *options)
        
        # Generate the delegating methods documented above. Each one calls
        # the method of the same name on every page, gathers the results in
        # a new collection that shares this collection's error handler, and
        # reports failures to the error handler instead of propagating them
        # (Interrupt and SystemExit are re-raised).
        %w(extract rich_page images largest_image links recurse).each {|m|
            module_eval <<-METHOD
            def #{m}(*args)
                pc = PageCollection.new
                pc.error_handler = @error_handler
                each {|page|
                    begin
                        res = page.#{m}(*args)
                        if res.kind_of?(Array)
                            res.each {|p| pc << p}
                        elsif !res.nil?
                            # a nil result is not a page, leave it out
                            pc << res
                        end
                    rescue TimeoutError
                        @error_handler.call($!)
                    rescue Interrupt, SystemExit
                        raise
                    rescue Exception
                        @error_handler.call($!)
                    end
                }
                pc
            end
            METHOD
        }
        
        # Returns the longest common save-path prefix, i.e. the longest
        # leading part, ending in a slash, shared by the tree paths of all
        # pages. Returns "" for an empty collection or when nothing is shared.
        def save_path_prefix
            return "" if size==0
            prefix = self[0].tree_path
            each {|f|
                path = f.tree_path
                i = 0
                i+=1 while prefix[i] && prefix[i]==path[i]
                prefix = prefix[0,i]
            }
            # Cut the prefix back to the last complete directory
            if prefix[-1] != ?/
                prefix = prefix[/.*\//]
            end
            return prefix || ""
        end
        private :save_path_prefix

        ## Saves the pages in this collection to the directory @var rootdir.
        ## @var options can be used to control how the pages should be
        ## saved.
        ##
        ## With the option @tt :translate (default), any internal links
        ## between the pages in the collection are translated to point to
        ## the file where the page is saved instead of the original URL.
        ## Relative links to pages not in the collection are converted to
        ## absolute links to ensure that they still work. This is usually
        ## what you want. All links will work as normal, but the pages
        ## in the collection will be cached on your hard drive allowing
        ## fast access whether a network connection is available or not.
        ## Use the option @tt :notranslate to save the pages just as they
        ## are without translating the links.
        ##
        ## With @tt :flat (default) all the files are saved directly under
        ## the specified directory. This is usually what you want. Note that
        ## this does not break the links unless you use @tt :notranslate.
        ## If you
        ## specify @tt :tree instead, the files are saved in a structure
        ## mirroring the layout on the server. 
        ##
        ## If you specify @tt :rename (default), the files are renamed if
        ## there already exists a file with that name in the directory.
        ## Note that this doesn't break the links as long as @tt :translate is
        ## on. With @tt :overwrite, files with the same name are overwritten.
        ## If there are several pages in the collection with the same name
        ## they will overwrite each other. If you use @tt :rename_all, all
        ## the pages in the collection are renamed before they are saved.
        ## This can be useful if the files have strange names.
        ##
        ## When files are renamed they are given the names @tt 1.html
        ## @tt 2.html, etc. The extension is adapted to the file type.
        ## Regardless of what rename setting is used, strange links, such as
        ## CGI script links are always renamed.
        ##
        ## If a block is given, it is called for each document
        ## in the collection as it is saved. @var path specifies the path where
        ## the document was saved, @var page, the page that was saved. 
        ## @var i is the index of the saved page and @var size the total number
        ## of pages.
        ##
        ## Errors are reported to the error handler. A page for which no
        ## save path could be determined is reported once and then skipped:
        ## it is not saved and the block is not called for it.
        ##
        ##      # Save flat, translated in current directory
        ##      pages.save('.') {|f,p,i,s| puts "#{f} (#{i} of #{s})"}
        ##      
        ##      # Save in a tree structure with no translation
        ##      pages.save('.', :tree, :overwrite, :notranslate)
        # def save(rootdir='.', *options) {|path, page, i, size| ...}
        def save(rootdir='.', *options, &progress)
            translate = !options.include?(:notranslate)
            tree = options.include?(:tree)
            rename = (options & [:overwrite, :rename, :rename_all])[0] || :rename

            # Call true_path on every page before removing duplicates
            # (presumably so that uniq! compares the pages' final URLs).
            each {|page| 
                begin
                    page.true_path
                rescue TimeoutError
                    @error_handler.call($!)
                rescue Interrupt, SystemExit
                    raise
                rescue Exception
                    @error_handler.call($!)
                end
            }
            uniq!

            prefix = save_path_prefix if tree

            # Find a save path for each file
            used_paths = {}
            page_to_path = {}
            url_to_path = {}
            counter = 1
            each {|page|
                begin
                    dir = nil
                    if tree
                        # Strip the common prefix so that the tree is
                        # rooted directly in rootdir
                        dir = File.dirname(page.tree_path)
                        if dir + "/" == prefix
                            dir = rootdir
                        else
                            dir[prefix] = ''
                            dir = File.join(rootdir, dir)
                        end
                    else
                        dir = rootdir
                    end

                    page.true_path
                    name = nil
                    # Query strings and script URLs make poor file names
                    rename_this = page.url['?'] || page.url[/\.cgi$/] \
                        || page.url[/\.asp$/]
                    if rename == :rename_all || rename_this
                        name = "#{counter}.#{page.ext}"
                        counter+=1
                    else
                        name = page.basename
                        name = "index.html" if name == ""
                    end
                    # Try numbered names until one is free, both on disk
                    # and among the paths handed out during this save
                    path = catch(:found) do
                        while 1
                            p = File.join(dir, name)
                            throw(:found, p) if rename == :overwrite && !rename_this
                            throw(:found, p) if !File.exist?(p) && !used_paths[p]
                            name = "#{counter}.#{page.ext}"
                            counter += 1
                        end
                    end
                    used_paths[path] = true
                    page_to_path[page] = path
                    url_to_path[page.url] = path
                rescue TimeoutError
                    @error_handler.call($!)
                rescue Interrupt, SystemExit
                    raise
                rescue Exception
                    @error_handler.call($!)
                end
            }

            # Save files
            each_with_index {|page, i|
                begin
                    path = page_to_path[page]
                    # No path means the page already failed (and was
                    # reported) above; do not announce or save it.
                    next unless path
                    progress.call(path, page, i, size) if progress
                    if translate
                        page.save_translated(path, url_to_path)
                    else
                        page.save(path)
                    end
                rescue TimeoutError
                    @error_handler.call($!)
                rescue Interrupt, SystemExit
                    raise
                rescue Exception
                    @error_handler.call($!)
                end
            }
        end
    end ## class PageCollection
end ## module WebFetcher

if __FILE__ == $0
    require "runit/testcase"
    require 'runit/cui/testrunner'
    require 'runit/testsuite'

    include WebFetcher

    # Unit tests for TagParser. All assertions are written as
    # assert_equal(expected, actual).
    class Testing_TagParser < RUNIT::TestCase
        # parse_tag returns [name, attributes]. The expectations below show
        # names in lower case, attributes without a value mapped to their
        # own name, and either kind of quote around a value.
        def test_parse_tag
            x = TagParser.parse_tag("a")
            assert_equal(['a', {}], x)
            x = TagParser.parse_tag("a BB")
            assert_equal(['a', {'bb'=>'bb'}], x)
            x = TagParser.parse_tag(" gg cc='\"'   \n DD = \"'\" ff")
            assert_equal(['gg', {'cc'=>'"', 'dd'=>"'", 'ff'=>'ff'}], x)
            x = TagParser.parse_tag("a href=\"/ads/\"")
            assert_equal(['a', {'href'=>'/ads/'}], x)
        end

        # each yields every tag, or only the tags with the given name
        def test_each
            arr = []; TagParser.each("<a> <b> <c>") {|x| arr << x}
            assert_equal([['a',{}], ['b',{}], ['c', {}]], arr)
            arr = []; TagParser.each("<A \n X>") {|x| arr << x}
            assert_equal([['a',{'x'=>'x'}]], arr)
            p = TagParser.new("<a> <b> <c> <B D=x>")
            arr = []; p.each('b') {|name, attr| arr << name}
            assert_equal(['b', 'b'], arr)
        end

        # translate rewrites a tag from the attributes the block returns and
        # leaves the tag alone when the block returns nil
        def test_translate
            assert_equal("<a to=ti>",
                TagParser.translate("<a>") {|x| {'to'=>'ti'}})
            assert_equal("<a>", TagParser.translate("<a>") {|x| nil})
            assert_equal("<a to='\"'>",
                TagParser.translate("<a>") {|x| {'to'=>'"'}})
            assert_equal("<a to=\"\\\"'\">",
                TagParser.translate("<a>") {|x| {'to'=>'"\''}})
            assert_equal('<a x="">',
                TagParser.translate('<a>') {|x| {'x' => ""}})
        end
    end

    # Unit tests for Page; none of them calls fetch.
    class Testing_Page < RUNIT::TestCase
        # Absolute URLs, URLs relative to a base, and fragment-only URLs
        def test_parse_url
            parts = Page.parse_url('http://www.test.com:1200/ho.html#down')
            assert_equal(['http', 'www.test.com', 1200, '/ho.html', 'down'], parts)
            parts = Page.parse_url('http://www.test.com')
            assert_equal(['http', 'www.test.com', 80, '/'], parts)
            parts = Page.parse_url('/absolute/path.html', 'http', 'a.a.a', 80,
                '/x/x.html')
            assert_equal(['http', 'a.a.a', 80, '/absolute/path.html'], parts)
            parts = Page.parse_url('rel/path.html', 'http', 'a.a.a', 80,
                '/x/x.html')
            assert_equal(['http', 'a.a.a', 80, '/x/rel/path.html'], parts)
            parts = Page.parse_url('mailto:niklas@kagi.com')
            assert_equal(['mailto', 'niklas@kagi.com'], parts)
            parts = Page.parse_url('/ads/', 'http', 'a.a.a', 80, '/x/x.html')
            assert_equal(['http', 'a.a.a', 80, '/ads/'], parts)
            parts = Page.parse_url('#name', 'http', 'a.a.a', 80, '/x/x.html')
            assert_equal(['http', 'a.a.a', 80, '/x/x.html', 'name'], parts)
        end

        # Page.url rejects unhandled schemes and keeps port and anchor
        def test_url
            assert_exception(UnhandledSchemeError) {
                Page.url('mailto:niklas@kagi.com')
            }
            assert_no_exception {Page.url('http://www.acc.umu.se/')}
            page = Page.url('http://a.a.a:1200/x/x.html')
            assert_equal('http://a.a.a:1200/x/x.html', page.url)
            page = Page.url('http://a.a.a:1200/x/x.html#test')
            assert_equal('test', page.name)
        end

        # Links are resolved against the linking page and inherit its proxy
        def test_link
            base = Page.url('http://a.a.a/x/x.html', 'proxy', 8080)
            target = base.link('y/y.html')
            assert_equal(['a.a.a', '/x/y/y.html', 'proxy', 8080],
                [target.host, target.path, target.proxy_host, target.proxy_port])
            target = base.link('/y/y.html')
            assert_equal('http://a.a.a/y/y.html', target.url)
            target = base.link('http://b.b.b/r.html')
            assert_equal('http://b.b.b/r.html', target.url)
            assert_exception(UnhandledSchemeError) {
                base.link('mailto:niklas@kagi.com')
            }
            target = base.link('a.gif', 'img', {'src' => 'a.gif'})
            assert_equal(['img', {'src' => 'a.gif'}], [target.tag, target.attr])
            target = base.link('/ads/')
            assert_equal('/ads/', target.path)
            target = base.link('http://b.b.b')
            assert_equal('/', target.path)
            target = base.link('#test')
            assert_equal('test', target.name)
            assert_equal(base.url, target.url)
        end

        def test_html
            html_page = Page.url('http://www.google.com/index.html')
            assert(html_page.html?)
            gif_page = Page.url('http://www.google.com/images/logo.gif')
            assert(!gif_page.html?)
        end

        def test_image?
            gif_page = Page.url('http://www.google.com/images/logo.gif')
            assert(gif_page.image?)
        end

        def test_dirname
            page = Page.url('http://aa.aa.aa/x/x.html')
            assert_equal('/x', page.dirname)
            assert_equal('x.html', page.basename)
        end

        def test_tree_path
            page = Page.url('http://aa.aa.aa/x/x.html')
            assert_equal('aa.aa.aa/x/x.html', page.tree_path)
        end

        # Sideways, downwards, upwards and to the file itself
        def test_relative_path
            deep    = File.join('a', 'b', 'c.html')
            sibling = File.join('a', 'd', 'b.html')
            shallow = File.join('a', 'b.html')
            assert_equal('../d/b.html', Page.relative_path(deep, sibling))
            assert_equal('b/c.html', Page.relative_path(shallow, deep))
            assert_equal('../b.html', Page.relative_path(deep, shallow))
            assert_equal('', Page.relative_path(shallow, shallow))
        end

        # translate_href with and without absolutizing, a url map and anchors
        def test_translate
            page = Page.url('http://a.a.a/x.html')
            href = page.translate_href('y.html', '/x', {}, nil)
            assert_equal(nil, href)
            href = page.translate_href('y.html', '/x')
            assert_equal('http://a.a.a/y.html', href)
            href = page.translate_href('y.html', '/x',
                {'http://a.a.a/y.html' => '/y'})
            assert_equal('y', href)
            href = page.translate_href('y.html', '/a/b.html',
                {'http://a.a.a/y.html' => '/c/d.html'})
            assert_equal('../c/d.html', href)
            href = page.translate_href('#ho', '/a/b.html',
                {'http://a.a.a/x.html' => '/a/b.html'})
            assert_equal('#ho', href)
            href = page.translate_href('y.html#ho', '/x',
                {'http://a.a.a/y.html' => '/y'})
            assert_equal('y#ho', href)
        end
    end

    # Tests that require a network connection.
    # These thingamajigs are a bit hard to test.
    class Testing_Network < RUNIT::TestCase
        # With the error handler removed, fetch succeeds for an existing
        # host and raises SocketError for a host that does not resolve.
        def test_fetch
            reachable = Page.url('http://www.google.com')
            reachable.error_handler = nil
            assert_no_exception {reachable.fetch}
            missing = Page.url('http://this.webpage.should.not.exist')
            missing.error_handler = nil
            assert_exception(SocketError) {missing.fetch}
        end

        # No web page is really stable, so this is tricky: fetch the
        # ruby-lang page and check that it contains the word Ruby. If it
        # does not, something is probably wrong.
        def test_content
            page = Page.url('http://www.ruby-lang.org/en/index.html')
            assert(page.content['Ruby'], "Checking for Ruby on ruby web page")
        end

        def test_html
            front = Page.url('http://www.google.com/')
            assert(front.html?)
        end

        # true_path returns the resolved path and also updates path
        def test_true_path
            page = Page.url('http://www.acc.umu.se/')
            assert_equal('/', page.path)
            assert_equal('/index.html.en', page.true_path)
            assert_equal('/index.html.en', page.path)
        end

        def test_extract
            english = Page.url('http://www.ruby-lang.org/en/')
            japanese = 'http://www.ruby-lang.org/ja'
            # Test that Ruby English links to Ruby Japanese
            found = english.extract
            assert(found.include?(Page.url(japanese)))
            # But not if we only want links in current subdirectory
            found = english.extract(:subdir)
            assert(!found.include?(Page.url(japanese)))
        end
    end
            
    RUNIT::CUI::TestRunner.run(Testing_TagParser.suite)
    RUNIT::CUI::TestRunner.run(Testing_Page.suite)
    #RUNIT::CUI::TestRunner.run(Testing_Network.suite)
end

# vim: set sw=4 :
