A Web Crawler in Perl

#! /usr/bin/perl
#
# spider.pl   Set tabstops to 3.
#

$| = 1;
# 0=no debug, 1=display progress, 2=complete dump
$DEBUG = 0;
# Check hyperlinks to other hosts?
$SPANHOSTS = "off";

if(scalar(@ARGV) < 2){
print "Usage: $0 <fully-qualified-URL> <search-phrase>\n";
exit 1;
}

# Initialize.
%URLqueue = ();
chop($client_host=`hostname`);
$been = 0;
$search_phrase = $ARGV[1];

# Load the queue with the first URL to hit.
$URLqueue{$ARGV[0]} = 0;
$thisURL = &find_new(%URLqueue);

# While there's a URL in our queue which we haven't looked at ...
while($thisURL ne ""){

# Progress report.
$count = 0;
while(($key,$value) = each(%URLqueue)){
   $count ++;
}
print "-----------------------------------------\n" if($DEBUG>=1);
printf("Been: %d  To Go: %d\n", $been, $count-$been)
if($DEBUG>=1);
print "Current URL: $thisURL\n" if($DEBUG>=1);
&dump_stack() if($DEBUG>=2);

# Split the protocol from the URL.
($protocol, $rest) = $thisURL =~ m|^([^:/]*):(.*)$|;

# If the protocol is http, fetch the page and process it.
if($protocol eq "http"){

   # Split out the hostname, port and document.
   ($server_host, $port, $document) =
      $rest =~ m|^//([^:/]*):*([0-9]*)/*([^:]*)$|;

   # Get the page of text and remove CR/LF characters and HTML
   # comments from it.
   $page_text = &get_http($client_host, $server_host, $port,
      $document);
   $page_text =~ tr/\r\n//d;
   $page_text =~ s|<!--[^>]*-->||g;

   # Report if our search string is found here.
   if($page_text =~ m|$search_phrase|i){
      print "$thisURL\n"
   }

   # Find anchors in the HTML and update our list of URLs..
   (@anchors) = $page_text =~ m|<A[^>]*HREF\s*=\s*"([^
">]*)"|gi;
   foreach $anchor (@anchors){
      $newURL = &fqURL($thisURL, $anchor);
      if($URLqueue{$newURL} > 0){

         # Increment the count for URLs we've already
         # checked out.
         $URLqueue{$newURL}++;

      }else{

         # Add a zero record for URLs we haven't
         # encountered.
         # Optionally, ignore URL's which point to other
         # hosts.
         ($new_host) =
            $newURL =~ m|^[^:/]*:/*([^/:]*):*[0-9]*/*[^:]*$|;
         if($SPANHOSTS eq "on" || $new_host eq
            $server_host){
            $URLqueue{$newURL}=0;
         }
      }
   }
}else{
   print "Protocol '$protocol' ignored.\n" if($DEBUG>=1);
}

# Record the fact that we've been here, and get a new URL to process.
$URLqueue{$thisURL} ++;
$been ++;
$thisURL = &find_new(%URLqueue);

}
exit;

#--------------------------------------------------------------
# Build a fully specified URL.
#--------------------------------------------------------------
sub fqURL
{
local($thisURL, $anchor) = @_;
local($has_proto, $has_lead_slash, $currprot, $currhost, $newURL);

# Strip anything following a number sign '#', because its
# just a reference to a position within a page.
$anchor =~ s|^.*#[^#]*$|$1|;

# Examine anchor to see what parts of the URL are specified.
$has_proto = 0;
$has_lead_slash=0;
$has_proto = 1 if($anchor =~ m|^[^/:]+:|);
$has_lead_slash = 1 if ($anchor =~ m|^/|);

if($has_proto == 1){

   # If protocol specified, assume anchor is fully qualified.
   $newURL = $anchor;

}
elsif($has_lead_slash == 1){

   # If document has a leading slash, it just needs protocol and host.
   ($currprot, $currhost) = $thisURL =~ m|^([^:/]*):/+([^:/]*)|;
   $newURL = $currprot . "://" . $currhost . $anchor;

}
else{

   # Anchor must be just relative pathname, so append it to current URL.
   ($newURL) = $thisURL =~ m|^(.*)/[^/]*$|;
   $newURL .= "/" if (! ($newURL =~ m|/$|));
   $newURL .= $anchor;

}
if($DEBUG >=2){
   print "Link Found\n   In:$thisURL\n   Anchor:$anchor\n   Result: $newURL\n"
}
return $newURL;
}

#---------------------------------------------------------------
# Do a linear search of the URL stack to find a URL with a data
# value of 0 (i.e. one we haven't checked out yet).
#---------------------------------------------------------------
sub find_new
{
local(%URLqueue) = @_;
local($key, $value);

while(($key, $value) = each(%URLqueue)){
   return $key if($value == 0);
}
return "";
}

#-------------------------------------------------------------------
# Debugging utility.
#-------------------------------------------------------------------
sub dump_stack
{
local($key, $x);
local($done, $togo) = ("", "");

foreach $key (keys(%URLqueue)){
   if($URLqueue{$key} == 0){
      $togo .= "  " . $key . "\n";
   }else{
      $done .= "  " . $key . " (hitcount = "
          . $URLqueue{$key} . ")\n";
   }
}

print "Been There:\n" . $done;
print "To Go:\n" . $togo;
print "------- Hit Q to Quit, Enter to Continue -------\n";
read(STDIN, $key, 1);
exit(1) if($key eq 'Q' || $key eq 'q');
}

#-------------------------------------------------------------------------
# Get the page indicated by the $server_host and $document parameters.
#-------------------------------------------------------------------------
sub get_http
{
local($client_host, $server_host, $port, $document) = @_;
local($name,$aliases,$type,$len);
local($this,$thisaddr,$that,$thataddr);
local($client_host, $sockaddr, $a,$b,$c,$d);
local($page, $header, $header_text, $content);

# Some constants used to access the TCP network.
$AF_INET=2;
$SOCK_STREAM=1;

# Use default http port if none specified.
$port = 80 if($port == 0);

# Get the protocol number for TCP.
($name,$aliases,$proto)=getprotobyname("tcp");

# Get the IP addresses for the two hosts.
($name,$aliases,$type,$len,$thisaddr) = gethostbyname($client_host);
($name,$aliases,$type,$len,$thataddr) = gethostbyname($server_host);

# Check we could resolve the server host name.
($a,$b,$c,$d) = unpack('C4', $thataddr);
if($a eq "" && $b eq "" && $c eq  "" && $d eq ""){
   print "ERROR: Unknown host $server_host.\n";
   return "";
}
print "Server: $server_host ($a.$b.$c.$d)\n" if($DEBUG>=2);

# Pack the AF_INET magic number, the port, and the (already packed) IP
# addresses into the same format as the C structure would use. Note
# this is architecture dependent: this pack format works for 32 bit
# architectures.
$sockaddr="S n a4 x8";
$this=pack($sockaddr, $AF_INET, 0, $thisaddr);
$that=pack($sockaddr, $AF_INET, $port, $thataddr);

# Create the socket and connect.
if(socket(S, $AF_INET, $SOCK_STREAM, $proto) == false){
   print "ERROR: Cannot create socket.\n";
   return "";
}
print "Socket OK\n" if($DEBUG>=2);
if(connect(S, $that) == false){
   print "ERROR: Cannot connect to server $server_host,
       port $port.\n";
   return "";
}
print "Connect OK\n" if($DEBUG>>>>>>>>=2);

# Turn buffering in the socket off, and send request to the server.
select(S); $| = 1; select(STDOUT);
print S "GET /$document HTTP/1.0\n\n";

# Receive the response. Check to ensure the response is of MIME
# type text/html or text/plain.
$page = "";
$header = 1;
$header_text = "";
while(<S>){

   # Check if we've hit the end of the HTTP header (an empty
line).
   # If we have, check for a content-type header line, and
ensure
   # it is valid.
   if( m|^[\n\r]*$| ){
      $header = 0;
      ($content) = $header_text =~ m|Content-type: (\S+)|i;
      if($content ne "text/html" && $content ne "text/plain"){
         print "Content type '$content' ignored.\n"
            if($DEBUG>=1);
         last;
      }
   }
   # Save to a header string if we're still working on the HTTP
   # header.
   elsif($header == 1){
      $header_text .= "   " . $_;
   }
   # Otherwise, save to the html page string.
   else{
      $page .= $_;
   }

print "HTTP header: \n $header_text" if($DEBUG>=2);
return $page;
}
Archive Index Issue Table of Contents Article