use LWP;
# Download a page over HTTP and hand back its body.
#
# Arguments:
#   $url - address of the page to fetch.
# Returns:
#   the response body as reported by the response object's content() method.
# Dies:
#   with "Can't get <url> -- <status line>" when the request is not successful.
sub get_from_web {
    # Lexical rather than package-global, so a caller's variable of the same
    # name is never overwritten as a side effect of the call.
    my $url = shift;

    my $bot = LWP::UserAgent->new;
    $bot->agent("Mozilla Firefox 4.0");    # value sent as the User-Agent string

    my $response = $bot->get($url);
    die "Can't get $url -- ", $response->status_line
        unless $response->is_success;

    return $response->content;
}
# Tokenise a page and record its URL under every word in the package-global
# inverted index %index (word => reference to a list of URLs).
#
# Arguments:
#   $url  - address of the page; this is the value stored in the index.
#   $html - HTML source of the page.
#
# Only the text of the first <p>...</p> found inside <body>...</body> is
# indexed; a page without that structure contributes no words.  A URL is
# pushed once per occurrence of a word, so repeated words appear repeatedly
# in the word's URL list.
#
# Side effects: prints every indexed word on its own line, then dumps the
# whole index through dump_hash() and prints five newlines.
sub add_to_index {
    my ( $url, $html ) = @_;
    our %index;    # the shared package-global index, not a private copy

    my $content;

    # Text between <body ...> and </body>.
    if ( defined $html && $html =~ m{<body\b[^>]*>(.*?)</body>}is ) {
        my $body = $1;

        # Text of the first paragraph; \b keeps <pre> and friends from matching.
        if ( $body =~ m{<p\b[^>]*>(.*?)</p>}is ) {

            # Clean the text: lowercase it and turn <br> tags into spaces.
            # Line breaks and runs of spaces are handled by the whitespace
            # split below.  Replacing (not deleting) <br> and newlines keeps
            # the words on either side from being glued together.
            $content = lc $1;
            $content =~ s{<br\s*/?>}{ }g;
        }
    }

    # split ' ' ignores leading whitespace and treats any whitespace run as
    # a single separator, so it never yields empty fields.
    my @words = defined $content ? split( ' ', $content ) : ();

    for my $word (@words) {
        $word =~ s/[.,]//g;         # drop full stops and commas
        next if $word eq '';        # token consisted of punctuation only

        print "$word\n";
        push @{ $index{$word} }, $url;    # autovivifies the list on first use
    }

    dump_hash( \%index );
    print "\n\n\n\n\n";
    return;
}
# Count how many entries of a word list are string-equal to a given word.
#
# Arguments:
#   $match - the word to look for.
#   @words - the list to search (all remaining arguments).
# Returns:
#   the number of list entries that are `eq` to $match (0 when there are none).
sub query_in_string {
    my ( $match, @words ) = @_;

    # grep in scalar context yields the number of elements that passed.
    my $count = grep { $_ eq $match } @words;

    return $count;
}
# Score every indexed URL against a query and open the best one in Firefox.
#
# Arguments:
#   $query - whitespace-separated search words (may be undef or empty).
#
# For every word in the package-global %index that also occurs in the query,
# each URL listed under that word gains the number of times the word occurs
# in the query.  All scores are printed, then the highest-scoring URL is
# printed and handed to `open -a firefox`.  Nothing is opened when no URL
# matched.
sub search {
    my $query = shift;
    our %index;    # the shared package-global index: word => [ url, ... ]

    # Normalise the query the same way indexed words are normalised
    # (lowercase, '.' and ',' removed); otherwise such a query word could
    # never equal an index key.
    my @words_in_query;
    if ( defined $query ) {
        for my $term ( split ' ', lc $query ) {
            $term =~ s/[.,]//g;
            push @words_in_query, $term if $term ne '';
        }
    }

    my %result;
    for my $word ( keys %index ) {
        my $match_count = query_in_string( $word, @words_in_query );
        next unless $match_count > 0;

        $result{$_} += $match_count for @{ $index{$word} };
    }

    for my $url ( keys %result ) {
        print " $url :: " . $result{$url} . " \n";
    }

    # Scores are numbers: compare with <=> (cmp would rank 9 above 10).
    # Equal scores fall back to URL order so the winner is deterministic.
    my ($best) =
        sort { $result{$b} <=> $result{$a} or $a cmp $b } keys %result;

    if ( defined $best ) {
        print "\n\n$best\n";

        # List-form system() bypasses the shell, so characters such as
        # '&', '?' or ';' in a URL cannot be interpreted as shell syntax.
        system( 'open', '-a', 'firefox', $best ) == 0
            or warn "Could not open $best in firefox\n";
    }

    return;
}
# Print a hash of array references, one line per key, in the form
#   [ key ] => ( value , value ,  )
#
# Arguments:
#   $hash_ref - reference to a hash whose values are array references.
#
# Keys are printed in sorted order so the output is reproducible.  The hash
# is read through the reference; no copy is made and no package variables
# are touched.
sub dump_hash {
    my $hash_ref = shift;

    for my $key ( sort keys %{$hash_ref} ) {
        print "[ " . $key . " ] => ( ";
        print "$_ , " for @{ $hash_ref->{$key} };
        print " )\n";
    }

    return;
}
######################### MAIN #####################################
# The whole command line forms the query, so it can be given either as one
# quoted string or as several separate words.
my $input = join ' ', @ARGV;

our %index;    # inverted index filled by add_to_index() and read by search()

# Pages to download and index before the query is run.
my @urls = (
    "http://localhost/~vikram/codechef/page1.htm",
    "http://localhost/~vikram/codechef/page2.htm",
    "http://localhost/~vikram/codechef/page3.htm",
    "http://localhost/~vikram/codechef/page4.htm",
);

for my $url (@urls) {
    my $html_content = get_from_web($url);
    add_to_index( $url, $html_content );
}

search($input);