use strict;
use warnings;

use LWP;

# Tiny search engine: fetch a fixed list of pages, build an inverted index
# from the first paragraph of each page, then open the best match for the
# query given as the first command-line argument.
#
# NOTE(review): in the file as found, the tag portions of the three HTML
# patterns (<body>, <p>, <br>) were missing and the patterns did not compile.
# They are restored here from the surviving closing tags and comments.

# Inverted index shared by all subs: word => array ref of URLs. A URL is
# listed once per occurrence of the word, so repeated words weigh more.
our %index;

# get_from_web($url)
# Fetches $url over HTTP and returns the raw response body.
# Dies with the HTTP status line when the request is not successful.
sub get_from_web {
    my ($url) = @_;

    my $bot = LWP::UserAgent->new;
    $bot->agent("Mozilla Firefox 4.0");

    my $response = $bot->get($url);
    die "Can't get $url -- ", $response->status_line
        unless $response->is_success;

    return $response->content;
}

# _normalize_words($text)
# Lower-cases $text, splits it on runs of whitespace, strips '.' and ','
# from every word and drops words that end up empty. Returns the word list
# (empty for undefined input). Used for both page text and queries so that
# the two are always comparable.
sub _normalize_words {
    my ($text) = @_;
    return () unless defined $text;

    my @words;
    for my $word ( split ' ', lc $text ) {
        $word =~ s/[.,]//g;
        push @words, $word if length $word;
    }
    return @words;
}

# add_to_index($url, $html)
# Adds the words of the first <p>...</p> inside <body>...</body> of $html
# to %index under $url. Pages without such a paragraph contribute nothing.
# Prints every indexed word and then the whole index (debug output).
sub add_to_index {
    my ( $url, $html ) = @_;

    my $content = '';
    if ( defined $html && $html =~ m{<body\b[^>]*>(.*?)</body>}si ) {
        my $body = $1;

        # Only the first paragraph is indexed.
        if ( $body =~ m{<p\b[^>]*>(.*?)</p>}si ) {
            $content = $1;

            # A line break separates words; turn it into whitespace instead
            # of deleting it so neighbouring words are not glued together.
            $content =~ s{<br\b[^>]*>}{ }gi;
        }
    }

    for my $word ( _normalize_words($content) ) {
        print "$word\n";
        push @{ $index{$word} }, $url;    # autovivifies the list on first use
    }

    dump_hash( \%index );
    print "\n\n\n\n\n";
    return;
}

# query_in_string($match, @words)
# Returns how many elements of @words are string-equal to $match.
sub query_in_string {
    my ( $match, @words ) = @_;
    return scalar grep { $_ eq $match } @words;
}

# _score_query($query)
# Returns a hash ref url => score. Each index entry for a word contributes
# the number of times that word appears in the (normalized) query.
sub _score_query {
    my ($query) = @_;

    my @words_in_query = _normalize_words($query);
    my %score_for;
    for my $word ( keys %index ) {
        my $match_count = query_in_string( $word, @words_in_query );
        next unless $match_count > 0;
        $score_for{$_} += $match_count for @{ $index{$word} };
    }
    return \%score_for;
}

# _rank_urls($score_for)
# Returns the URLs of the given url => score hash ref, best score first.
# Scores are compared numerically; ties are broken by URL so the order is
# deterministic.
sub _rank_urls {
    my ($score_for) = @_;

    my @ranked = sort {
        $score_for->{$b} <=> $score_for->{$a}
            or $a cmp $b
    } keys %{$score_for};
    return @ranked;
}

# search($query)
# Prints the score of every page matching $query, then prints the best
# match and opens it in Firefox. Does nothing further when no page matches.
sub search {
    my ($query) = @_;

    my $score_for = _score_query($query);
    for my $url ( keys %{$score_for} ) {
        print " $url :: " . $score_for->{$url} . " \n";
    }

    my ($best_url) = _rank_urls($score_for);
    return unless defined $best_url;

    print "\n\n$best_url\n";

    # List form bypasses the shell, so the URL is never interpreted by it.
    system( 'open', '-a', 'firefox', $best_url ) == 0
        or warn "Could not open $best_url in firefox\n";
    return;
}

# dump_hash($hash_ref)
# Prints every key of a word => [urls] hash with its URL list (debug aid).
sub dump_hash {
    my ($hash_ref) = @_;

    for my $key ( keys %{$hash_ref} ) {
        print "[ " . $key . " ] => ( ";
        print "$_ , " for @{ $hash_ref->{$key} };
        print " )\n";
    }
    return;
}

######################### MAIN #####################################

# main($input)
# Indexes the fixed page list, then searches it for $input.
sub main {
    my ($input) = @_;

    my @urls = (
        "http://localhost/~vikram/codechef/page1.htm",
        "http://localhost/~vikram/codechef/page2.htm",
        "http://localhost/~vikram/codechef/page3.htm",
        "http://localhost/~vikram/codechef/page4.htm",
    );

    for my $url (@urls) {
        add_to_index( $url, get_from_web($url) );
    }

    search($input);
    return;
}

# Crawl only when run as a script, so the subs can be loaded with
# require/do (e.g. for testing) without touching the network.
main( $ARGV[0] ) unless caller;