# (C) 2003 Dan Lenski
#
package WWW::Google::News;

use strict;
use warnings;

use CGI::Util qw(escape);
use HTTP::Request::Common;
use LWP::UserAgent;
use XML::RSS;
use POSIX qw(strftime);

# new()
#
# Constructor.  Creates the LWP::UserAgent used for fetching (10 second
# timeout, fixed agent string) and initialises the per-search state slots
# (results, url, query, lastbuilddate) to undef.  May be called on a class
# name or on an existing instance.
sub new {
    my $class = shift;
    my $self  = bless {}, ref $class || $class;

    $self->{ua} = LWP::UserAgent->new(
        timeout => 10,
        agent   => "Elinks (0.3; Unix)",
    );
    $self->{results}       = undef;
    $self->{url}           = undef;
    $self->{query}         = undef;
    $self->{lastbuilddate} = undef;

    return $self;
}

# results()
#
# Returns whatever the last search() stored: an array reference of item
# hashes, or undef if no search has stored anything yet.
sub results {
    my $self = shift;
    return $self->{results};
}

# resultsRSS()
#
# Renders the stored search state as an RSS 0.91 document and returns it as
# a string.  Channel title/description are built from the stored query, the
# channel link is the stored search URL, and one RSS item is added per
# stored result (title, link, desc).
sub resultsRSS {
    my $self = shift;

    # Before any search() the query is undef; render it as an empty string
    # instead of interpolating undef.
    my $query = defined $self->{query} ? $self->{query} : '';

    my $rss = XML::RSS->new( version => '0.91' );
    $rss->channel(
        title       => "Google News Search: $query",
        link        => $self->{url},
        description => "Google News Search: $query",
        pubDate     => $self->{lastbuilddate},
        webMaster   => 'moxfyre@geocities.com',
    );

    # A named loop variable is used because $_ is global and the XML::RSS
    # calls in the loop body are outside this file's control.  The "|| []"
    # keeps a call made before search() from touching the results slot.
    foreach my $item ( @{ $self->{results} || [] } ) {
        $rss->add_item(
            title       => $item->{title},
            link        => $item->{link},
            description => $item->{desc},
        );
    }

    return $rss->as_string;
}

# search($query, $num)
#
# Fetches a Google News result page for $query (with $num passed through as
# the "num" URL parameter) and scrapes it into a list of items.
#
# Side effects: stores the query, the request URL and a build date on the
# object before fetching; on a successful fetch, replaces the stored results.
#
# Returns an array reference of hashes with keys link, title, source, when
# and desc (source/when are undef when the optional part of the item pattern
# did not match).  Returns nothing (undef in scalar context, empty list in
# list context) when the HTTP request is not successful.
sub search {
    my $self = shift;
    my ( $query, $num ) = @_;

    ##############################################
    # Screen-scraping code begins

    # Missing arguments contribute an empty string to the URL.
    my $url = "http://news.google.com/news?scoring=d&hl=en&q="
        . ( defined $query ? escape($query) : '' )
        . "&num="
        . ( defined $num ? escape($num) : '' );

    # NOTE(review): in the copy of this file under review the literal HTML
    # tags had been stripped out of this pattern and out of the page-body
    # pattern further down; only the capture groups survived.  The tags
    # written here are a best-effort reconstruction around those surviving
    # groups -- confirm both patterns against an upstream copy (and against
    # the markup actually served) before relying on them.
    # Captures: 1 = link, 2 = title, 3 = source, 4 = when, 5 = description.
    my $item_re = qr{
        <a \s+ href=([^>]+)>(.+?)</a><br>
        (?: <font[^>]*>([^<]+)</font> \s* - \s* ([^<]+)<br> )?
        (.*?)<br>
    }sx;

    $self->{query} = $query;
    $self->{url}   = $url;

    # The timestamp comes from gmtime, so the zone is written literally:
    # %Z would report the local zone's name for a UTC time.  %d is used
    # rather than the space-padded, less portable %e.
    $self->{lastbuilddate} = strftime( '%a, %d %b %Y %H:%M:%S GMT', gmtime );

    # get Google news page
    my $response = $self->{ua}->request( GET($url) );
    return unless $response->is_success;

    # Narrow the page down to the region holding the result items.
    # NOTE(review): reconstructed delimiters, see the note on $item_re.
    my ($content) = $response->content =~ m{<table[^>]*>(.*)</table>}s;

    my $results = $self->{results} = [];

    # No recognisable result region: report an empty result set instead of
    # scanning a capture variable that was never set.
    return $results unless defined $content;

    while ( $content =~ /$item_re/g ) {
        my ( $link, $title, $source, $when, $desc ) = ( $1, $2, $3, $4, $5 );

        # Strip inline markup, flatten newlines and trim both text fields.
        for my $text ( $title, $desc ) {
            $text =~ s[<.+?>][]g;
            $text =~ tr[\n][ ];
            $text =~ s/^\s+//;
            $text =~ s/\s+$//;
        }

        push @$results,
            {
            link   => $link,
            title  => $title,
            source => $source,
            when   => $when,
            desc   => $desc,
            };
    }

    # Screen-scraping code ends
    ##############################################

    return $results;
}

1;