# WWW::Extractor
# facade of functions over the WWW::Extractor::*
# web-page navigation and processing objects
# Sam Watkins
# version 1.1

package WWW::Extractor;

$VERSION = '1.1';

use strict;
use vars qw($VERSION @ISA %EXPORT_TAGS @document_facade @document_facade_can_die $agent $content $document);
use Carp;

use WWW::Extractor::Generic::Table;
use WWW::Extractor::UserAgent;
use WWW::Extractor::HTML;
use WWW::Extractor::Text;
use WWW::Extractor::XML::Predicates qw(:all);
use URI::Escape;

require Exporter;
require AutoLoader;
@ISA = qw(Exporter AutoLoader);

# these facade functions are built automatically!
@document_facade =
qw(test next prev position reset);

@document_facade_can_die =
qw(find out enter form_data select read_line read_lines);

%EXPORT_TAGS = ( all=>[
    @{$WWW::Extractor::XML::Predicates::EXPORT_TAGS{all}},
    @document_facade,
    @document_facade_can_die,
    qw(escape http_verbose http_simple http_proxy cookie
       get post open_url save load_html load_text
       set_html set_text
       content document agent
       follow_link submit read_table),
] );
Exporter::export_ok_tags(qw(all));

$agent = new WWW::Extractor::UserAgent;

sub escape {
    uri_escape(shift, "^A-Za-z0-9");
}
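
# e.g. escape('a b&c') returns 'a%20b%26c' - everything except
# ASCII letters and digits is percent-encoded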

sub http_proxy {
    $agent->proxy('http', shift);
}

sub http_verbose {
    $agent->verbose(@_);
}

sub http_simple {
    $agent->simple(@_);
}

# set a cookie: call with $version, $key, $val, $path, $domain, [ $port, $path_spec, $secure, $maxage, $discard, \%rest ]
sub cookie {
    $agent->cookie_jar->set_cookie(@_);
}
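
# e.g. a simple session cookie (the values are illustrative only):
#   cookie(0, 'session', 'abc123', '/', '.example.com');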

sub parse {
    $document = new WWW::Extractor::HTML;
    $document->parse($content)
	or croak 'cannot parse document';
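    # the & form makes Perl call our generated facade sub,
    # not the reset builtin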
    &reset;
}

sub get {
    open_url(@_);
}

sub post {
    push @_, [] if @_ == 1;
    open_url(@_, 'POST');
}

sub open_url {
    $content = $agent->open(@_)
	or croak "cannot open website: $_[0]";
    parse;
}
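
# e.g. a GET, then a POST with form data (URL and fields are hypothetical):
#   get 'http://www.example.com/login';
#   post 'http://www.example.com/login', [user => 'me', pass => 'secret'];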

sub save {
    my $filename = shift;
    local *FILE;
    open FILE, ">$filename"
	or croak "cannot save file: $filename";
    print FILE &content;
    close FILE;
}

sub load_html {
    set_html(load(@_));
}

sub load_text {
    set_text(load(@_));
}

# common loading of text - this is not public
sub load {
    my $filename = shift;
    local ($/, *FILE);
    open FILE, $filename
	or croak "cannot load file: $filename";
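    # $/ is localized (undef) above, so <FILE> slurps the whole file;
    # return a reference to the content rather than copying it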
    my $content = \<FILE>;
    return $content;
}

sub set_text {
    my $text = shift;
    undef $document;
    $content = new WWW::Extractor::Text(ref $text ? $$text : $text);
}

sub set_html {
    my $text = shift;
    $content = ref $text ? $text : \$text;
    parse;
}

sub follow_link {
    my $test = shift;
    my $url = $document->link($test)
	or croak "cannot follow link : $test";
    open_url($url);
}

sub submit {
    my ($url, $query, $method) = $document->submit(@_)
	or croak 'cannot submit form';
    open_url($url, $query, $method);
}

sub read_table {
    my $table;

    # extract a table from the HTML, or from plain text if not an HTML document

    $table = ($document or $content)->read_table(@_)
	or croak 'cannot read table';
    $table->trim;
    return $table;
}

sub content { $$content }

sub document { $document }

sub agent { $agent }

# facade over document and textual content - generates subroutines automatically!
# note - this won't work for functions returning arrays at the moment.

for my $sub (@document_facade) {
    eval <<End;
sub $sub {
    return (\$document or \$content)->$sub(\@_);
}
End
}

for my $sub (@document_facade_can_die) {
    eval <<End;
sub $sub {
    if (wantarray) {
	my \@ret = (\$document or \$content)->$sub(\@_);
	\@ret or croak "cannot '$sub' on document";
	return \@ret;
    } else {
	my \$ret = (\$document or \$content)->$sub(\@_);
	\$ret or croak "cannot '$sub' on document";
	return \$ret;
    }
}
End
}
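
# for illustration, the second loop above generates subs of this shape:
#
#   sub find {
#       if (wantarray) {
#           my @ret = ($document or $content)->find(@_);
#           @ret or croak "cannot 'find' on document";
#           return @ret;
#       } else {
#           my $ret = ($document or $content)->find(@_);
#           $ret or croak "cannot 'find' on document";
#           return $ret;
#       }
#   }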

1;
__END__

=head1 NAME

WWW::Extractor - navigate through web pages and extract fields and tables from HTML and plain text.

=head1 SYNOPSIS

  use WWW::Extractor ':all';

webpage navigation:
  http_proxy('http://proxy.schools.net.au:3128');
  open_url 'http://www.reporting.net/networks/affiliates/bf_login',
           [username_in => 'schoolsnet', password_in => 'dermott'];
  follow_link 'mp3';
  enter 'Reseller ID', 377;
  select 'Browser';
  submit;

accessing content:
  load_text 'amazon.txt';
  load_html 'chaosmusic.html';
  save 'chaosmusic.html';
  print content;
  $document = document;

document navigation:
  the facade functions test, next, prev, position and reset operate
  on the current document; find, out, enter, form_data, select,
  read_line and read_lines do too, but croak on failure.

field extraction:
  ($from_date, $to_date) = read_line
      'For the week of', 'through';

  ($nref, $nret, $amount) = read_lines
      'Number of people that have referred and entered this site:',
      'Number of people that have returned after a referral:',
      'Amount Sold (AUS$):';

table extraction:
  $table = read_table 'Report of Sales';

  $table = read_table <<'End';
ITEM CODE, HITS,                      TITLE,
                  DIR,  NDIR, YOUR FEE,       DISCOUNT,             LIST PRICE
,
---------- ------ ----- ----- ------- --------------------------------------
,
########## ######                      @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
                  ##### ##### #######    sold at ##% off list price of #############
,
---------- ------ ----- ----- ------- --------------------------------------
End

table processing:
  drop_head_foot $table 0, 2;
  cut $table 'Transaction Date', '# Orders', 'Net Sales';
  cut $table 2..6;

=head1 DESCRIPTION

The Extractor module is a facade over a group of other modules; it makes it
easy to write scripts that extract data from web pages and plain text documents.

It is not well documented yet.
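
For example, a short script in the style of the SYNOPSIS (the URL and
field labels are hypothetical):

  use WWW::Extractor ':all';

  open_url 'http://www.example.com/sales';
  my ($from, $to) = read_line 'For the week of', 'through';
  my $table = read_table 'Report of Sales';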

=head1 AUTHOR

Sam Watkins

=head1 SEE ALSO

WWW::Extractor::UserAgent, WWW::Extractor::HTML, WWW::Extractor::Text, WWW::Extractor::Generic::Table,
WWW::Extractor::Generic::Predicates, WWW::Extractor::XML::Predicates

=cut
