# WWW::Extractor
# facade of functions over the WWW::Extractor::*
# web-page navigation and processing objects
# Sam Watkins
# version 1.1

package WWW::Extractor;

$VERSION = '1.1';

use strict;
use vars qw($VERSION @ISA %EXPORT_TAGS @document_facade @document_facade_can_die $agent $content $document);
use Carp;

use WWW::Extractor::Generic::Table;
use WWW::Extractor::UserAgent;
use WWW::Extractor::HTML;
use WWW::Extractor::Text;
use WWW::Extractor::XML::Predicates qw(:all);
use URI::Escape;

require Exporter;
require AutoLoader;
@ISA = qw(Exporter AutoLoader);

# these facade functions are built automatically!
@document_facade =
qw(test next prev position reset);

@document_facade_can_die =
qw(find out enter form_data select read_line read_lines);

%EXPORT_TAGS = ( all=>[
    @{$WWW::Extractor::XML::Predicates::EXPORT_TAGS{all}},
    @document_facade,
    @document_facade_can_die,
    qw(escape http_verbose http_simple http_proxy cookie
       get post open_url save load_html load_text
       set_html set_text
       content document agent
       follow_link submit read_table),
] );
Exporter::export_ok_tags(qw(all));

$agent = new WWW::Extractor::UserAgent;

sub escape {
    uri_escape(shift, "^A-Za-z0-9");
}
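
# e.g. escape('a b&c') returns 'a%20b%26c' - everything except
# ASCII letters and digits is percent-encoded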

sub http_proxy {
    $agent->proxy('http', shift);
}

sub http_verbose {
    $agent->verbose(@_);
}

sub http_simple {
    $agent->simple(@_);
}

# set a cookie: call with $version, $key, $val, $path, $domain, [ $port, $path_spec, $secure, $maxage, $discard, \%rest ]
sub cookie {
    $agent->cookie_jar->set_cookie(@_);
}
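
# e.g. a simple session cookie (the values are illustrative only):
#   cookie(0, 'session', 'abc123', '/', '.example.com');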

sub parse {
    $document = new WWW::Extractor::HTML;
    $document->parse($content)
	or croak 'cannot parse document';
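    # the & form makes Perl call our generated facade sub,
    # not the reset builtin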
    &reset;
}

sub get {
    open_url(@_);
}

sub post {
    push @_, [] if @_ == 1;
    open_url(@_, 'POST');
}

sub open_url {
    $content = $agent->open(@_)
	or croak "cannot open website: $_[0]";
    parse;
}
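
# e.g. a GET, then a POST with form data (URL and fields are hypothetical):
#   get 'http://www.example.com/login';
#   post 'http://www.example.com/login', [user => 'me', pass => 'secret'];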

sub save {
    my $filename = shift;
    local *FILE;
    open FILE, ">$filename"
	or croak "cannot save file: $filename";
    print FILE &content;
    close FILE;
}

sub load_html {
    set_html(load(@_));
}

sub load_text {
    set_text(load(@_));
}

# common loading of text - this is not public
sub load {
    my $filename = shift;
    local ($/, *FILE);
    open FILE, $filename
	or croak "cannot load file: $filename";
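    # $/ is localized (undef) above, so <FILE> slurps the whole file;
    # return a reference to the content rather than copying it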
    my $content = \<FILE>;
    return $content;
}

sub set_text {
    my $text = shift;
    undef $document;
    $content = new WWW::Extractor::Text(ref $text ? $$text : $text);
}

sub set_html {
    my $text = shift;
    $content = ref $text ? $text : \$text;
    parse;
}

sub follow_link {
    my $test = shift;
    my $url = $document->link($test)
	or croak "cannot follow link : $test";
    open_url($url);
}

sub submit {
    my ($url, $query, $method) = $document->submit(@_)
	or croak 'cannot submit form';
    open_url($url, $query, $method);
}

sub read_table {
    my $table;

    # extract a table from the HTML, or from plain text if not an HTML document

    $table = ($document or $content)->read_table(@_)
	or croak 'cannot read table';
    $table->trim;
    return $table;
}

sub content { $$content }

sub document { $document }

sub agent { $agent }

# facade over document and textual content - generates subroutines automatically!
# note - this won't work for functions returning arrays at the moment.

for my $sub (@document_facade) {
    eval <<End;
sub $sub {
    return (\$document or \$content)->$sub(\@_);
}
End
}

for my $sub (@document_facade_can_die) {
    eval <<End;
sub $sub {
    if (wantarray) {
	my \@ret = (\$document or \$content)->$sub(\@_);
	\@ret or croak "cannot '$sub' on document";
	return \@ret;
    } else {
	my \$ret = (\$document or \$content)->$sub(\@_);
	\$ret or croak "cannot '$sub' on document";
	return \$ret;
    }
}
End
}
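
# for illustration, the second loop above generates subs of this shape:
#
#   sub find {
#       if (wantarray) {
#           my @ret = ($document or $content)->find(@_);
#           @ret or croak "cannot 'find' on document";
#           return @ret;
#       } else {
#           my $ret = ($document or $content)->find(@_);
#           $ret or croak "cannot 'find' on document";
#           return $ret;
#       }
#   }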

1;
__END__

=head1 NAME

WWW::Extractor - navigate through web pages and extract fields and tables from HTML and plain text.

=head1 SYNOPSIS

  use WWW::Extractor ':all';

webpage navigation:
  http_proxy('http://proxy.schools.net.au:3128');
  open_url 'http://www.reporting.net/networks/affiliates/bf_login',
           [username_in => 'schoolsnet', password_in => 'dermott'];
  follow_link 'mp3';
  enter 'Reseller ID', 377;
  select 'Browser';
  submit;

accessing content:
  load_text 'amazon.txt';
  load_html 'chaosmusic.html';
  save 'chaosmusic.html';
  print content;
  $document = document;

document navigation:
  the facade functions test, next, prev, position and reset operate
  on the current document; find, out, enter, form_data, select,
  read_line and read_lines do too, but croak on failure.

field extraction:
  ($from_date, $to_date) = read_line
      'For the week of', 'through';

  ($nref, $nret, $amount) = read_lines
      'Number of people that have referred and entered this site:',
      'Number of people that have returned after a referral:',
      'Amount Sold (AUS$):';

table extraction:
  $table = read_table 'Report of Sales';

  $table = read_table <<'End';
ITEM CODE, HITS,                      TITLE,
                  DIR,  NDIR, YOUR FEE,       DISCOUNT,             LIST PRICE
,
---------- ------ ----- ----- ------- --------------------------------------
,
########## ######                      @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
                  ##### ##### #######    sold at ##% off list price of #############
,
---------- ------ ----- ----- ------- --------------------------------------
End

table processing:
  drop_head_foot $table 0, 2;
  cut $table 'Transaction Date', '# Orders', 'Net Sales';
  cut $table 2..6;

=head1 DESCRIPTION

The Extractor module is a facade over a group of other modules; it makes it
easy to write scripts that extract data from web pages and plain text documents.

It is not well documented yet.
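
For example, a short script in the style of the SYNOPSIS (the URL and
field labels are hypothetical):

  use WWW::Extractor ':all';

  open_url 'http://www.example.com/sales';
  my ($from, $to) = read_line 'For the week of', 'through';
  my $table = read_table 'Report of Sales';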

=head1 AUTHOR

Sam Watkins

=head1 SEE ALSO

WWW::Extractor::UserAgent, WWW::Extractor::HTML, WWW::Extractor::Text, WWW::Extractor::Generic::Table,
WWW::Extractor::Generic::Predicates, WWW::Extractor::XML::Predicates

=cut
