# XML::Document
# - represents an XML document
# - keeps track of elements in an array
# - keeps a list of Cursors, which are kept valid even when the document is modified

package WWW::Extractor::XML::Document;

use strict;
use vars qw(@ISA);

use WWW::Extractor::Generic::List;
use WWW::Extractor::XML::Element;
use WWW::Extractor::XML::Cursor;
use WWW::Extractor::XML::DocumentType;
use HTML::TokeParser;
use WWW::Extractor::Generic::Predicates qw(:all);
use WWW::Extractor::XML::Predicates qw(:all);


@ISA = qw|WWW::Extractor::Generic::List|;

sub new {
    my $class = shift;
    my ($doctype) = @_;
    my $self = SUPER::new $class();
    $self->{marks}{cursor} = new WWW::Extractor::XML::Cursor;
    $self->{doc_type} = $doctype || new WWW::Extractor::XML::DocumentType;
    return $self;
}

# the insert method makes sure that tags are balanced (and hacks
# around the cursor a little to do so)
sub insert {
    my ($self, $e) = @_;
    # text elements are trimmed, and omitted if null
    if ($e->type eq 'T') {
	# trim the text
	for ($e->{text}) {
	    s/\n+$//;
	    s/^\n+//;
	}
	# omit if null
	return if $e->{text} eq '';
    }

    if ($e->type eq 'S' || $e->type eq 'E') {
	my $tag = $e->tag;
	# register as a container if it's unknown
	$self->{doc_type}->register_container($tag);
    }
    
    $self->SUPER::insert($e);

    # this calls next on the cursor, thus dropping the newly added
    # element from the stack
}

# parse text and insert the elements into the document
# takes a reference to a string
sub parse {
    my ($self, $content) = @_;

    # remove return characters from content if we have lf|cr or cr|lf
    for ($$content) {
	s/\r//g if /\n\r|\r\n/;
    }

    my $tokenizer = new HTML::TokeParser($content);

    # first pass - read in the document and note any previously unknown tags
    # adjacent 'text' elements are joined together

    my @tokens = ();

    my $last_was_text = 0;
    while (defined (my $token = $tokenizer->get_token)) {
	push @tokens, $token;
	my $type = $token->[0];
	# check for new types of start or end tags -
	# we're careful not to overwrite existing weights
	if ($type eq 'S') {
	    my $tag = $token->[1];
	    $self->{doc_type}->register_lonely($tag);
	    # a new type of tag is lonely, until proved otherwise!
	} elsif ($type eq 'E') {
	    my $tag = $token->[1];
	    $self->{doc_type}->register_container($tag);
	    # if we've seen the end tag, we know it is a container
        } elsif ($type eq 'T') {
	    if ($last_was_text) {
	        my $text_el = pop @tokens;
		$tokens[-1]->[1] .= $text_el->[1];
	    }
        }
	$last_was_text = $type eq 'T';
    }

    # second pass - create XML::Element objects and insert into the document
    # lonely tags are differentiated from start tags

    for my $token (@tokens) {
	$token->[0] = 'L'
	    if $token->[0] eq 'S' && $self->{doc_type}->tag_weight($token->[1]) == 0;
	my $e = new WWW::Extractor::XML::Element(@$token);
	$self->insert($e);
    }

    1
}

#sub as_string {} - indents containers
#we really should be using streams or something, not writing straight to a string

sub as_string {
    my $self = shift;
    my $str = '';

    my $old_pos = $self->mark;

    $self->reset;
    while ($self->next) {
        $str .= "\t" x @{$self->cursor->{stack}} .  $self->item->as_string . "\n";
    }

    $self->goto($old_pos);
    $self->forget($old_pos);
    return $str;
}

sub find {
    my ($self, $test, @rest) = @_;
    return $self->SUPER::find(XMLWrap($test), @rest);
}

# this is a bit dodgy at the moment...
sub container_start {
    my ($self, $tag, $test) = @_;

    if (defined $test) {
	$test = In($tag, $test);
	$self->find($test) or return;
    }

    $tag = Tag($tag); # should be Start, but one vendor has broken form tags

    # move out to the <form> tag if necessary
    return $self->test($tag) || $self->out($tag) || $self->find($tag);
}

# read multiple fields from different lines


# end_test
# pre: current element is a start tag (need not 'has_end')
# returns: a function which tests if a certain element is the end of the container
# - will tolerate badly formed html,
# e.g, <td>fred<td>wilma

sub end_test {
    my $self = shift;
    my $start = $self->item;

    return Not(In($start)) if $start->has_end;

    # find next heavier-or-same component on stack
    my $stack = $self->cursor->{stack};
    my $start_weight = $self->{doc_type}->tag_weight($start->tag);

    my $outer = undef;

    for (my $i = $#{$stack}; $i>=0; --$i) {
	$outer = $stack->[$i];
	last if ($self->{doc_type}->tag_weight($outer->tag) >= $start_weight);
    }

    return Eq(undef) unless defined $outer;

    return Or(Not(In($outer)), And(Under($outer), Tag($start->tag)));
}

1
