#!/usr/bin/perl

# striptag -- strip given tags out of HTML
# Copyright       : http://www.fsf.org/copyleft/gpl.html
# Created On      : Summer 2003
# Last Modified On: Thu Aug  5 05:59:14 2004
# Update Count    : 17
# Nominal Author  : Dan Jacobson -- http://jidanni.org/
# Actual brains   : Jonathan Stowe -- http://www.gellyfish.com/
# modified by Sam Watkins

use warnings; use strict;

use HTML::Entities;

# Whitelist of element names. A start or end tag whose name is not in
# this set is never printed.
my $ok_tags = set(
	# document skeleton
	qw(html head body title base),
	# block-level structure
	qw(p br hr center blockquote pre),
	qw(h1 h2 h3 h4 h5 h6),
	# links and images
	qw(a img),
	# tables
	qw(table th tr td),
	# inline emphasis
	qw(b i u em strong),
	# lists
	qw(ul ol li dl dt dd),
);

# Attributes accepted on every allowed tag, whatever the tag is.
# Deliberately empty: none have been needed so far.
my $ok_attr = set();

# Attributes accepted only on the named tag, as one set per tag.
# Tags without an entry here keep no tag-specific attributes.
my $ok_tag_attr = {
	a    => set(qw(href name)),
	base => set(qw(href)),
	img  => set(qw(src width height alt title)),
	td   => set(qw(colspan rowspan)),
};

# Elements whose entire content, i.e. everything between <foo> and
# </foo>, is discarded along with the tags themselves.
my $kill_containers = set(qw(script style));



use strict;
use warnings;
use HTML::Parser;

# Number of kill-container elements currently open. The handlers print
# nothing while this is non-zero.
my $in_dead_container = 0;

# Event handlers, each paired with the argument spec HTML::Parser should
# pass it. Only text, start tags and end tags are handled; no comments
# will get through because no default handler is installed:
#	default_h => [ sub    { print shift },        'text' ],
my %handlers = (
	start_h => [ \&start_tag, "self, tagname, attrseq, attr" ],
	end_h   => [ \&end_tag, "self, tagname, text" ],
	text_h  => [ \&text, "text" ],
);

my $parser = HTML::Parser->new(%handlers);

# Filter standard input; the handlers print whatever survives.
$parser->parse_file(*STDIN);

# text -- HTML::Parser text handler.
#   $chunk : the text of the event, exactly as handed over by the parser
# Prints the chunk unchanged unless we are currently inside a
# kill-container, in which case it is dropped.
sub text {
	my ($chunk) = @_;

	return if $in_dead_container;
	print $chunk;
}

# start_tag -- HTML::Parser start-tag handler.
#   $self    : the parser object (unused)
#   $tag     : element name as reported by the parser
#   $attrseq : array ref of the attribute names found on the tag
#   $attr    : hash ref mapping attribute name to its value
# Prints a rebuilt start tag when the element is whitelisted and we are not
# inside a kill-container.  Only attributes allowed globally ($ok_attr) or
# for this particular tag ($ok_tag_attr) are kept.  Any kill-container start
# tag increases the nesting depth, so its content is suppressed until the
# matching end tag.
sub start_tag {
	my ( $self, $tag, $attrseq, $attr ) = @_;

	if ($ok_tags->{$tag} && !$in_dead_container) {
		# Look the per-tag set up once.  The empty-hash fallback keeps the
		# nested lookup from autovivifying a new entry in $ok_tag_attr for
		# every allowed tag that has no attribute list of its own.
		my $tag_attrs = $ok_tag_attr->{$tag} || {};
		my @kept = grep { $ok_attr->{$_} || $tag_attrs->{$_} } @$attrseq;
		print format_start_tag($tag, \@kept, $attr);
	}
	if ($kill_containers->{$tag}) {
		++$in_dead_container;
	}
}

# end_tag -- HTML::Parser end-tag handler.
#   $self : the parser object (unused)
#   $tag  : element name as reported by the parser
#   $text : raw source text of the end tag (accepted for interface
#           compatibility, but deliberately not printed -- see below)
# Prints a closing tag when the element is whitelisted and we are not inside
# a kill-container.  A kill-container end tag reduces the nesting depth.
sub end_tag {
	my ( $self, $tag, $text ) = @_;

	# Emit a canonical close tag built from the whitelisted name instead of
	# echoing the source text: by default the parser tolerates attributes and
	# other junk inside end tags, and the raw text would pass that through.
	print "</$tag>" if $ok_tags->{$tag} && !$in_dead_container;

	# Never let the depth drop below zero.  A stray </script> or </style>
	# with no matching start tag would otherwise make the counter negative,
	# which is still true and would suppress all the output that follows.
	if ($kill_containers->{$tag} && $in_dead_container > 0) {
		--$in_dead_container;
	}
}

# set -- build a lookup set from a list of strings.
# Returns a hash ref in which every argument is a key with the value 1.
# Repeated arguments collapse into one key; no arguments gives an empty hash.
sub set {
	my %member;
	@member{@_} = (1) x @_;
	return \%member;
}

# format_start_tag -- render a start tag as a string.
#   $tag     : element name
#   $attrseq : array ref of attribute names to emit, in output order
#   $attr    : hash ref supplying the value for each attribute name
# Returns "<tag name="value" ...>".  Each value is passed through
# encode_entities before being wrapped in double quotes.
sub format_start_tag {
	my ($tag, $attrseq, $attr) = @_;

	my @parts = ("<$tag");
	for my $name (@$attrseq) {
		my $value = encode_entities($attr->{$name});
		push @parts, " $name=\"" . $value . '"';
	}
	return join('', @parts, '>');
}
