#!/bin/sh

htmlsplit() {
	perl -e '$_ = join "",<STDIN>; tr/\n\r \t/ /s; s/</\n</g; s/>/>\n/g; s/\n ?\n/\n/g; s/^ ?\n//s; s/ $//s; print'
}

html2txt() {
	tr -d '' | htmlsplit | grep -v '^<' |
	perl -pe 's/^\s+//; s/&quot;/"/g; s/&amp;/\&/g; s/&lt;/</g; s/&gt;/>/g; s/&#(\d+);/chr($1)/ge;' |
	tr -s '
' | tr -s ' ' | perl -pe 's/&nbsp;/ /g;'
}

pair_lines() {
	perl -e '
		$n = $ARGV[0] || 2;
		while(defined ($_=<STDIN>)) {
			chomp;
			push @q, $_;
			if (@q == $n) {
				print join "\t", @q;
				print "\n";
				shift @q;
			}
		}
	'
}



# Main pipeline: print word n-gram frequencies for an HTML document.
#
# Arguments:
#   $1 (N)   - handed to pair_lines as the n-gram size (may be empty/omitted).
#   $2 (URL) - when non-empty the page is fetched with wget; otherwise the
#              HTML is read from stdin.
# Environment:
#   INCLUDE_COMMON_WORDS - when non-empty, the stop-word filter is skipped.
# Output: "uniq -c" lines (count followed by the tab-joined words), highest
#         count first.
N="$1"
URL="$2"
if [ -n "$URL" ]; then
	wget -O - -- "$URL"
else
	cat
fi |
htmlsplit |
# Keep only what follows the opening <BODY> tag (nothing is printed if the
# document has none).
perl -ne 'print if $in_body ||= /^<BODY\b/i' |
html2txt |
# One alphanumeric word per line.
tr -sc 'a-zA-Z0-9' '\n' |
# When the text starts with a non-alphanumeric character the tr above emits an
# empty first line; drop it so it is not counted as a word.
grep . |
tr 'A-Z' 'a-z' |
if [ -z "$INCLUDE_COMMON_WORDS" ]; then
	# Drop common English words and any single-character token.
	grep -v -w \
		-e a -e the -e and -e in -e on -e at -e for -e by -e with \
		-e are -e is -e am -e you -e they -e it -e my -e to -e be \
		-e we -e us -e or -e of -e s -e '^.$' -e do -e but -e from \
		-e our -e your -e not
else
	cat
fi |
pair_lines "$N" |
sort | uniq -c | sort -rn
