#!/usr/bin/perl -w
use strict;
use warnings;

use XML::Parser;
use Text::CSV;
use Data::Dumper;
use Getopt::Long;
use File::Basename;

sub usage {
	my $prog = basename($0);
	die <<END
usage: $prog <input.xml >output.csv

options:
	-strip n    strip header prefix
	-stripns    strip namespace from header
	-quoty      quote every not-null value
	-multi a,b  tags a,b appear multiple times in a container
	-index      for multi tags, include an index column

note:
	Rows are printed for each inner 'multi' tag only.
END
}

my $strip = undef;
my $stripns = undef;
my $quoty = 0;
my $multi = undef;
my $index = 0;

sub get_options {
	GetOptions("strip=i" => \$strip,
		   "stripns" => \$stripns,
		   "quoty" => \$quoty,
		   "multi=s" => \$multi,
		   "index" => \$index,
	) and @ARGV == 0 or usage();
	if (defined $multi) {
		$multi = [split /[ ,]+/, $multi]
	}
	return;
}

sub trim {
	my ($s) = @_;
	$s =~ s/\A\s+|\s+\z//gs if defined $s;
	return $s;
}

# Returns a nice sort of XML::Parser, and an empty output tree,
# which you can fill with ->parse or ->parsefile.
# This preserves the order of elements, unlike XML::Simple.
# It converts XML to S-expressions (LISP), using Perl arrays.
# Example: <foo> Hello <bar> World </bar> </foo>
#		  [['foo', 'Hello', ['bar', 'World']]]
# $opt{attr} means include attributes, like ['.id', 1234]
# TODO handle XML fragments (with zero or many top-level elements)
sub nice_xml_parser {
	my (%opt) = @_;

	my $tree = [];
	my @stack = ($tree);
	my $text = "";

	my $push_text = sub {
		if ($text =~ /\S/) {
			push @{$stack[-1]}, trim($text);
		}
		$text = "";
	};

	my $parser = XML::Parser->new(
		Handlers => {
			Start => sub {
				my ($p, $tag, @attr) = @_;
				&$push_text;
				my $e = [$tag];
				push @{$stack[-1]}, $e;
				push @stack, $e;
				if ($opt{attr}) {
					while (@attr) {
						my ($k, $v) = splice @attr, 0, 2;
						push @{$stack[-1]}, [".$k", $v];
					}
				}
			},
			End   => sub {
				my ($p, $tag) = @_;
				&$push_text;
				pop @stack;
				die "Malformed XML\n" if !@stack;
			},
			Char  => sub {
				my ($p, $text_part) = @_;
				$text .= $text_part;
			},
		}
	);
	return $parser, $tree;
}

sub row_add {
	my ($row, $name, $value, $headers, $header_ix) = @_;
	my $ix = $header_ix->{$name};
	if (!defined $ix) {
		push @$headers, $name;
		$ix = $header_ix->{$name} = $#$headers;
	}
	$row->[$ix] = $value;
};

sub xml_to_csv {
	my ($tree, $row, $count, $rows, $headers, $header_ix, $path) = @_;
	my ($name, @components) = @$tree;

	$path .= " " if $path ne "";
	$path .= $name;

	# has sub: clone row at start, add tag.i->++count, clone count hash
	# leaf / with text: add tag->text to current row
	# recurse
	# simple: output row at end

	my @sub = grep { ref } @components;
	my @text = grep { !ref } @components;
	my @aggregate_sub = grep { grep {ref} @$_ } @sub;
	my @leaf_sub = grep { !grep {ref} @$_ } @sub;
	my $is_multi = grep { $name eq $_ } @$multi;

	if ($is_multi) {
		$row = [@$row];
		if ($index) {
			row_add($row, "$path #" => ++$count->{$name}, $headers, $header_ix);
		}
		$count = {%$count};
	}
	if (@text) {
		row_add($row, $path => join(' ', @text), $headers, $header_ix);
	}
	my $output_count = 0;
	for my $sub (@leaf_sub, @aggregate_sub) {
		$output_count = xml_to_csv($sub, $row, $count, $rows, $headers, $header_ix, $path);
	}
	if ($is_multi && !$output_count) {
		push @$rows, $row;
		++$output_count;
	}
	return $output_count;
};

sub guess_multi {
	my ($tree, $multi) = @_;
	my ($name, @components) = @$tree;
	my @sub = grep { ref } @components;
	my @sub_names = map { $_->[0] } @sub;
	my %check;
	for my $name (@sub_names) {
		if ($check{$name}++ == 1 && !grep {$_ eq $name} @$multi) {
			push @$multi, $name;
		}
	}
	for my $sub (@sub) {
		guess_multi($sub, $multi);
	}
	return $multi;
}

sub guess_strip {
	my ($headers) = @_;
	my $strip = 0;
	my @test = @$headers;
	while (1) {
		last if grep {!/ /} @test or !@test;
		s/.*? // for @test;
		my %check;
		my $okay = 1;
		for (@test) {
			if ($check{$_}++) {
				$okay = 0;
				last;
			}
		}
		if ($okay) {
			++$strip;
		} else {
			last;
		}
	}
	return $strip;
}

sub main {
	get_options();

	my ($parser, $tree) = nice_xml_parser(attr=>1);
	$parser->parse(\*STDIN);

	my $csv = Text::CSV->new({ binary => 1, always_quote => $quoty, blank_is_undef => $quoty })
		or die "Cannot use CSV: ".Text::CSV->error_diag;
	$csv->eol("\n");
	my $out = \*STDOUT; binmode($out, ":utf8");

	my @headers;
	my @rows;

	$tree = $tree->[0];

	if (!defined $multi) {
		$multi = guess_multi($tree, []);
		if (!@$multi) {
			$multi = [$tree->[0]];
		}
	}

	xml_to_csv($tree, [], {}, \@rows, \@headers, {}, "");

	if (!defined $strip) {
		$strip = guess_strip(\@headers);
	}

	if ($strip) {
		for (@headers) {
			my @words = split / /, $_;
			splice @words, 0, $strip;
			$_ = join ' ', @words;
		}
	}
	if ($stripns) {
		for (@headers) {
			s/\b\S+?://g;
		}
	}

	for my $r (\@headers, @rows) {
		if (@$r < @headers) {
			push @$r, (undef) x (@headers-@$r);
		}
		$csv->print($out, $r)
			or die "failed: \$csv->print: $!";
	}
	return;
}

main();
