#!/usr/bin/perl

use strict;
use warnings;

our $PROGRAM = "dbidump_check_nulls";
our $VERSION = "2.0";
our $USAGE = "usage: $PROGRAM dump_dir real.schema db_name";

sub version {
	print <<End;
dbidump_check_nulls $VERSION
Copyright (C) 1997-2004 myinternet Limited.
Copyright (C) 2004-2005 Sam Watkins.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
End
	exit 0;
}

use IO::File;
use lib "/usr/lib/dbischema";
use XMLParser;

BEGIN {
	$^W = 0;
}
use Schema;
use Message;
BEGIN {
	$^W = 1;
}

$| = 1;

my ($dump_dir, $schema_file, $db_name) = @ARGV;

if (@ARGV) {
	if ($ARGV[0] eq "--version") { version(); }
	if ($ARGV[0] eq "--help") { help(); }
}

unless (@ARGV == 3 && -d $dump_dir && -f $schema_file) {
	print STDERR <<End;
$USAGE

  for help, try the --help option
End
	exit 1;
}

my $database = XMLParser->new->parsefile($schema_file)->database($db_name) || fatal "cannot find database `$db_name'";

print_header();

for my $table ($database->tables) {
	my $dumpfile = "$dump_dir/".$table->name.".dump";
	check_nulls($table, $dumpfile);
}

# this function escapes tabs to \t, newlines to \n, null to \0, \ to \\, and undef to null
sub escape_dump {
	my $v = shift;
	return "\0" unless defined $v;
	for ($v) {
		s/\\/\\\\/g;
		s/\t/\\t/g;
		s/\n/\\n/g;
		s/\0/\\0/g;
	}
	return $v;
}

sub unescape_dump {
	my $v = shift;
	return undef if $v eq "\0";
	for ($v) {
		s/(?:(?<=[^\\])|^)((?:\\\\)*)\\0/$1\0/g;
		s/(?:(?<=[^\\])|^)((?:\\\\)*)\\n/$1\n/g;
		s/(?:(?<=[^\\])|^)((?:\\\\)*)\\t/$1\t/g;
		s/\\\\/\\/g;
	}
	return $v;
}

sub parse_row_dump {
	my $row = shift;
	return undef unless $row =~ s/^\t//;
	chomp $row;
	my @row = split /\t/, $row, -1;
	for (@row) { $_ = unescape_dump($_); }
	return \@row;
}

sub format_row_dump {
	my $ra_row = shift;
	return defined $ra_row
		? (join '', (map {"\t".escape_dump($_)} @$ra_row)) . "\n"
		: undef;
}

# returns a function which can be used to output text to a file
# parameter may be a filehandle ref, or nothing (for STDOUT), '>filename' or '>>filename'
sub write_file {
	my $filespec = shift;
	my $fh;
	
	$filespec = \*STDOUT unless defined $filespec;
	
	if (ref $filespec) {
		$fh = $filespec;
	} else {
		$fh = new IO::File($filespec)
			or die "cannot open $filespec: $!";
	}
	
	return sub {
		my $line = shift;
		defined $line
			? print $fh $line
			: undef $fh; # or close $file
	}
}

sub read_file {
	my $filespec = shift || \*STDIN;
	
	my $fh;
	if (ref $filespec) {
		$fh = $filespec;
	} else {
		$fh = new IO::File($filespec)
		or die "cannot open $filespec: $!";
	}
	
	return sub { scalar(<$fh>) }
}

sub write_dump {
	my $text_writer = shift;
	
	return sub {
		my $row = shift;
		&$text_writer(format_row_dump($row));
	}
}

# read_dump ignores comments (lines that don't start with a tab)
sub read_dump {
	my $text_reader = shift;
	
	return sub {
		my ($line, $row);
		do {
			$line = &$text_reader;
			return undef unless defined $line;
			$row = parse_row_dump($line);
		} until (defined $row);
		
		return $row;
	}
}

sub write_dump_file {
	my ($table, $fields) = @_;
	my $dump_writer = write_dump(write_file(">$table"));
	&$dump_writer($fields);
	return $dump_writer;
}

sub read_dump_file {
	my ($table) = @_;
	my $text_reader = read_file("<$table");
	my $dump_reader = read_dump($text_reader);
	return $dump_reader, &$dump_reader;
}

#-----------------------------------------------------------------------
# check nulls in one table
#-----------------------------------------------------------------------

sub print_header {
	my $s = "table";
        print $s, ' 'x(20-length $s);
	print "\t\t\t  rows\n";
	$s = "field";
        print "   ", ' 'x(20-length $s), $s;
	$s = "type";
        print $s, ' 'x(12-length $s), "   ";
	for $s ("null", "blank", "zero") {
        	print "\t$s";
	}
       	print "\n";
}

sub check_nulls {
	my ($table, $dumpfile) = @_;

	print "\n", $table->name;
	print ' 'x(20-length($table->name));

	my @field_names = $table->field_names;
	my @fields = $table->fields;
	my $n_fields = @fields;

	my ($reader, $df_field_names) = read_dump_file($dumpfile);
	fatal "\tschema does not match dumpfile" if (join "\0", @field_names) ne (join "\0", @$df_field_names);

	my @null_counts = (0) x $n_fields;
	my @blank_counts = (0) x $n_fields;
	my @zero_counts = (0) x $n_fields;

	# the main loop to convert the table
	my $count = 0;
	my $row;
	my $v;

	while(1) {
		$row = &$reader;
		last unless defined $row;

		for (my $i = 0; $i < $n_fields; ++$i) {
			$v = $row->[$i];
			if (!defined $v) {
				$null_counts[$i]++;
			} elsif ($v eq '') {
				$blank_counts[$i]++;
			} elsif ($v =~ /^0(\.0+)?$/) {
				$zero_counts[$i]++;
			}
		}
		$count ++;
	}

	print "\t\t\t";
	print $count == 0 ? "*" : " ";
	print " $count\n";

	for (my $i = 0; $i < $n_fields; ++$i) {
		print ' 'x(20-length($field_names[$i])), $field_names[$i], "  ";
		my $sql_type = $fields[$i]->sql_type;
		print $fields[$i]->null ? "   " : "NN ";
		print $sql_type, ' 'x(12-length( $sql_type ));
		for ($null_counts[$i], $blank_counts[$i], $zero_counts[$i]) {
			if ($_ == $count) { $_ = "* $_"; }
			elsif ($_ > $count / 2) { $_ = "- ".($count - $_); }
			else { $_ = "  $_"; }
			print "\t$_";
		}
		print "\n";
	}
}

sub help {
	print <<End;
$USAGE

  This program examines a database dump produced by dbidump and dbischema,
  and outputs statistics on the number of NULL, 0 and "" values
  in each field.

  The arguments are:
    - a directory containing .dump files output by dbidump
    - a "real" schema output by dbischema parse
    - the name of the database (a dbischema file can describe many databases)

  This program does not touch the actual database, or modify the files,
  it just produces a report.
End
	exit 0;
}
