#!/usr/bin/perl -w

use strict;

=pod

=head1 NAME

B<nsum> - sum up numbers from text files


=head1 SYNOPSIS

B<nsum> [I<options>] [I<files> ...]


=head1 DESCRIPTION

B<nsum> is a more sophisticated replacement for Suso Banderas' B<numsum>.  It
adds up numbers embedded in text files, discarding all non-numerical data.  In
particular it can add up tabulated ASCII numerical data separated by commas or
white space.

B<nsum> processes the files given on the command line, or standard input if the
first file argument is "-" or none is given.  The options B<-f>, B<-g> and
B<-i> determine which kind of numbers B<nsum> looks for - floating point in
decimal and optionally exponential notation or just integers.  Most other
options control its output - just the total, or row and/or column sums and
optionally partial sums.  The B<-s> option gives you mean and standard
deviation instead of sums.

The current version uses compensated (Kahan) summation for sums and the
Knuth/Welford algorithm for computing the mean and standard deviation to
minimise numerical errors.


=head1 OPTIONS

=over

=item B<-a>

Print requested sums for all files separately, instead of a grand total at the
end.  This applies to totals (B<-t>) and column sums (B<-c>).  It has no effect
when reading from stdin.  B<+a> disables this feature.

=item B<-c>

Print column sums, separated by tabulator characters.  B<+c> disables this
feature.

=item B<-f>

Sum up floating-point numbers in decimal (not exponential) notation.  "e" and
"E" are regarded as part of record separators.

=item B<-g>

Sum up floating-point numbers in decimal or exponential notation.  This is the
default.

=item B<-i>

Sum up integers.  Period characters are taken as part of record separators, not
decimal points.

=item B<-I>

Print input data, without chaff but in its original format.  If B<-p> is also
given, input columns and cumulative sums alternate, starting with input data.
Columns which are not present in a line but for which partial sums exist are
printed as 0 to keep the alignment predictable, but do not count as a zero
datum towards the mean for B<-s>.  B<+I> disables this feature.

=item B<-p>

Print partial column sums as columns are added up.  If B<-a> is given, the
partial sums are file by file, otherwise they are cumulative across files.
If B<-s> is given, the mean and standard deviation of each column's data so far
is printed instead of the partial sum.  B<+p> disables this feature.

=item B<-r>

Print row sums.  For rows which do not contain any numbers, an empty line (not
0) is output.  If B<-p> is also given, the row sum is the last column output
(or for B<-s>, the last two columns), after the partial column sums.  B<+r>
disables this feature.

=item B<-s>

Instead of each sum, print mean and standard deviation.  Affects all of B<-c>,
B<-p>, B<-r> and B<-t>.  Two values will be printed everywhere in place of the
single sum.  B<+s> disables this feature.

=item B<-t>

Print overall total.  This is enabled by default.  B<+t> disables it.

=back


=head1 FREQUENTLY ANTICIPATED QUESTIONS (FAQ)

=over

=item Why can't I set the record separator?

Because B<nsum> ignores all non-numerical data anyway, this is superfluous.

=item Why can't I tell B<nsum> to print specific row/column sums only?

Because you can use B<sed> to pick out rows and B<awk> or B<cut> to select
columns.

=item Why can't B<nsum> print the integer/fractional part of its sum as B<numsum> can?

Because B<awk> can do that after B<nsum> has printed the sum. (C<int()> resp.
C<...% 1>)

=item How can I select the complete contiguous output of B<nsum> after pasting something into its stdin?

Use B<reservoir> by Simon Tatham,
L<http://www.chiark.greenend.org.uk/~sgtatham/utils/>.

=back

You get the drift.


=head1 SEE ALSO

B<numsum>(1), B<numaverage>(1), B<awk>(1), B<sed>(1), B<cut>(1)


=head1 COPYLEFT

B<nsum> is Copyright (c) 2009-2010 Volker Schatz.  It may be copied and/or
modified under the same terms as Perl.

=cut


my $numopts= qr/[ifg]/;
my $modeopts= qr/[acIprst]/;

my $numfmt= "g";
my %mode= ( "t" => 1 );

while( @ARGV && $ARGV[0] =~ /^[-+]./ ) {
  my $opt= shift @ARGV;
  while( length($opt) > 1 )
  {
    if( $opt =~ /^-($numopts)/o ) {
      $numfmt= $1;
    }
    elsif( $opt =~ /^-($modeopts)/o ) {
      $mode{$1}= 1;
    }
    elsif( $opt =~ /^\+($modeopts)/o ) {
      $mode{$1}= 0;
    }
    else {
      print STDERR "nsum: Unknown option ", substr($opt,0,2), ".\n" unless $opt =~ /^-h/;
      print STDERR <<EOF;
Usage: nsum [options] [files ...]
Sums up numbers from text files, ignoring the rest.
The most important options:
-c	print column sums
-f	sum up floating-point numbers in decimal notation
-g	sum up floating-point numbers in decimal or exponential notation
-i	sum up integers
-r	print row sums
-t	print total
Extract documentation with pod2man \`which nsum\` > nsum.1
EOF
      exit 1;
    }
    $opt =~ s/^-./-/;
    $opt =~ s/^\+./\+/;
  }
}

if( ! grep $_ =~ /^[cCIprt]$/ && $mode{$_}, keys %mode ) {
  print STDERR "nsum: All sums disabled, nothing to do.\n";
  exit 1;
}

# Shortcut for deciding if any sum is to be computed
$mode{"+"}= grep $_ =~ /^[cprt]$/ && $mode{$_}, keys %mode;
# Shortcut for deciding if anything is to be printed in each row
$mode{"-"}= grep $_ =~ /^[Ipr]$/ && $mode{$_}, keys %mode;

# Output field separator
my $sep= "\t";

# Nomenclature:
# Without -s: *sum sum of values, *aux error compensation variable
# With -s: *sum mean of values, *aux auxiliary sum for computing variance, *num
# number of values added up
# Grand totals:
my $tsum= 0;
my $taux= 0;
my $tnum= 0;
# Column sums:
my @csums;
my @cauxs;
my @cnums;

my %numregexes= ( "i" => qr/-?\d+/,  "f" => qr/-?(?:\d+(?:\.\d*)?|\.\d+)/,
		"g" => qr/-?(?:\d+(?:\.\d*)?|\.\d+)(?:e[-+]?\d+)?/i );

my $nre= $numregexes{$numfmt};


# Add a new value to a sum using compensated (Kahan) summation.
# -> New value
#    Reference to sum
#    Reference to error carry
sub addtosum
{
  my ($val, $sum, $delta)= @_;

  $val -= $$delta;
  my $oldsum= $$sum;
  $$sum += $val;
  $$delta= ($$sum - $oldsum) - $val;
}


# Add another value to the running statistical variables using the algorithm by
# Knuth and Welford.  The running accumulation variables have to be initialised
# to zero before the first call.  The variance is sqrt( auxiliary variable /
# the number of values). 
# -> New value
#    Reference to mean of values
#    Reference to auxiliary variable for variance
#    Reference to number of values
sub addtostat
{
  my ($val, $mean, $m2, $num)= @_;

  ++$$num;
  my $delta= $val - $$mean;
  $$mean += $delta / $$num;
  $$m2 += $delta * ($val - $$mean);
}


# Add the elements of one array to another, extending the destination as
# needed.  If -s is active, the source values are added to the three running
# statistical variable arrays.
# -> Reference to source array
#    Reference to destination array / array of means (for -s)
#    Reference to array of variables for error compensation / variance (if -s)
#    Reference to array containing counts (if -s is active)
sub addary
{
  my ($src, $dest, $aux, $cnt)= @_;

  if( $#$src > $#$dest ) {
    my $news= $#$src - $#$dest;
    push @$dest, (0) x $news;
    push @$aux, (0) x $news;
    push @$cnt, (0) x $news;
  }
  if( $mode{"s"} ) {
    for my $ind (0..$#$src) {
      addtostat $$src[$ind], \$$dest[$ind], \$$aux[$ind], \$$cnt[$ind];
    }
  }
  else {
    for my $ind (0..$#$src) {
      addtosum $$src[$ind], \$$dest[$ind], \$$aux[$ind];
    }
  }
}


# Print sum or (for -s) mean and standard deviation, separated by space.
# -> Sum of values / mean of values (if -s is active)
#    Auxiliary variable for error compensation / variance (if -s is active)
#    Number of values (used if -s is active)
sub printsum
{
  my ($sum, $aux, $num)= @_;

  if( $mode{"s"} ) {
    print $sum, $sep, sqrt($aux/$num);
  }
  else {
    print $sum-$aux;
  }
}


# Print output data for one row.
sub printrow
{
  my ($in, $csums, $cauxs, $cnums, $rsum, $raux, $rnum)= @_;
  my $nextsep= "";

  if( defined($csums) ) {
    for my $ind (0..$#$csums) {
      if( defined($in) ) {
	print $$in[$ind] || 0, $sep;
      }
      printsum $$csums[$ind], $$cauxs[$ind], $$cnums[$ind];
      print $sep unless $ind == $#$csums;
    }
    $nextsep= $sep;
  }
  elsif( defined($in) ) {
    print join($sep, @$in);
    $nextsep= $sep;
  }
  if( defined($rsum) ) {
    print $nextsep;
    printsum $rsum, $raux, $rnum;
  }
  print "\n";
}


# Process one file.
# -> Reference to IO handle
sub procfile
{
  my ($IN)= @_;
  # File totals:
  my $fsum= 0;
  my $faux= 0;
  my $fnum= 0;
  my $fsumref= $mode{"a"}? \$fsum : \$tsum;
  my $fauxref= $mode{"a"}? \$faux : \$taux;
  my $fnumref= $mode{"a"}? \$fnum : \$tnum;
  # File column sums:
  my $csumref= $mode{"a"}? [] : \@csums;
  my $cauxref= $mode{"a"}? [] : \@cauxs;
  my $cnumref= $mode{"a"}? [] : \@cnums;
  my $addvalue= $mode{"s"}? \&addtostat : \&addtosum;

  while( <$IN> )
  {
    my @nums= /.*?($nre)/og;
    # Row sums:
    my $rsum= 0;
    my $raux= 0;
    my $rnum= 0;

    if( @nums )
    {
      if( $mode{"c"} || $mode{"p"} ) {
	addary \@nums, $csumref, $cauxref, $cnumref;
      }
      if( $mode{"r"} ) {
	for (@nums) {
	  &$addvalue($_, \$rsum, \$raux, \$rnum);
	}
      }
      if( $mode{"t"} ) {
	for (@nums) {
	  &$addvalue($_, $fsumref, $fauxref, $fnumref);
	}
      }
      printrow $mode{"I"}? \@nums : undef, 
	       $mode{"p"}? $csumref : undef, $cauxref, $cnumref,
	       $mode{"r"}? $rsum : undef, $raux, $rnum
		 if $mode{"-"};
    }
    elsif( $mode{"-"} ) {
      print "\n";
    }
  }
  if( $mode{"a"} ) {
    printrow undef, $csumref, $cauxref, $cnumref if $mode{"c"};
    if( $mode{"t"} ) {
      printsum $fsum, $faux, $fnum;
      print "\n";
    }
  }
}


if( !@ARGV || $ARGV[0] eq "-" ) {
  $mode{"a"}= 0;
  procfile *STDIN{IO};
}
else {
  for (@ARGV) {
    open( my $hdl, $_ );
    unless( $hdl ) {
      print STDERR "nsum: Cannot open `$_'.  Skipping.\n";
      next;
    }
    procfile $hdl;
    close $hdl;
  }
}

unless( $mode{"a"} ) {
  printrow undef, \@csums, \@cauxs, \@cnums if $mode{"c"};
  if( $mode{"t"} ) {
    printsum $tsum, $taux, $tnum;
    print "\n";
  }
}