#!/usr/bin/perl

# The LearningOnline Network with CAPA
#
# filecompare.pl - script used to help probe and compare file statistics
#
# YEAR=2001
# 9/27, 10/24, 10/25, 11/4 Scott Harrison
# 11/14 Guy Albertelli
# 11/16,11/17 Scott Harrison
#
# $Id: filecompare.pl,v 1.8 2001/11/17 23:00:10 harris41 Exp $
###

###############################################################################
##                                                                           ##
## ORGANIZATION OF THIS PERL SCRIPT                                          ##
##                                                                           ##
## 1. Invocation                                                             ##
## 2. Notes                                                                  ##
## 3. Dependencies                                                           ##
## 4. Process command line arguments                                         ##
## 5. Process file/dir location arguments                                    ##
## 6. Process comparison restrictions                                        ##
## 7. Define output and measure subroutines                                  ##
## 8. Loop through files and calculate differences                           ##
## 9. Subroutines                                                            ##
## 10. POD (plain old documentation, CPAN style)                             ##
##                                                                           ##
###############################################################################

# ------------------------------------------------------------------ Invocation
# Usage text.  It is printed whenever the command line cannot be understood
# (no arguments, unknown option, missing locations, bad restriction).
my $invocation=<<"END";
filecompare.pl [ options ... ] [FILE1] [FILE2] [ restrictions ... ]
or
filecompare.pl [ options ... ] [DIR1] [DIR2] [ restrictions ... ]

Restrictions: a list of space separated values (after the file/dir names)
can restrict the comparison.
These values can be: existence, cvstime, age, md5sum, size, lines,
and/or diffs.

Options (before file/dir names):
-p show all files that have the same comparison
-n show all files that have different comparisons
-a show all files (with comparisons)
-q only show file names (based on first file/dir)
-v verbose mode (default)
-bN buildmode (controls exit code of this script; 0 unless...)
   N=1: md5sum=same --> 1; cvstime<0 --> 2
   N=2: same as N=1 except without md5sum
   N=3: md5sum=same --> 1; age<0 --> 2
   N=4: cvstime>0 --> 2
END

# Called with no arguments at all: show the usage text on STDOUT and exit
# with status 1.
if (!@ARGV) {
    print($invocation);
    exit(1);
}

# ----------------------------------------------------------------------- Notes
#
# What are all the different ways to compare two files and how to look
# at the differences?
#
# Ways of comparison:
#   existence similarity
#   cvs time similarity (1st arg treated as CVS source; only for buildmode)
#   age similarity (modification time)
#   md5sum similarity
#   size similarity (bytes)
#   line count difference
#   number of different lines
#
# Quantities of comparison:
#   existence (no,yes); other values become 'n/a'
#   cvstime in seconds
#   age in seconds
#   md5sum ("same" or "different")
#   size similarity (byte difference)
#   line count difference (integer)
#   number of different lines (integer)

# ---------------------------------------------------------------- Dependencies
# implementing from unix command line (assuming bash)
# md5sum, diff, wc -l

# ---------------------------------------------- Process command line arguments
# Flags (before file/dir names):
# -p show all files the same
# -n show all files different
# -a show all files (with comparisons)
# -q only show file names (based on first file/dir)
# -v verbose mode (default)
# -bN build/install mode (returns exitcode)
# Defaults: verbose output, show every file, build mode switched off.
my $verbose='1';
my $show='all';
my $buildmode=0;

# Consume leading "-x" style options.  The first argument that does not look
# like an option ends option processing and is left in @ARGV for the
# file/dir handling that follows.
ALOOP: while (@ARGV) {
    last ALOOP unless $ARGV[0]=~/^\-(\w)/;
    my $flag=$1;
    my $option=shift @ARGV;

    if ($flag eq 'b') {
        # -bN requires a digit.  Without this check a bare "-b" would leave
        # $1 holding the stale value from the previous match, which silently
        # set $buildmode to the string 'b'.
        unless ($option=~/^\-b(\d)/) {
            print($invocation);
            exit(1);
        }
        $buildmode=$1;
        next ALOOP;
    }

    if    ($flag eq 'q') { $verbose=0; }
    elsif ($flag eq 'v') { $verbose=1; }
    elsif ($flag eq 'p') { $show='same'; }
    elsif ($flag eq 'n') { $show='different'; }
    elsif ($flag eq 'a') { $show='all'; }
    else {
        # Unknown option: show usage and fail.
        print($invocation);
        exit(1);
    }
}

# Diagnostics go to STDERR via dowarn, which stays silent in build mode.
dowarn('Verbose: '.$verbose."\n");
dowarn('Show: '.$show."\n");

# ----------------------------------------- Process file/dir location arguments
# FILE1 FILE2 or DIR1 DIR2
# The next two arguments are the locations to compare.  @files holds the
# names to iterate over: the single file name in 'files' mode, or paths
# relative to $loc1 in 'directories' mode.
my $loc1=shift @ARGV;
my $loc2=shift @ARGV;
my $dirmode='directories';
my @files;
unless ($loc1 and $loc2) {
    print($invocation);
    exit(1);
}
if (-f $loc1) {
    $dirmode='files';
    @files=($loc1);
}
elsif (-e $loc1) {
    # Quote the location for the shell so that names containing spaces or
    # shell metacharacters are passed to find(1) as a single argument.
    (my $quoted=$loc1)=~s/'/'\\''/g;
    @files=`find '$quoted' -type f`;
    foreach my $found (@files) {
        chomp $found;
        # Make the path relative to $loc1.  \Q...\E matches the location
        # literally (it may contain regex metacharacters such as '.' or '+')
        # and "/*" copes with a location given with a trailing slash, where
        # find does not emit a second slash after the prefix.
        $found=~s/^\Q$loc1\E\/*//;
    }
}
else {
    # NOTE(review): $loc1 does not exist, yet $dirmode stays 'directories',
    # so the main loop will probe "$loc1/$loc1" -- confirm this is intended.
    @files=($loc1);
}
dowarn('Processing for mode: '.$dirmode."\n");
dowarn('Location #1: '.$loc1."\n");
dowarn('Location #2: '.$loc2."\n");

# --------------------------------------------- Process comparison restrictions
# A list of space separated values (after the file/dir names)
# can restrict the comparison.
# %rhash lists the recognized restriction names (the values are unused);
# %restrict collects the names actually requested on the command line.
my %rhash=('existence'=>0,'cvstime'=>0,'md5sum'=>0,'age'=>0,'size'=>0,
	      'lines'=>0,'diffs'=>0);
my %restrict;
while (@ARGV) {
    my $r=shift @ARGV;
    # Test for key presence.  A numeric comparison with 0 is also true for
    # the undef produced by an unknown name, so misspelled restrictions used
    # to be accepted silently instead of triggering the usage message.
    unless (exists $rhash{$r}) {
        print($invocation);
        exit(1);
    }
    $restrict{$r}=1;
}
if (%restrict) {
    dowarn('Restricting comparison to: '.
	 join(' ',keys %restrict)."\n");
}

# --------------------------------------- Define output and measure subroutines
# Output formatters, keyed by comparison name.  Each one prints
# "<name>: <value>" for its single argument to STDOUT and returns nothing
# (empty list), so a caller that prints the return value emits nothing extra.
# The closures are generated from one template; they read their argument as
# the scalar $_[0] rather than the one-element slice @_[0].
my %OUTPUT=map {
    my $label=$_;
    ($label => sub { print "${label}: $_[0]"; return; });
} ('existence','md5sum','cvstime','age','size','lines','diffs');

# Quote a path for interpolation into a shell command line, so that file
# names containing spaces or shell metacharacters reach the external tool
# as a single argument.
my $shellquote=sub {
    my ($path)=@_;
    $path=~s/'/'\\''/g;
    return "'".$path."'";
};

# Measurement routines, keyed by comparison name.  Every routine takes
# ($file1,$file2) and returns a two element list ($value1,$value2) holding
# the raw measurement for each file; the caller derives the difference.
my %MEASURE=(
    # 'yes' or 'no' for each file.
    'existence' => ( sub {
        my ($file1,$file2)=@_;
        my $rv1=(-e $file1)?'yes':'no';
        my $rv2=(-e $file2)?'yes':'no';
        return ($rv1,$rv2);
    } ),
    # Hex digest as printed by md5sum(1): the text before the first space.
    # The digest is returned whole; chopping it would drop its last digit.
    'md5sum' => ( sub {
        my ($file1,$file2)=@_;
        my ($rv1)=split(/ /,`md5sum @{[$shellquote->($file1)]}`);
        my ($rv2)=split(/ /,`md5sum @{[$shellquote->($file2)]}`);
        return ($rv1,$rv2);
    } ),
    # Epoch seconds: the timestamp recorded in CVS/Entries for $file1 (via
    # cvstime) and the modification time of $file2 (via utctime).
    'cvstime' => ( sub {
        my ($file1,$file2)=@_;
        my $rv1=cvstime($file1);
        my @a=stat($file2);
        my $gmt=gmtime($a[9]);
        my $rv2=utctime($gmt);
        return ($rv1,$rv2);
    } ),
    # Modification time (stat field 9) of each file.
    'age' => ( sub {
        my ($file1,$file2)=@_;
        my @a=stat($file1); my $rv1=$a[9];
        @a=stat($file2); my $rv2=$a[9];
        return ($rv1,$rv2);
    } ),
    # Size in bytes (stat field 7) of each file.
    'size' => ( sub {
        my ($file1,$file2)=@_;
        my @a=stat($file1); my $rv1=$a[7];
        @a=stat($file2); my $rv2=$a[7];
        return ($rv1,$rv2);
    } ),
    # Line count of each file as reported by wc(1).  Reading from stdin
    # makes wc print only the number, without the file name.
    'lines' => ( sub {
        my ($file1,$file2)=@_;
        my $rv1=`wc -l < @{[$shellquote->($file1)]}`;
        $rv1=~s/^\s+//; $rv1=~s/\s+$//;
        my $rv2=`wc -l < @{[$shellquote->($file2)]}`;
        $rv2=~s/^\s+//; $rv2=~s/\s+$//;
        return ($rv1,$rv2);
    } ),
    # Number of diff(1) output lines starting with '<' (only in $file1) and
    # with '>' (only in $file2).  diff runs once and both counts are taken
    # from the same output.
    'diffs' => ( sub {
        my ($file1,$file2)=@_;
        my @difflines=
            `diff @{[$shellquote->($file1)]} @{[$shellquote->($file2)]}`;
        my $rv1=grep { /^</ } @difflines;
        my $rv2=grep { /^>/ } @difflines;
        return ($rv1,$rv2);
    } ),
);

# For every candidate file: take the measurements, decide whether the file
# is to be shown, honor the build mode exit codes, and finally print.
FLOOP: foreach my $file (@files) {
    # In directory mode $file is relative to both locations; in file mode
    # the two locations are themselves the files to compare.
    my ($file1,$file2)=($loc1,$loc2);
    if ($dirmode eq 'directories') {
        $file1=$loc1.'/'.$file;
        $file2=$loc2.'/'.$file;
    }

    # ------------------------------------------------------------ measure
    my ($existence1,$existence2)=$MEASURE{'existence'}->($file1,$file2);
    my $existence=$existence1.':'.$existence2;
    # Everything stays 'n/a' unless both files exist.
    my ($cvstime,$md5sum,$age,$size,$lines,$diffs)=('n/a') x 6;
    if ($existence1 ne 'no' and $existence2 ne 'no') {
        # cvstime is only measured in build mode.
        if ($buildmode) {
            my ($cvstime1,$cvstime2)=$MEASURE{'cvstime'}->($file1,$file2);
            $cvstime=$cvstime1-$cvstime2;
        }
        my ($age1,$age2)=$MEASURE{'age'}->($file1,$file2);
        $age=$age1-$age2;
        my ($md5sum1,$md5sum2)=$MEASURE{'md5sum'}->($file1,$file2);
        if ($md5sum1 eq $md5sum2) {
            # Identical digests: skip the remaining measurements and report
            # zero differences.
            $md5sum='same';
            $size=0;
            $lines=0;
            $diffs='0:0';
        }
        else {
            $md5sum='different';
            my ($size1,$size2)=$MEASURE{'size'}->($file1,$file2);
            $size=$size1-$size2;
            my ($lines1,$lines2)=$MEASURE{'lines'}->($file1,$file2);
            $lines=$lines1-$lines2;
            my ($diffs1,$diffs2)=$MEASURE{'diffs'}->($file1,$file2);
            $diffs=$diffs1.':'.$diffs2;
        }
    }

    # ------------------------------------------------- decide what to show
    # True where a comparison reports a difference.  cvstime only counts in
    # build mode.  The numeric tests see 'n/a' as 0, i.e. as "no difference".
    # NOTE(review): confirm that is intended for missing files.
    my %differs=(
        'existence' => ($existence ne 'yes:yes'),
        'md5sum'    => ($md5sum ne 'same'),
        'cvstime'   => ($buildmode && $cvstime!=0),
        'age'       => ($age!=0),
        'size'      => ($size!=0),
        'lines'     => ($lines!=0),
        'diffs'     => ($diffs ne '0:0'),
    );
    # Only the restricted comparisons count; without restrictions all do.
    my @ks=(keys %restrict);
    unless (@ks) {
        @ks=('existence','cvstime','md5sum','age','size','lines','diffs');
    }
    my $mismatches=grep { $differs{$_} } @ks;

    # 'different' shows a file when any considered comparison differs,
    # 'same' when none does.  (The former length(@ks) based countdown
    # yielded the length of the stringified element count, so in build mode
    # 'same' selected files with exactly one difference.)
    my $showflag=0;
    if ($show eq 'all') {
        $showflag=1;
    }
    elsif ($show eq 'different') {
        $showflag=1 if $mismatches;
    }
    elsif ($show eq 'same') {
        $showflag=1 unless $mismatches;
    }

    # --------------------------------------------------- build mode exits
    # Build modes 1-4 end the script with an exit code derived from the
    # first file processed; nothing is printed for them.
    if ($buildmode==1) {
        if ($md5sum eq 'same') {
            exit(1);
        }
        elsif ($cvstime<0) {
            exit(2);
        }
        else {
            exit(0);
        }
    }
    elsif ($buildmode==2) {
        if ($cvstime<0) {
            exit(2);
        }
        else {
            exit(0);
        }
    }
    elsif ($buildmode==3) {
        if ($md5sum eq 'same') {
            exit(1);
        }
        elsif ($age<0) {
            exit(2);
        }
        else {
            exit(0);
        }
    }
    elsif ($buildmode==4) {
        if ($existence=~/no$/) {
            # second file is missing
            exit(3);
        }
        elsif ($cvstime>0) {
            exit(2);
        }
        elsif ($existence=~/^no/) {
            # first file is missing
            exit(1);
        }
        else {
            exit(0);
        }
    }

    # -------------------------------------------------------------- print
    if ($showflag) {
        print "$file";
        if ($verbose==1) {
            # Each formatter prints its own "name: value" text.
            my @columns=(
                ['existence',$existence],
                ['cvstime',$cvstime],
                ['age',$age],
                ['md5sum',$md5sum],
                ['size',$size],
                ['lines',$lines],
                ['diffs',$diffs],
            );
            foreach my $column (@columns) {
                print "\t";
                $OUTPUT{$column->[0]}->($column->[1]);
            }
        }
        print "\n";
    }
}

# ----------------------------------------------------------------- Subroutines

# cvstime($f)
#
# Look up the timestamp recorded for file $f in the CVS/Entries file of the
# directory containing $f and return it as seconds since the epoch.  The
# conversion is done with date(1), treating the timestamp as UTC.
#
# Returns 'n/a' without touching the file system when $buildmode is 3.
# Dies when CVS/Entries cannot be read or holds no entry for the file.
sub cvstime {
    my ($f)=@_;

    # Split into directory prefix (including the trailing slash) and the
    # bare file name; a name without any slash has an empty prefix.
    my ($path,$file)=('',$f);
    if ($f=~/^(.*\/)(.*?)$/) {
	($path,$file)=($1,$2);
    }

    my $cvstime;
    if ($buildmode!=3) {
	my $failure='*** ERROR *** cannot grep against '.$path.
	    'CVS/Entries for ' .$file . "\n";
	# Scan the Entries file in Perl and compare the name literally.
	# Passing the name to grep through the shell treated it as a
	# regular expression (so "foo.pl" also matched an entry "fooxpl")
	# and broke on names containing shell metacharacters.
	open(my $fh,'<',$path.'CVS/Entries') or die($failure);
	my $entry;
	while (my $line=<$fh>) {
	    # File entries look like "/name/revision/timestamp/...".
	    if (index($line,'/'.$file.'/')==0) {
		$entry=$line;
		last;
	    }
	}
	close($fh);
	die($failure) unless defined $entry;
	# Splitting on '/' gives ('',name,revision,timestamp,...).
	my @fields=split(/\//,$entry);
	$cvstime=`date -d '$fields[3] UTC' --utc +"%s"`;
	chomp $cvstime;
    }
    else {
	$cvstime='n/a';
    }
    return $cvstime;
}

# utctime($datestring)
#
# Hand $datestring, with " UTC" appended, to date(1) and return whatever it
# prints for the "%s" format (seconds since the epoch), minus the trailing
# newline.
sub utctime {
    my $datestring=shift;
    chomp(my $seconds=`date -d '$datestring UTC' --utc +"%s"`);
    return $seconds;
}

# dowarn($msg)
#
# Emit $msg with warn (i.e. on STDERR), except in build mode, where the
# script stays quiet.
sub dowarn {
    my $text=shift;
    $buildmode or warn($text);
}

# ----------------------------------- POD (plain old documentation, CPAN style)

=head1 NAME

filecompare.pl - script used to help probe and compare file statistics

=head1 SYNOPSIS

filecompare.pl [ options ... ] [FILE1] [FILE2] [ restrictions ... ]

or

filecompare.pl [ options ... ] [DIR1] [DIR2] [ restrictions ... ]

Restrictions: a list of space separated values (after the file/dir names)
can restrict the comparison.
These values can be: existence, cvstime, age, md5sum, size, lines,
and/or diffs.

Options (before file/dir names):

 -p show all files that have the same comparison

 -n show all files that have different comparisons

 -a show all files (with comparisons)

 -q only show file names (based on first file/dir)

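 -v verbose mode (default)

 -bN buildmode (controls exit code of this script; 0 unless...)
    N=1: md5sum=same --> 1; cvstime<0 --> 2
    N=2: same as N=1 except without md5sum
    N=3: md5sum=same --> 1; age<0 --> 2
    N=4: second file missing --> 3; cvstime>0 --> 2; first file missing --> 1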

=head1 DESCRIPTION

filecompare.pl can work in two modes: file comparison mode, or directory
comparison mode.

Comparisons can be a function of:

=over 4

=item * existence similarity

=item * cvs time similarity (first argument treated as CVS source)

=item * age similarity (modification time)

=item * md5sum similarity

=item * size similarity (bytes)

=item * line count difference

=item * number of different lines

=back

filecompare.pl integrates smoothly with the LPML installation language
(linux packaging markup language).  filecompare.pl is a tool that can
be used for safe CVS source-to-target installations.

=head1 README

filecompare.pl integrates smoothly with the LPML installation language
(linux packaging markup language).  filecompare.pl is a tool that can
be used for safe CVS source-to-target installations.

The unique identifier is considered to be the file name(s) independent
of the directory path.

=head1 PREREQUISITES

=head1 COREQUISITES

=head1 OSNAMES

linux

=head1 SCRIPT CATEGORIES

Packaging/Administrative

=cut