--- loncom/build/lpml_parse.pl	2002/04/08 12:51:03	1.44
+++ loncom/build/lpml_parse.pl	2002/04/13 19:29:32	1.45
@@ -4,11 +4,15 @@
 # Run "perldoc ./lpml_parse.pl" in order to best view the software
 # documentation internalized in this program.
 
+# --------------------------------------------------------- Distribution notice
+# This script is distributed with the LPML software project available at
+# http://lpml.sourceforge.net
+
 # --------------------------------------------------------- License Information
 # The LearningOnline Network with CAPA
 # lpml_parse.pl - Linux Packaging Markup Language parser
 #
-# $Id: lpml_parse.pl,v 1.44 2002/04/08 12:51:03 harris41 Exp $
+# $Id: lpml_parse.pl,v 1.45 2002/04/13 19:29:32 harris41 Exp $
 #
 # Written by Scott Harrison, codeharrison@yahoo.com
 #
@@ -42,7 +46,7 @@
 # 11/4,11/5,11/6,11/7,11/16,11/17 - Scott Harrison
 # 12/2,12/3,12/4,12/5,12/6,12/13,12/19,12/29 - Scott Harrison
 # YEAR=2002
-# 1/8,1/9,1/29,1/31,2/5,3/21,4/8 - Scott Harrison
+# 1/8,1/9,1/29,1/31,2/5,3/21,4/8,4/12 - Scott Harrison
 #
 ###
 
@@ -63,8 +67,7 @@
 #
 # I am using a multiple pass-through approach to parsing
 # the lpml file.  This saves memory and makes sure the server
-# will never be overloaded.  At some point, I expect the
-# first two steps will be implemented with my XFML
+# will never be overloaded.
 #
 # This is meant to parse files meeting the lpml document type.
 # See lpml.dtd.  LPML=Linux Packaging Markup Language.
@@ -74,9 +77,12 @@ use HTML::TokeParser;
 my $usage=<<END;
 **** ERROR ERROR ERROR ERROR ****
 Usage is for lpml file to come in through standard input.
-1st argument is the mode of parsing.
-2nd argument is the category permissions to use (runtime or development)
-3rd argument is the distribution (default,redhat6.2,debian2.2,redhat7.1,etc).
+1st argument is the mode of parsing:
+    install,configinstall,build,rpm,dpkg,htmldoc,textdoc,status
+2nd argument is the category permissions to use:
+    typical choices: runtime,development
+3rd argument is the distribution:
+    typical choices: default,redhat6.2,debian2.2,redhat7
 4th argument is to manually specify a sourceroot.
 5th argument is to manually specify a targetroot.
 
@@ -86,6 +92,8 @@ Example:
 
 cat ../../doc/loncapafiles.lpml |\\
 perl lpml_parse.pl html development default /home/sherbert/loncapa /tmp/install
+
+For more information, type "perldoc lpml_parse.pl".
 END
 
 # ------------------------------------------------- Grab command line arguments
@@ -121,14 +129,14 @@ if (@ARGV) {
 if (@ARGV) {
     $targetroot = shift @ARGV;
 }
-$sourceroot=~s/\/$//;
-$targetroot=~s/\/$//;
+$sourceroot=~s/\/$//; # remove trailing directory slash
+$targetroot=~s/\/$//; # remove trailing directory slash
 $sourcerootarg=$sourceroot;
 $targetrootarg=$targetroot;
 
 my $logcmd='| tee -a WARNINGS';
 
-my $invocation;
+my $invocation; # Record how the program was invoked
 # --------------------------------------------------- Record program invocation
 if ($mode eq 'install' or $mode eq 'configinstall' or $mode eq 'build') {
     $invocation=(<<END);
@@ -141,14 +149,13 @@ if ($mode eq 'install' or $mode eq 'conf
 END
 }
 
-# ---------------------------------------------------- Start first pass through
-my @parsecontents = <>;
-my $parsestring = join('',@parsecontents);
-my $outstring='';
+# -------------------------- Start first pass through (just gather information)
+my @parsecontents=<>;
+my $parsestring=join('',@parsecontents);
 
 # Need to make a pass through and figure out what defaults are
-# overrided.  Top-down overriding strategy (leaves don't know
-# about distant leaves).
+# overridden.  Top-down overriding strategy (tree leaves don't know
+# about distant tree leaves).
 
 my @hierarchy;
 $hierarchy[0]=0;
@@ -157,20 +164,32 @@ my $token;
 $parser = HTML::TokeParser->new(\$parsestring) or
     die('can\'t create TokeParser object');
 $parser->xml_mode('1');
-my %hash;
-my $key='';
-while ($token = $parser->get_token()) {
+my %setting;
+
+# Values for the %setting hash
+my $defaultset=1; # a default setting exists for a key
+my $distset=2; # a distribution setting exists for a key
+               # (overrides default setting)
+
+my $key=''; # this is a unique key identifier (the token name with its
+            # coordinates inside the hierarchy)
+while ($token = $parser->get_token()) { # navigate through $parsestring
     if ($token->[0] eq 'S') {
 	$hloc++;
 	$hierarchy[$hloc]++;
 	$key=$token->[1].join(',',@hierarchy[0..($hloc-1)]);
 	my $thisdist=' '.$token->[2]{'dist'}.' ';
 	if ($thisdist eq ' default ') {
-	    $hash{$key}=1; # there is a default setting for this key
+	    $setting{$key}=$defaultset;
 	}
-	elsif ($dist && $hash{$key}==1 && $thisdist=~/\s$dist\s/) {
-	    $hash{$key}=2; # disregard default setting for this key if
-	                   # there is a directly requested distribution match
+	elsif (length($dist)>0 &&
+	       $setting{$key}==$defaultset &&
+	       $thisdist=~/\s$dist\s/) {
+	    $setting{$key}=$distset;
+                   # disregard default setting for this key if
+                   # there is a directly requested distribution match
+                   # (in other words, there must first be a default
+	           # setting for a key in order for it to be overridden)
 	}
     }
     if ($token->[0] eq 'E') {
@@ -178,57 +197,79 @@ while ($token = $parser->get_token()) {
     }
 }
 
-# --------------------------------------------------- Start second pass through
+# - Start second pass through (clean up the string to allow for easy rendering)
+
+# The string is cleaned up so that there is no white-space surrounding any
+# XML tag.  White-space inside text 'T' elements is preserved.
+
+# Clear up memory
 undef($hloc);
 undef(@hierarchy);
 undef($parser);
-$hierarchy[0]=0;
+$hierarchy[0]=0; # initialize hierarchy
 $parser = HTML::TokeParser->new(\$parsestring) or
     die('can\'t create TokeParser object');
 $parser->xml_mode('1');
-my $cleanstring;
-while ($token = $parser->get_token()) {
-    if ($token->[0] eq 'S') {
+my $cleanstring; # contains the output of the second step
+while ($token = $parser->get_token()) { # navigate through $parsestring
+    if ($token->[0] eq 'S') { # a start tag
 	$hloc++;
 	$hierarchy[$hloc]++;
 	$key=$token->[1].join(',',@hierarchy[0..($hloc-1)]);
-	my $thisdist=' '.$token->[2]{'dist'}.' ';
+
+	# Surround tagdist (the dist attribute of an XML tag)
+	# with white-space to allow for uniform searching a few
+	# lines below here.
+	my $tagdist=' '.$token->[2]{'dist'}.' ';
+
 	# This conditional clause is set up to ignore two sets
 	# of invalid conditions before accepting entry into
-	# the cleanstring.
-	if ($hash{$key}==2 and
-	    !($thisdist eq '  ' or $thisdist =~/\s$dist\s/)) {
+	# $cleanstring.
+
+	# Condition #1: Ignore this part of the string if the tag
+	# has a superior distribution-specific setting and the tag
+	# being evaluated has a dist setting other than blank
+	# or $dist.
+	if ($setting{$key}==$distset and
+	    !($tagdist eq '  ' or $tagdist =~/\s$dist\s/)) {
 	    if ($token->[4]!~/\/>$/) {
 		$parser->get_tag('/'.$token->[1]);
 		$hloc--;
 	    }
 	}
-	elsif ($thisdist ne '  ' and $thisdist!~/\s$dist\s/ and
-	       !($thisdist eq ' default ' and $hash{$key}!=2)) {
+	# Condition #2: Ignore this part of the string if the tag's dist
+	# attribute is not blank and does not match $dist, unless the tag
+	# is dist=default and the key has no $dist-specific setting
+	# (in which case the default is kept).
+	elsif ($tagdist ne '  ' and $tagdist!~/\s$dist\s/ and
+	       !($tagdist eq ' default ' and $setting{$key}!=$distset)) {
 	    if ($token->[4]!~/\/>$/) {
 		$parser->get_tag('/'.$token->[1]);
 		$hloc--;
 	    }
 	}
+	# In other words, output to $cleanstring if the tag is dist=default
+	# (and not overridden), or if the tag's dist matches $dist.  And, always
+	# output when no dist attribute is present.
 	else {
 	    $cleanstring.=$token->[4];
 	}
-	if ($token->[4]=~/\/>$/) {
-#	    $hloc--;
-	}
     }
-    if ($token->[0] eq 'E') {
+    # Note: this loop DOES work with <tag /> style markup as well as
+    # <tag></tag> style markup since I always check for $token->[4] ending
+    # with "/>".
+    if ($token->[0] eq 'E') { # an end tag
 	$cleanstring.=$token->[2];
 	$hloc--;
     }
-    if ($token->[0] eq 'T') {
+    if ($token->[0] eq 'T') { # text contents inside tags
 	$cleanstring.=$token->[1];
     }
 }
 $cleanstring=&trim($cleanstring);
 $cleanstring=~s/\>\s*\n\s*\</\>\</g;
 
-# ---------------------------------------------------- Start final pass through
+# -------------------------------------------- Start final (third) pass through
 
 # storage variables
 my $lpml;
@@ -1559,11 +1600,13 @@ sub trim {
 
 =head1 NAME
 
-lpml_parse.pl - This is meant to parse LPML files (Linux Packaging Markup Language)
+lpml_parse.pl - This is meant to parse files meeting the lpml document type.
 
 =head1 SYNOPSIS
 
-Usage is for lpml file to come in through standard input.
+<STDIN> | perl lpml_parse.pl <MODE> <CATEGORY> <DIST> <SOURCE> <TARGET>
+
+Usage is for the lpml file to come in through standard input.
 
 =over 4
 
@@ -1595,19 +1638,57 @@ Only the 1st argument is mandatory for t
 Example:
 
 cat ../../doc/loncapafiles.lpml |\\
-perl lpml_parse.pl html default /home/sherbert/loncapa /tmp/install
+perl lpml_parse.pl html runtime default /home/sherbert/loncapa /tmp/install
 
 =head1 DESCRIPTION
 
-I am using a multiple pass-through approach to parsing
-the lpml file.  This saves memory and makes sure the server
-will never be overloaded.
+The general flow of the script is to get command line arguments, run through
+the XML document three times, and output according to any desired mode:
+install, configinstall, build, rpm, dpkg, htmldoc, textdoc, and status.
+
+A number of coding decisions are made according to the following principle:
+installation software must be stand-alone.  Therefore, for instance, I try
+not to use the Getopt::Long module or any other perl modules.  (I do however
+use HTML::TokeParser.)  I also have tried to keep all the MODES of
+parsing inside this file.  Therefore, format_TAG subroutines are fairly
+lengthy with their conditional logic.  A more "elegant" solution might
+be to dynamically register the parsing mode and subroutines, or maybe even work
+with stylesheets.  However, in order to make this the installation back-bone
+of choice, there are advantages for HAVING EVERYTHING IN ONE FILE.
+This way, the LPML installation software does not have to rely on OTHER
+installation software (a chicken-and-egg problem).  Besides, I would
+suggest the modes of parsing are fairly constant: install, configinstall,
+build, rpm, dpkg, htmldoc, textdoc, and status.
+
+Another coding decision is about using a multiple pass-through approach to
+parsing the lpml file.  This saves memory and makes sure the server will never
+be overloaded.  During the first pass-through, the script gathers the
+information needed to resolve which tags with which 'dist=' attributes are used.
+During the second pass-through, the script cleans up white-space surrounding
+the XML tags, and filters through the tags based on information regarding the
+'dist=' attributes (information gathered in the first pass-through).
+The third and final pass-through involves formatting and rendering the XML
+into whatever output mode is chosen: install, configinstall, build, rpm, dpkg,
+htmldoc, textdoc, and status.
+
+The hierarchy mandated by the DTD does not always correspond to the hierarchy
+that is sensible for a Makefile.  For instance, in a Makefile it is sensible
+that soft-links are installed after files.  However, in an LPML document, it
+is sensible that files and links be considered together and the writer of the
+LPML document should be free to place things in whatever order makes best
+sense in terms of LOOKING at the information.  The complication that arises
+is that the parser needs to have a memory for passing values from
+leaves on the XML tree to higher-up branches.  Currently, this memory is
+hard-coded (like with the @links array), but it may benefit from a more
+formal approach in the future.
 
 =head1 README
 
-I am using a multiple pass-through approach to parsing
-the lpml file.  This saves memory and makes sure the server
-will never be overloaded.
+This parses an LPML file to generate information useful for
+source to target installation, compilation, filesystem status
+checking, RPM and Debian software packaging, and documentation.
+
+More information on LPML is available at http://lpml.sourceforge.net.
 
 =head1 PREREQUISITES
 
@@ -1621,7 +1702,7 @@ linux
 
 =head1 SCRIPT CATEGORIES
 
-Packaging/Administrative
+UNIX/System_administration
 
 =head1 AUTHOR