--- loncom/xml/lonxml.pm 2005/01/28 21:08:45 1.353 +++ loncom/xml/lonxml.pm 2005/01/30 12:56:45 1.354 @@ -1,7 +1,7 @@ # The LearningOnline Network with CAPA # XML Parser Module # -# $Id: lonxml.pm,v 1.353 2005/01/28 21:08:45 albertel Exp $ +# $Id: lonxml.pm,v 1.354 2005/01/30 12:56:45 www Exp $ # # Copyright Michigan State University Board of Trustees # @@ -368,23 +368,28 @@ sub xmlparse { sub htmlclean { my ($raw,$full)=@_; +# Take care of CRLF etc - my $tree = HTML::TreeBuilder->new; - $tree->ignore_unknown(0); - - $tree->parse($raw); - - my $output= $tree->as_HTML(undef,' '); - - $output=~s/\<(br|hr|img|meta|allow)(.*?)\>/\<$1$2 \/\>/gis; - $output=~s/\<\/(br|hr|img|meta|allow)\>//gis; + $raw=~s/\r\f/\n/gs; $raw=~s/\f\r/\n/gs; + $raw=~s/\r\n/\n/gs; $raw=~s/\n\r/\n/gs; + $raw=~s/\f/\n/gs; $raw=~s/\r/\n/gs; + $raw=~s/\&\#10\;/\n/gs; $raw=~s/\&\#13\;/\n/gs; + +# Generate empty tags, remove wrong end tags + $raw=~s/\<(br|hr|img|meta|allow|basefont)([^\>\/]*?)\>/\<$1$2 \/\>/gis; + $raw=~s/\<\/(br|hr|img|meta|allow|basefont)\>//gis; unless ($full) { - $output=~s/\<[\/]*(body|head|html)\>//gis; + $raw=~s/\<[\/]*(body|head|html)\>//gis; } - - $tree = $tree->delete; - - return $output; +# Make standard tags lowercase + foreach ('html','body','head','meta','h1','h2','h3','h4','b','i','m', + 'table','tr','td','th','p','br','hr','img','embed','font', + 'a','strong','center','title','basefont') { + $raw=~s/\<$_\s*\>/\<$_\>/gis; + $raw=~s/\<\/$_\s*\>/<\/$_\>/gis; + $raw=~s/\<$_\s([^\>]*)\>/<$_ $1\>/gis; + } + return $raw; } sub latex_special_symbols {