File:  [LON-CAPA] / loncom / homework / cleanxml / html_to_xml.pm
Revision 1.1: download - view: text, annotated - select for diffs
Thu Dec 3 20:40:31 2015 UTC (6 years, 11 months ago) by damieng
Branches: MAIN
CVS tags: HEAD
integrated Daxe, opening in a separate window for now

    1: # The LearningOnline Network
    2: # Second step to clean a file.
    3: #
    4: # $Id: html_to_xml.pm,v 1.1 2015/12/03 20:40:31 damieng Exp $
    5: #
    6: # Copyright Michigan State University Board of Trustees
    7: #
    8: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
    9: #
   10: # LON-CAPA is free software; you can redistribute it and/or modify
   11: # it under the terms of the GNU General Public License as published by
   12: # the Free Software Foundation; either version 2 of the License, or
   13: # (at your option) any later version.
   14: #
   15: # LON-CAPA is distributed in the hope that it will be useful,
   16: # but WITHOUT ANY WARRANTY; without even the implied warranty of
   17: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   18: # GNU General Public License for more details.
   19: #
   20: # You should have received a copy of the GNU General Public License
   21: # along with LON-CAPA; if not, write to the Free Software
   22: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   23: #
   24: # /home/httpd/html/adm/gpl.txt
   25: #
   26: # http://www.lon-capa.org/
   27: #
   28: ###
   29: 
   30: #!/usr/bin/perl
   31: 
   32: 
   33: package Apache::html_to_xml;
   34: 
   35: use strict;
   36: use utf8;
   37: use warnings;
   38: use HTML::Parser ();
   39: 
   40: # always closing, end tags are ignored:
   41: my @empty = ('base','br','col','hr','img','input','keygen','link','meta','param','source','track','wbr', 'frame', 'embed','startouttext','endouttext');
   42: 
   43: #my @block_html = ('html','body','h1','h2','h3','h4','h5','h6','div','p','ul','ol','table','tbody','tr','td','th','dl','pre','noscript','blockquote','object','applet','embed','map','form','fieldset','iframe');
   44: 
   45: 
   46: my $result;
   47: my @stack;
   48: my $close_warning;
   49: my $warnings; # 1 = print warnings
   50: 
   51: 
   52: # This takes non-well-formed UTF-8 LC+HTML and returns well-formed but non-valid XML LC+XHTML.
   53: sub html_to_xml {
   54:   my($textref, $warn) = @_;
   55:   $warnings = $warn;
   56:   $result = '';
   57:   @stack = ();
   58:   $close_warning = '';
   59:   my $p = HTML::Parser->new( api_version => 3,
   60:                           start_h => [\&start, "tagname, attr, attrseq"],
   61:                           end_h   => [\&end,   "tagname"],
   62:                           text_h  => [\&text, "dtext"],
   63:                           comment_h  => [\&comment, "tokens"],
   64:                           declaration_h  => [\&declaration, "tokens"],
   65:                           process_h  => [\&process, "token0"],
   66:                         );
   67:   # NOTE: by default, the HTML parser turns all attribute and elements names to lowercase
   68:   $p->empty_element_tags(1);
   69:   $result .= "<?xml version='1.0' encoding='UTF-8'?>\n";
   70:   $p->parse($$textref);
   71:   for (my $i=scalar(@stack)-1; $i>=0; $i--) {
   72:     if ($close_warning ne '') {
   73:       $close_warning .= ', ';
   74:     }
   75:     $close_warning .= $stack[$i];
   76:     $result .= '</'.$stack[$i].'>';
   77:   }
   78:   if ($warnings && $close_warning ne '') {
   79:     print "Warning: the parser had to add closing tags to understand the document ($close_warning)\n";
   80:   }
   81:   return \$result;
   82: }
   83: 
   84: sub start {
   85:   my($tagname, $attr, $attrseq) = @_;
   86:   
   87:   # NOTE: we could do things more like web browsers, but I'm nore sure the result would be better with LON-CAPA files
   88:   # (in problem files there are not so many missing tags)
   89:   # See http://www.w3.org/TR/html5/syntax.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
   90:   
   91:   if ($tagname eq 'o:p') {
   92:     return;
   93:   }
   94:   
   95:   if ($tagname =~ /@.*\.[a-z]{2,3}$/) { # email <name@hostname>
   96:     $result .= "&lt;$tagname&gt;";
   97:     return;
   98:   }
   99:   
  100:   #$tagname = lc($tagname); this is done by default by the parser
  101:   $tagname = fix_tag($tagname);
  102:   if (scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'tr' && $tagname ne 'tr' && $tagname ne 'td' && $tagname ne 'th' &&
  103:       !string_in_array(['part','block','comment','endouttext','problemtype','standalone','startouttext','tex','translated','web','while','randomlist','font','b','form'], $tagname)) {
  104:     # NOTE: a 'part' or 'block' element between tr and td will not be valid, but changing tag order would make things worse
  105:     # font and b will be removed in post_xml, so we can leave it for now, to handle things like <TR><FONT FACE="Palatino"><TD...
  106:     # form is to avoid an empty form in some cases (it might not work anyway, but it is better to keep this bug the way it is)
  107:     if ($warnings) {
  108:       print "Warning: a <td> tag was added because a $tagname element was directly under a tr\n";
  109:     }
  110:     start('td');
  111:   }
  112:   if ($tagname eq 'p' && scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'p') {
  113:     end('p');
  114:   } elsif ($tagname eq 'li') {
  115:     my $ind_li = last_index_of(\@stack, 'li');
  116:     my $ind_ul = last_index_of(\@stack, 'ul');
  117:     my $ind_ol = last_index_of(\@stack, 'ol');
  118:     if ($ind_li != -1 && ($ind_ul == -1 || $ind_ul < $ind_li) && ($ind_ol == -1 || $ind_ol < $ind_li)) {
  119:       end('li');
  120:     }
  121:   } elsif ($tagname eq 'tr') {
  122:     my $ind_table = last_index_of(\@stack, 'table');
  123:     my $ind_tr = last_index_of(\@stack, 'tr');
  124:     if ($ind_tr != -1 && ($ind_table == -1 || $ind_table < $ind_tr)) {
  125:       end('tr');
  126:     }
  127:   } elsif ($tagname eq 'td' || $tagname eq 'th') {
  128:     my $ind_table = last_index_of(\@stack, 'table');
  129:     my $ind_td = last_index_of(\@stack, 'td');
  130:     my $ind_th = last_index_of(\@stack, 'th');
  131:     my $ind_tr = last_index_of(\@stack, 'tr');
  132:     if ($ind_tr == -1 || ($ind_table != -1 && $ind_table > $ind_tr)) {
  133:       start('tr');
  134:       $ind_tr = last_index_of(\@stack, 'tr');
  135:     }
  136:     if ($ind_td != -1 && $ind_tr < $ind_td) {
  137:       end('td');
  138:     } elsif ($ind_th != -1 && $ind_tr < $ind_th) {
  139:       end('th');
  140:     }
  141:   } elsif ($tagname eq 'dd' || $tagname eq 'dt') {
  142:     my $ind_dd = last_index_of(\@stack, 'dd');
  143:     my $ind_dt = last_index_of(\@stack, 'dt');
  144:     my $ind_dl = last_index_of(\@stack, 'dl');
  145:     if ($ind_dl == -1) {
  146:       start('dl');
  147:       $ind_dl = last_index_of(\@stack, 'dl');
  148:     }
  149:     if ($ind_dd != -1 && ($ind_dl == -1 || $ind_dl < $ind_dd)) {
  150:       end('dd');
  151:     } elsif ($ind_dt != -1 && ($ind_dl == -1 || $ind_dl < $ind_dt)) {
  152:       end('dt');
  153:     }
  154:   } elsif ($tagname eq 'option') {
  155:     my $ind_option = last_index_of(\@stack, 'option');
  156:     if ($ind_option != -1) {
  157:       end('option');
  158:     }
  159:   } elsif ($tagname eq 'area') {
  160:     my $ind_area = last_index_of(\@stack, 'area');
  161:     if ($ind_area != -1) {
  162:       end('area');
  163:     }
  164:   } elsif ($tagname eq 'a') {
  165:     my $ind_a = last_index_of(\@stack, 'a');
  166:     if ($ind_a != -1) {
  167:       end('a');
  168:     }
  169:   } elsif ($tagname eq 'num') {
  170:     my $ind_num = last_index_of(\@stack, 'num');
  171:     if ($ind_num != -1) {
  172:       end('num');
  173:     }
  174:   }
  175: 
  176: # HTML interpretation of non-closing elements and style is too complex (and error-prone, anyway).
  177: # Since LON-CAPA elements are all supposed to be closed, this interpretation is SGML-like instead.
  178: # Paragraphs inside paragraphs will be fixed later.
  179: 
  180: #   my @styles = ();
  181: #   if ($tagname eq 'p') {
  182: #     for (my $i=scalar(@stack)-1; $i>=0; $i--) {
  183: #       if ($stack[$i] eq 'p') {
  184: #         # save the styles
  185: #         for (my $j=$i+1; $j<scalar(@stack); $j++) {
  186: #           if (index_of(['b','i','em','strong','sub','sup'], $stack[$j]) != -1) {
  187: #             push(@styles, $stack[$j]);
  188: #           }
  189: #         }
  190: #         # close the p
  191: #         end('p');
  192: #         last;
  193: #       } elsif (index_of(\@block_html, $stack[$i]) != -1) {
  194: #         # stop looking
  195: #         last;
  196: #       }
  197: #     }
  198: #   }
  199:   $result .= '<'.$tagname;
  200:   my %seen = ();
  201:   foreach my $att_name (@$attrseq) {
  202:     my $att_name_modified = $att_name;
  203:     $att_name_modified =~ s/[^\-a-zA-Z0-9_:.]//g;
  204:     $att_name_modified =~ s/^[\-.0-9]*//;
  205:     if ($att_name_modified ne '' && index($att_name_modified, ':') == -1) {
  206:       if ($seen{$att_name_modified}) {
  207:         if ($warnings) {
  208:           print "Warning: Ignoring duplicate attribute: $att_name\n";
  209:         }
  210:         next;
  211:       }
  212:       $seen{$att_name_modified}++;
  213:       my $att_value = $attr->{$att_name};
  214:       $att_value =~ s/^[“”]|[“”]$//g;
  215:       $att_value =~ s/&/&amp;/g;
  216:       $att_value =~ s/</&lt;/g;
  217:       $att_value =~ s/>/&gt;/g;
  218:       $att_value =~ s/"/&quot;/g;
  219:       if ($tagname eq 'embed' && $att_name_modified eq 'script') {
  220:         # newlines are encoded to preserve Protein Explorer scripts in embed script attributes:
  221:         $att_value =~ s/\x0A/&#xA;/g;
  222:         $att_value =~ s/\x0D/&#xD;/g;
  223:       }
  224:       if ($att_name_modified eq 'xmlns' && ($att_value eq 'http://www.w3.org/1999/xhtml' ||
  225:           $att_value eq 'http://www.w3.org/TR/REC-html40')) {
  226:         next;
  227:       }
  228:       $result .= ' '.$att_name_modified.'="'.$att_value.'"';
  229:     }
  230:   }
  231:   if (index_of(\@empty, $tagname) != -1) {
  232:     $result .= '/>';
  233:   } else {
  234:     $result .= '>';
  235:     push(@stack, $tagname);
  236:     if (scalar(@stack) > 500) {
  237:       die "This document has a crazy depth - I'm out !";
  238:     }
  239:   }
  240:   # reopen the styles, if any
  241:   #for (my $j=0; $j<scalar(@styles); $j++) {
  242:   #  start($styles[$j], {}, ());
  243:   #}
  244: }
  245: 
  246: sub end {
  247:   my($tagname) = @_;
  248:   
  249:   if ($tagname eq 'o:p') {
  250:     return;
  251:   }
  252:   
  253:   $tagname = fix_tag($tagname);
  254:   if (index_of(\@empty, $tagname) != -1) {
  255:     return;
  256:   }
  257:   if ($tagname eq 'td' && scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'th') {
  258:     # handle <th>text</td> as if it was <th>text</th>
  259:     $tagname = 'th';
  260:   } elsif ($tagname eq 'th' && scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'td') {
  261:     # handle <td>text</th> as if it was <td>text</td>
  262:     $tagname = 'td';
  263:   }
  264:   my $found = 0;
  265:   for (my $i=scalar(@stack)-1; $i>=0; $i--) {
  266:     if ($stack[$i] eq $tagname) {
  267:       for (my $j=scalar(@stack)-1; $j>$i; $j--) {
  268:         if ($close_warning ne '') {
  269:           $close_warning .= ', ';
  270:         }
  271:         $close_warning .= $stack[$j];
  272:         $result .= '</'.$stack[$j].'>';
  273:       }
  274:       splice(@stack, $i, scalar(@stack)-$i);
  275:       $found = 1;
  276:       last;
  277:     } elsif (index_of(\@stack, 'web') != -1) {
  278:       die "There is a web element with missing end tags inside - it has to be fixed by hand";
  279:     }
  280:   }
  281:   if ($found) {
  282:     $result .= '</'.$tagname.'>';
  283:   } elsif ($tagname eq 'p') {
  284:     $result .= '<p/>';
  285:   }
  286: }
  287: 
  288: sub text {
  289:   my($dtext) = @_;
  290:   $dtext =~ s/&/&amp;/g;
  291:   $dtext =~ s/</&lt;/g;
  292:   $dtext =~ s/>/&gt;/g;
  293:   $dtext =~ s/"/&quot;/g;
  294:   $result .= $dtext;
  295: }
  296: 
  297: sub comment {
  298:   my($tokens) = @_;
  299:   # NOTE: the HTML parser thinks this is a comment: </ br>
  300:   # and LON-CAPA has sometimes turned that into <![CDATA[</ br>]]>
  301:   foreach my $comment (@$tokens) {
  302:     $comment =~ s/--/- /g;
  303:     $comment =~ s/^-|-$/ /g;
  304:     $result .= '<!--'.$comment.'-->';
  305:   }
  306: }
  307: 
  308: sub declaration {
  309:   my($tokens) = @_;
  310:   # ignore them
  311:   #$result .= '<!';
  312:   #$result .= join(' ', @$tokens);
  313:   #$result .= '>';
  314: }
  315: 
  316: sub process {
  317:   my($token0) = @_;
  318:   if ($token0 ne '') {
  319:     $result .= '<?'.$token0.'>';
  320:   }
  321: }
  322: 
  323: sub index_of {
  324:   my ($array, $value) = @_;
  325:   for (my $i=0; $i<scalar(@{$array}); $i++) {
  326:     if ($array->[$i] eq $value) {
  327:       return $i;
  328:     }
  329:   }
  330:   return -1;
  331: }
  332: 
  333: sub last_index_of {
  334:   my ($array, $value) = @_;
  335:   for (my $i=scalar(@{$array})-1; $i>=0; $i--) {
  336:     if ($array->[$i] eq $value) {
  337:       return $i;
  338:     }
  339:   }
  340:   return -1;
  341: }
  342: 
  343: sub fix_tag {
  344:   my ($tag) = @_;
  345:   #$tag = lc($tag); this is done by default by the parser
  346:   if ($tag !~ /^[a-zA-Z_][a-zA-Z0-9_\-\.]*$/) {
  347:     if ($warnings) {
  348:       print "Warning: bad start tag:'".$tag."'";
  349:     }
  350:     if ($tag =~ /<[a-zA-Z]/) {
  351:       $tag =~ s/^[^<]*<//; # a<b -> b
  352:     }
  353:     if ($tag =~ /[a-zA-Z]=/) {
  354:       $tag =~ s/=.*$//; # a=b -> a
  355:     }
  356:     if ($tag =~ /[a-zA-Z]\//) {
  357:       $tag =~ s/\/.*$//; # a/b -> a
  358:     }
  359:     if ($tag =~ /:/) {
  360:       # a:b -> b except when : at the end
  361:       if ($tag =~ /^[^:]*:$/) {
  362:         $tag =~ s/://;
  363:       } else {
  364:         $tag =~ s/^.*://;
  365:       }
  366:     }
  367:     $tag =~ s/^[0-9\-\.]+//;
  368:     $tag =~ s/[^a-zA-Z0-9_\-\.]//g;
  369:     if ($warnings) {
  370:       print " (converted to $tag)\n";
  371:     }
  372:   }
  373:   return($tag);
  374: }
  375: 
  376: 
  377: ##
  378: # Tests if a string is in an array (using eq) (to avoid Smartmatch warnings with $value ~~ @array)
  379: # @param {Array<string>} array - reference to the array of strings
  380: # @param {string} value - the string to look for
  381: # @returns 1 if found, 0 otherwise
  382: ##
  383: sub string_in_array {
  384:   my ($array, $value) = @_;
  385:   foreach my $v (@{$array}) {
  386:     if ($v eq $value) {
  387:       return 1;
  388:     }
  389:   }
  390:   return 0;
  391: }
  392: 
  393: 
  394: 1;
  395: __END__

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>