File:  [LON-CAPA] / loncom / homework / cleanxml / html_to_xml.pm
Revision 1.2: download - view: text, annotated - select for diffs
Wed Jan 6 16:44:32 2016 UTC (6 years, 10 months ago) by damieng
Branches: MAIN
CVS tags: version_2_12_X, HEAD
Added Daxe editing for HTML documents, fixed issues with case mixing in task documents

    1: # The LearningOnline Network
    2: # Second step to clean a file.
    3: #
    4: # $Id: html_to_xml.pm,v 1.2 2016/01/06 16:44:32 damieng Exp $
    5: #
    6: # Copyright Michigan State University Board of Trustees
    7: #
    8: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
    9: #
   10: # LON-CAPA is free software; you can redistribute it and/or modify
   11: # it under the terms of the GNU General Public License as published by
   12: # the Free Software Foundation; either version 2 of the License, or
   13: # (at your option) any later version.
   14: #
   15: # LON-CAPA is distributed in the hope that it will be useful,
   16: # but WITHOUT ANY WARRANTY; without even the implied warranty of
   17: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   18: # GNU General Public License for more details.
   19: #
   20: # You should have received a copy of the GNU General Public License
   21: # along with LON-CAPA; if not, write to the Free Software
   22: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   23: #
   24: # /home/httpd/html/adm/gpl.txt
   25: #
   26: # http://www.lon-capa.org/
   27: #
   28: ###
   29: 
   30: #!/usr/bin/perl
   31: 
   32: 
   33: package Apache::html_to_xml;
   34: 
   35: use strict;
   36: use utf8;
   37: use warnings;
   38: use HTML::Parser ();
   39: 
   40: # always closing, end tags are ignored:
   41: my @empty = ('base','br','col','hr','img','input','keygen','link','meta','param','source','track','wbr', 'frame', 'embed','startouttext','endouttext');
   42: 
   43: #my @block_html = ('html','body','h1','h2','h3','h4','h5','h6','div','p','ul','ol','table','tbody','tr','td','th','dl','pre','noscript','blockquote','object','applet','embed','map','form','fieldset','iframe');
   44: 
   45: 
   46: my $result;
   47: my @stack;
   48: my $close_warning;
   49: my $warnings; # 1 = print warnings
   50: 
   51: 
   52: # This takes non-well-formed UTF-8 LC+HTML and returns well-formed but non-valid XML LC+XHTML.
   53: sub html_to_xml {
   54:   my($textref, $warn, $case_sensitive) = @_;
   55:   $warnings = $warn;
   56:   if (!defined $case_sensitive) {
   57:     $case_sensitive = 0;
   58:   }
   59:   $result = '';
   60:   @stack = ();
   61:   $close_warning = '';
   62:   my $p = HTML::Parser->new( api_version => 3,
   63:                           start_h => [\&start, "tagname, attr, attrseq"],
   64:                           end_h   => [\&end,   "tagname"],
   65:                           text_h  => [\&text, "dtext"],
   66:                           comment_h  => [\&comment, "tokens"],
   67:                           declaration_h  => [\&declaration, "tokens"],
   68:                           process_h  => [\&process, "token0"],
   69:                         );
   70:   # NOTE: by default, the HTML parser turns all attribute and elements names to lowercase
   71:   # This is a problem with the Task elements, so it is disabled in that case
   72:   if ($case_sensitive) {
   73:     $p->case_sensitive(1);
   74:   }
   75:   $p->empty_element_tags(1);
   76:   $result .= "<?xml version='1.0' encoding='UTF-8'?>\n";
   77:   $p->parse($$textref);
   78:   for (my $i=scalar(@stack)-1; $i>=0; $i--) {
   79:     if ($close_warning ne '') {
   80:       $close_warning .= ', ';
   81:     }
   82:     $close_warning .= $stack[$i];
   83:     $result .= '</'.$stack[$i].'>';
   84:   }
   85:   if ($warnings && $close_warning ne '') {
   86:     print "Warning: the parser had to add closing tags to understand the document ($close_warning)\n";
   87:   }
   88:   return \$result;
   89: }
   90: 
   91: sub start {
   92:   my($tagname, $attr, $attrseq) = @_;
   93:   
   94:   # NOTE: we could do things more like web browsers, but I'm nore sure the result would be better with LON-CAPA files
   95:   # (in problem files there are not so many missing tags)
   96:   # See http://www.w3.org/TR/html5/syntax.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
   97:   
   98:   if ($tagname eq 'o:p') {
   99:     return;
  100:   }
  101:   
  102:   if ($tagname =~ /@.*\.[a-z]{2,3}$/) { # email <name@hostname>
  103:     $result .= "&lt;$tagname&gt;";
  104:     return;
  105:   }
  106:   
  107:   #$tagname = lc($tagname); this is done by default by the parser
  108:   $tagname = fix_tag($tagname);
  109:   if (scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'tr' && $tagname ne 'tr' && $tagname ne 'td' && $tagname ne 'th' &&
  110:       !string_in_array(['part','block','comment','endouttext','problemtype','standalone','startouttext','tex','translated','web','while','randomlist','font','b','form'], $tagname)) {
  111:     # NOTE: a 'part' or 'block' element between tr and td will not be valid, but changing tag order would make things worse
  112:     # font and b will be removed in post_xml, so we can leave it for now, to handle things like <TR><FONT FACE="Palatino"><TD...
  113:     # form is to avoid an empty form in some cases (it might not work anyway, but it is better to keep this bug the way it is)
  114:     if ($warnings) {
  115:       print "Warning: a <td> tag was added because a $tagname element was directly under a tr\n";
  116:     }
  117:     start('td');
  118:   }
  119:   if ($tagname eq 'p' && scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'p') {
  120:     end('p');
  121:   } elsif ($tagname eq 'li') {
  122:     my $ind_li = last_index_of(\@stack, 'li');
  123:     my $ind_ul = last_index_of(\@stack, 'ul');
  124:     my $ind_ol = last_index_of(\@stack, 'ol');
  125:     if ($ind_li != -1 && ($ind_ul == -1 || $ind_ul < $ind_li) && ($ind_ol == -1 || $ind_ol < $ind_li)) {
  126:       end('li');
  127:     }
  128:   } elsif ($tagname eq 'tr') {
  129:     my $ind_table = last_index_of(\@stack, 'table');
  130:     my $ind_tr = last_index_of(\@stack, 'tr');
  131:     if ($ind_tr != -1 && ($ind_table == -1 || $ind_table < $ind_tr)) {
  132:       end('tr');
  133:     }
  134:   } elsif ($tagname eq 'td' || $tagname eq 'th') {
  135:     my $ind_table = last_index_of(\@stack, 'table');
  136:     my $ind_td = last_index_of(\@stack, 'td');
  137:     my $ind_th = last_index_of(\@stack, 'th');
  138:     my $ind_tr = last_index_of(\@stack, 'tr');
  139:     if ($ind_tr == -1 || ($ind_table != -1 && $ind_table > $ind_tr)) {
  140:       start('tr');
  141:       $ind_tr = last_index_of(\@stack, 'tr');
  142:     }
  143:     if ($ind_td != -1 && $ind_tr < $ind_td) {
  144:       end('td');
  145:     } elsif ($ind_th != -1 && $ind_tr < $ind_th) {
  146:       end('th');
  147:     }
  148:   } elsif ($tagname eq 'dd' || $tagname eq 'dt') {
  149:     my $ind_dd = last_index_of(\@stack, 'dd');
  150:     my $ind_dt = last_index_of(\@stack, 'dt');
  151:     my $ind_dl = last_index_of(\@stack, 'dl');
  152:     if ($ind_dl == -1) {
  153:       start('dl');
  154:       $ind_dl = last_index_of(\@stack, 'dl');
  155:     }
  156:     if ($ind_dd != -1 && ($ind_dl == -1 || $ind_dl < $ind_dd)) {
  157:       end('dd');
  158:     } elsif ($ind_dt != -1 && ($ind_dl == -1 || $ind_dl < $ind_dt)) {
  159:       end('dt');
  160:     }
  161:   } elsif ($tagname eq 'option') {
  162:     my $ind_option = last_index_of(\@stack, 'option');
  163:     if ($ind_option != -1) {
  164:       end('option');
  165:     }
  166:   } elsif ($tagname eq 'area') {
  167:     my $ind_area = last_index_of(\@stack, 'area');
  168:     if ($ind_area != -1) {
  169:       end('area');
  170:     }
  171:   } elsif ($tagname eq 'a') {
  172:     my $ind_a = last_index_of(\@stack, 'a');
  173:     if ($ind_a != -1) {
  174:       end('a');
  175:     }
  176:   } elsif ($tagname eq 'num') {
  177:     my $ind_num = last_index_of(\@stack, 'num');
  178:     if ($ind_num != -1) {
  179:       end('num');
  180:     }
  181:   }
  182: 
  183: # HTML interpretation of non-closing elements and style is too complex (and error-prone, anyway).
  184: # Since LON-CAPA elements are all supposed to be closed, this interpretation is SGML-like instead.
  185: # Paragraphs inside paragraphs will be fixed later.
  186: 
  187: #   my @styles = ();
  188: #   if ($tagname eq 'p') {
  189: #     for (my $i=scalar(@stack)-1; $i>=0; $i--) {
  190: #       if ($stack[$i] eq 'p') {
  191: #         # save the styles
  192: #         for (my $j=$i+1; $j<scalar(@stack); $j++) {
  193: #           if (index_of(['b','i','em','strong','sub','sup'], $stack[$j]) != -1) {
  194: #             push(@styles, $stack[$j]);
  195: #           }
  196: #         }
  197: #         # close the p
  198: #         end('p');
  199: #         last;
  200: #       } elsif (index_of(\@block_html, $stack[$i]) != -1) {
  201: #         # stop looking
  202: #         last;
  203: #       }
  204: #     }
  205: #   }
  206:   $result .= '<'.$tagname;
  207:   my %seen = ();
  208:   foreach my $att_name (@$attrseq) {
  209:     my $att_name_modified = $att_name;
  210:     $att_name_modified =~ s/[^\-a-zA-Z0-9_:.]//g;
  211:     $att_name_modified =~ s/^[\-.0-9]*//;
  212:     if ($att_name_modified ne '' && index($att_name_modified, ':') == -1) {
  213:       if ($seen{$att_name_modified}) {
  214:         if ($warnings) {
  215:           print "Warning: Ignoring duplicate attribute: $att_name\n";
  216:         }
  217:         next;
  218:       }
  219:       $seen{$att_name_modified}++;
  220:       my $att_value = $attr->{$att_name};
  221:       $att_value =~ s/^[“”]|[“”]$//g;
  222:       $att_value =~ s/&/&amp;/g;
  223:       $att_value =~ s/</&lt;/g;
  224:       $att_value =~ s/>/&gt;/g;
  225:       $att_value =~ s/"/&quot;/g;
  226:       if ($tagname eq 'embed' && $att_name_modified eq 'script') {
  227:         # newlines are encoded to preserve Protein Explorer scripts in embed script attributes:
  228:         $att_value =~ s/\x0A/&#xA;/g;
  229:         $att_value =~ s/\x0D/&#xD;/g;
  230:       }
  231:       if ($att_name_modified eq 'xmlns' && ($att_value eq 'http://www.w3.org/1999/xhtml' ||
  232:           $att_value eq 'http://www.w3.org/TR/REC-html40')) {
  233:         next;
  234:       }
  235:       $result .= ' '.$att_name_modified.'="'.$att_value.'"';
  236:     }
  237:   }
  238:   if (index_of(\@empty, $tagname) != -1) {
  239:     $result .= '/>';
  240:   } else {
  241:     $result .= '>';
  242:     push(@stack, $tagname);
  243:     if (scalar(@stack) > 500) {
  244:       die "This document has a crazy depth - I'm out !";
  245:     }
  246:   }
  247:   # reopen the styles, if any
  248:   #for (my $j=0; $j<scalar(@styles); $j++) {
  249:   #  start($styles[$j], {}, ());
  250:   #}
  251: }
  252: 
  253: sub end {
  254:   my($tagname) = @_;
  255:   
  256:   if ($tagname eq 'o:p') {
  257:     return;
  258:   }
  259:   
  260:   $tagname = fix_tag($tagname);
  261:   if (index_of(\@empty, $tagname) != -1) {
  262:     return;
  263:   }
  264:   if ($tagname eq 'td' && scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'th') {
  265:     # handle <th>text</td> as if it was <th>text</th>
  266:     $tagname = 'th';
  267:   } elsif ($tagname eq 'th' && scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'td') {
  268:     # handle <td>text</th> as if it was <td>text</td>
  269:     $tagname = 'td';
  270:   }
  271:   my $found = 0;
  272:   for (my $i=scalar(@stack)-1; $i>=0; $i--) {
  273:     if ($stack[$i] eq $tagname) {
  274:       for (my $j=scalar(@stack)-1; $j>$i; $j--) {
  275:         if ($close_warning ne '') {
  276:           $close_warning .= ', ';
  277:         }
  278:         $close_warning .= $stack[$j];
  279:         $result .= '</'.$stack[$j].'>';
  280:       }
  281:       splice(@stack, $i, scalar(@stack)-$i);
  282:       $found = 1;
  283:       last;
  284:     } elsif (index_of(\@stack, 'web') != -1) {
  285:       die "There is a web element with missing end tags inside - it has to be fixed by hand";
  286:     }
  287:   }
  288:   if ($found) {
  289:     $result .= '</'.$tagname.'>';
  290:   } elsif ($tagname eq 'p') {
  291:     $result .= '<p/>';
  292:   }
  293: }
  294: 
  295: sub text {
  296:   my($dtext) = @_;
  297:   $dtext =~ s/&/&amp;/g;
  298:   $dtext =~ s/</&lt;/g;
  299:   $dtext =~ s/>/&gt;/g;
  300:   $dtext =~ s/"/&quot;/g;
  301:   $result .= $dtext;
  302: }
  303: 
  304: sub comment {
  305:   my($tokens) = @_;
  306:   # NOTE: the HTML parser thinks this is a comment: </ br>
  307:   # and LON-CAPA has sometimes turned that into <![CDATA[</ br>]]>
  308:   foreach my $comment (@$tokens) {
  309:     $comment =~ s/--/- /g;
  310:     $comment =~ s/^-|-$/ /g;
  311:     $result .= '<!--'.$comment.'-->';
  312:   }
  313: }
  314: 
  315: sub declaration {
  316:   my($tokens) = @_;
  317:   # ignore them
  318:   #$result .= '<!';
  319:   #$result .= join(' ', @$tokens);
  320:   #$result .= '>';
  321: }
  322: 
  323: sub process {
  324:   my($token0) = @_;
  325:   if ($token0 ne '') {
  326:     $result .= '<?'.$token0.'>';
  327:   }
  328: }
  329: 
  330: sub index_of {
  331:   my ($array, $value) = @_;
  332:   for (my $i=0; $i<scalar(@{$array}); $i++) {
  333:     if ($array->[$i] eq $value) {
  334:       return $i;
  335:     }
  336:   }
  337:   return -1;
  338: }
  339: 
  340: sub last_index_of {
  341:   my ($array, $value) = @_;
  342:   for (my $i=scalar(@{$array})-1; $i>=0; $i--) {
  343:     if ($array->[$i] eq $value) {
  344:       return $i;
  345:     }
  346:   }
  347:   return -1;
  348: }
  349: 
  350: sub fix_tag {
  351:   my ($tag) = @_;
  352:   #$tag = lc($tag); this is done by default by the parser
  353:   if ($tag !~ /^[a-zA-Z_][a-zA-Z0-9_\-\.]*$/) {
  354:     if ($warnings) {
  355:       print "Warning: bad start tag:'".$tag."'";
  356:     }
  357:     if ($tag =~ /<[a-zA-Z]/) {
  358:       $tag =~ s/^[^<]*<//; # a<b -> b
  359:     }
  360:     if ($tag =~ /[a-zA-Z]=/) {
  361:       $tag =~ s/=.*$//; # a=b -> a
  362:     }
  363:     if ($tag =~ /[a-zA-Z]\//) {
  364:       $tag =~ s/\/.*$//; # a/b -> a
  365:     }
  366:     if ($tag =~ /:/) {
  367:       # a:b -> b except when : at the end
  368:       if ($tag =~ /^[^:]*:$/) {
  369:         $tag =~ s/://;
  370:       } else {
  371:         $tag =~ s/^.*://;
  372:       }
  373:     }
  374:     $tag =~ s/^[0-9\-\.]+//;
  375:     $tag =~ s/[^a-zA-Z0-9_\-\.]//g;
  376:     if ($warnings) {
  377:       print " (converted to $tag)\n";
  378:     }
  379:   }
  380:   return($tag);
  381: }
  382: 
  383: 
  384: ##
  385: # Tests if a string is in an array (using eq) (to avoid Smartmatch warnings with $value ~~ @array)
  386: # @param {Array<string>} array - reference to the array of strings
  387: # @param {string} value - the string to look for
  388: # @returns 1 if found, 0 otherwise
  389: ##
  390: sub string_in_array {
  391:   my ($array, $value) = @_;
  392:   foreach my $v (@{$array}) {
  393:     if ($v eq $value) {
  394:       return 1;
  395:     }
  396:   }
  397:   return 0;
  398: }
  399: 
  400: 
  401: 1;
  402: __END__

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>