loncom/homework/cleanxml/html_to_xml.pm - annotate

Return to html_to_xml.pm CVS log
Up to [LON-CAPA] / loncom / homework / cleanxml
Annotation of loncom/homework/cleanxml/html_to_xml.pm, revision 1.1

1.1     ! damieng     1: # The LearningOnline Network
        !             2: # Second step to clean a file.
        !             3: #
        !             4: # $Id$
        !             5: #
        !             6: # Copyright Michigan State University Board of Trustees
        !             7: #
        !             8: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
        !             9: #
        !            10: # LON-CAPA is free software; you can redistribute it and/or modify
        !            11: # it under the terms of the GNU General Public License as published by
        !            12: # the Free Software Foundation; either version 2 of the License, or
        !            13: # (at your option) any later version.
        !            14: #
        !            15: # LON-CAPA is distributed in the hope that it will be useful,
        !            16: # but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            17: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
        !            18: # GNU General Public License for more details.
        !            19: #
        !            20: # You should have received a copy of the GNU General Public License
        !            21: # along with LON-CAPA; if not, write to the Free Software
        !            22: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
        !            23: #
        !            24: # /home/httpd/html/adm/gpl.txt
        !            25: #
        !            26: # http://www.lon-capa.org/
        !            27: #
        !            28: ###
        !            29: 
        !            30: #!/usr/bin/perl
        !            31: 
        !            32: 
        !            33: package Apache::html_to_xml;
        !            34: 
        !            35: use strict;
        !            36: use utf8;
        !            37: use warnings;
        !            38: use HTML::Parser ();
        !            39: 
        !            40: # always closing, end tags are ignored:
        !            41: my @empty = ('base','br','col','hr','img','input','keygen','link','meta','param','source','track','wbr', 'frame', 'embed','startouttext','endouttext');
        !            42: 
        !            43: #my @block_html = ('html','body','h1','h2','h3','h4','h5','h6','div','p','ul','ol','table','tbody','tr','td','th','dl','pre','noscript','blockquote','object','applet','embed','map','form','fieldset','iframe');
        !            44: 
        !            45: 
        !            46: my $result;
        !            47: my @stack;
        !            48: my $close_warning;
        !            49: my $warnings; # 1 = print warnings
        !            50: 
        !            51: 
        !            52: # This takes non-well-formed UTF-8 LC+HTML and returns well-formed but non-valid XML LC+XHTML.
        !            53: sub html_to_xml {
        !            54:   my($textref, $warn) = @_;
        !            55:   $warnings = $warn;
        !            56:   $result = '';
        !            57:   @stack = ();
        !            58:   $close_warning = '';
        !            59:   my $p = HTML::Parser->new( api_version => 3,
        !            60:                           start_h => [\&start, "tagname, attr, attrseq"],
        !            61:                           end_h   => [\&end,   "tagname"],
        !            62:                           text_h  => [\&text, "dtext"],
        !            63:                           comment_h  => [\&comment, "tokens"],
        !            64:                           declaration_h  => [\&declaration, "tokens"],
        !            65:                           process_h  => [\&process, "token0"],
        !            66:                         );
        !            67:   # NOTE: by default, the HTML parser turns all attribute and elements names to lowercase
        !            68:   $p->empty_element_tags(1);
        !            69:   $result .= "<?xml version='1.0' encoding='UTF-8'?>\n";
        !            70:   $p->parse($$textref);
        !            71:   for (my $i=scalar(@stack)-1; $i>=0; $i--) {
        !            72:     if ($close_warning ne '') {
        !            73:       $close_warning .= ', ';
        !            74:     }
        !            75:     $close_warning .= $stack[$i];
        !            76:     $result .= '</'.$stack[$i].'>';
        !            77:   }
        !            78:   if ($warnings && $close_warning ne '') {
        !            79:     print "Warning: the parser had to add closing tags to understand the document ($close_warning)\n";
        !            80:   }
        !            81:   return \$result;
        !            82: }
        !            83: 
        !            84: sub start {
        !            85:   my($tagname, $attr, $attrseq) = @_;
        !            86:   
        !            87:   # NOTE: we could do things more like web browsers, but I'm nore sure the result would be better with LON-CAPA files
        !            88:   # (in problem files there are not so many missing tags)
        !            89:   # See http://www.w3.org/TR/html5/syntax.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
        !            90:   
        !            91:   if ($tagname eq 'o:p') {
        !            92:     return;
        !            93:   }
        !            94:   
        !            95:   if ($tagname =~ /@.*\.[a-z]{2,3}$/) { # email <name@hostname>
        !            96:     $result .= "&lt;$tagname&gt;";
        !            97:     return;
        !            98:   }
        !            99:   
        !           100:   #$tagname = lc($tagname); this is done by default by the parser
        !           101:   $tagname = fix_tag($tagname);
        !           102:   if (scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'tr' && $tagname ne 'tr' && $tagname ne 'td' && $tagname ne 'th' &&
        !           103:       !string_in_array(['part','block','comment','endouttext','problemtype','standalone','startouttext','tex','translated','web','while','randomlist','font','b','form'], $tagname)) {
        !           104:     # NOTE: a 'part' or 'block' element between tr and td will not be valid, but changing tag order would make things worse
        !           105:     # font and b will be removed in post_xml, so we can leave it for now, to handle things like <TR><FONT FACE="Palatino"><TD...
        !           106:     # form is to avoid an empty form in some cases (it might not work anyway, but it is better to keep this bug the way it is)
        !           107:     if ($warnings) {
        !           108:       print "Warning: a <td> tag was added because a $tagname element was directly under a tr\n";
        !           109:     }
        !           110:     start('td');
        !           111:   }
        !           112:   if ($tagname eq 'p' && scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'p') {
        !           113:     end('p');
        !           114:   } elsif ($tagname eq 'li') {
        !           115:     my $ind_li = last_index_of(\@stack, 'li');
        !           116:     my $ind_ul = last_index_of(\@stack, 'ul');
        !           117:     my $ind_ol = last_index_of(\@stack, 'ol');
        !           118:     if ($ind_li != -1 && ($ind_ul == -1 || $ind_ul < $ind_li) && ($ind_ol == -1 || $ind_ol < $ind_li)) {
        !           119:       end('li');
        !           120:     }
        !           121:   } elsif ($tagname eq 'tr') {
        !           122:     my $ind_table = last_index_of(\@stack, 'table');
        !           123:     my $ind_tr = last_index_of(\@stack, 'tr');
        !           124:     if ($ind_tr != -1 && ($ind_table == -1 || $ind_table < $ind_tr)) {
        !           125:       end('tr');
        !           126:     }
        !           127:   } elsif ($tagname eq 'td' || $tagname eq 'th') {
        !           128:     my $ind_table = last_index_of(\@stack, 'table');
        !           129:     my $ind_td = last_index_of(\@stack, 'td');
        !           130:     my $ind_th = last_index_of(\@stack, 'th');
        !           131:     my $ind_tr = last_index_of(\@stack, 'tr');
        !           132:     if ($ind_tr == -1 || ($ind_table != -1 && $ind_table > $ind_tr)) {
        !           133:       start('tr');
        !           134:       $ind_tr = last_index_of(\@stack, 'tr');
        !           135:     }
        !           136:     if ($ind_td != -1 && $ind_tr < $ind_td) {
        !           137:       end('td');
        !           138:     } elsif ($ind_th != -1 && $ind_tr < $ind_th) {
        !           139:       end('th');
        !           140:     }
        !           141:   } elsif ($tagname eq 'dd' || $tagname eq 'dt') {
        !           142:     my $ind_dd = last_index_of(\@stack, 'dd');
        !           143:     my $ind_dt = last_index_of(\@stack, 'dt');
        !           144:     my $ind_dl = last_index_of(\@stack, 'dl');
        !           145:     if ($ind_dl == -1) {
        !           146:       start('dl');
        !           147:       $ind_dl = last_index_of(\@stack, 'dl');
        !           148:     }
        !           149:     if ($ind_dd != -1 && ($ind_dl == -1 || $ind_dl < $ind_dd)) {
        !           150:       end('dd');
        !           151:     } elsif ($ind_dt != -1 && ($ind_dl == -1 || $ind_dl < $ind_dt)) {
        !           152:       end('dt');
        !           153:     }
        !           154:   } elsif ($tagname eq 'option') {
        !           155:     my $ind_option = last_index_of(\@stack, 'option');
        !           156:     if ($ind_option != -1) {
        !           157:       end('option');
        !           158:     }
        !           159:   } elsif ($tagname eq 'area') {
        !           160:     my $ind_area = last_index_of(\@stack, 'area');
        !           161:     if ($ind_area != -1) {
        !           162:       end('area');
        !           163:     }
        !           164:   } elsif ($tagname eq 'a') {
        !           165:     my $ind_a = last_index_of(\@stack, 'a');
        !           166:     if ($ind_a != -1) {
        !           167:       end('a');
        !           168:     }
        !           169:   } elsif ($tagname eq 'num') {
        !           170:     my $ind_num = last_index_of(\@stack, 'num');
        !           171:     if ($ind_num != -1) {
        !           172:       end('num');
        !           173:     }
        !           174:   }
        !           175: 
        !           176: # HTML interpretation of non-closing elements and style is too complex (and error-prone, anyway).
        !           177: # Since LON-CAPA elements are all supposed to be closed, this interpretation is SGML-like instead.
        !           178: # Paragraphs inside paragraphs will be fixed later.
        !           179: 
        !           180: #   my @styles = ();
        !           181: #   if ($tagname eq 'p') {
        !           182: #     for (my $i=scalar(@stack)-1; $i>=0; $i--) {
        !           183: #       if ($stack[$i] eq 'p') {
        !           184: #         # save the styles
        !           185: #         for (my $j=$i+1; $j<scalar(@stack); $j++) {
        !           186: #           if (index_of(['b','i','em','strong','sub','sup'], $stack[$j]) != -1) {
        !           187: #             push(@styles, $stack[$j]);
        !           188: #           }
        !           189: #         }
        !           190: #         # close the p
        !           191: #         end('p');
        !           192: #         last;
        !           193: #       } elsif (index_of(\@block_html, $stack[$i]) != -1) {
        !           194: #         # stop looking
        !           195: #         last;
        !           196: #       }
        !           197: #     }
        !           198: #   }
        !           199:   $result .= '<'.$tagname;
        !           200:   my %seen = ();
        !           201:   foreach my $att_name (@$attrseq) {
        !           202:     my $att_name_modified = $att_name;
        !           203:     $att_name_modified =~ s/[^\-a-zA-Z0-9_:.]//g;
        !           204:     $att_name_modified =~ s/^[\-.0-9]*//;
        !           205:     if ($att_name_modified ne '' && index($att_name_modified, ':') == -1) {
        !           206:       if ($seen{$att_name_modified}) {
        !           207:         if ($warnings) {
        !           208:           print "Warning: Ignoring duplicate attribute: $att_name\n";
        !           209:         }
        !           210:         next;
        !           211:       }
        !           212:       $seen{$att_name_modified}++;
        !           213:       my $att_value = $attr->{$att_name};
        !           214:       $att_value =~ s/^[“”]|[“”]$//g;
        !           215:       $att_value =~ s/&/&amp;/g;
        !           216:       $att_value =~ s/</&lt;/g;
        !           217:       $att_value =~ s/>/&gt;/g;
        !           218:       $att_value =~ s/"/&quot;/g;
        !           219:       if ($tagname eq 'embed' && $att_name_modified eq 'script') {
        !           220:         # newlines are encoded to preserve Protein Explorer scripts in embed script attributes:
        !           221:         $att_value =~ s/\x0A/&#xA;/g;
        !           222:         $att_value =~ s/\x0D/&#xD;/g;
        !           223:       }
        !           224:       if ($att_name_modified eq 'xmlns' && ($att_value eq 'http://www.w3.org/1999/xhtml' ||
        !           225:           $att_value eq 'http://www.w3.org/TR/REC-html40')) {
        !           226:         next;
        !           227:       }
        !           228:       $result .= ' '.$att_name_modified.'="'.$att_value.'"';
        !           229:     }
        !           230:   }
        !           231:   if (index_of(\@empty, $tagname) != -1) {
        !           232:     $result .= '/>';
        !           233:   } else {
        !           234:     $result .= '>';
        !           235:     push(@stack, $tagname);
        !           236:     if (scalar(@stack) > 500) {
        !           237:       die "This document has a crazy depth - I'm out !";
        !           238:     }
        !           239:   }
        !           240:   # reopen the styles, if any
        !           241:   #for (my $j=0; $j<scalar(@styles); $j++) {
        !           242:   #  start($styles[$j], {}, ());
        !           243:   #}
        !           244: }
        !           245: 
        !           246: sub end {
        !           247:   my($tagname) = @_;
        !           248:   
        !           249:   if ($tagname eq 'o:p') {
        !           250:     return;
        !           251:   }
        !           252:   
        !           253:   $tagname = fix_tag($tagname);
        !           254:   if (index_of(\@empty, $tagname) != -1) {
        !           255:     return;
        !           256:   }
        !           257:   if ($tagname eq 'td' && scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'th') {
        !           258:     # handle <th>text</td> as if it was <th>text</th>
        !           259:     $tagname = 'th';
        !           260:   } elsif ($tagname eq 'th' && scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'td') {
        !           261:     # handle <td>text</th> as if it was <td>text</td>
        !           262:     $tagname = 'td';
        !           263:   }
        !           264:   my $found = 0;
        !           265:   for (my $i=scalar(@stack)-1; $i>=0; $i--) {
        !           266:     if ($stack[$i] eq $tagname) {
        !           267:       for (my $j=scalar(@stack)-1; $j>$i; $j--) {
        !           268:         if ($close_warning ne '') {
        !           269:           $close_warning .= ', ';
        !           270:         }
        !           271:         $close_warning .= $stack[$j];
        !           272:         $result .= '</'.$stack[$j].'>';
        !           273:       }
        !           274:       splice(@stack, $i, scalar(@stack)-$i);
        !           275:       $found = 1;
        !           276:       last;
        !           277:     } elsif (index_of(\@stack, 'web') != -1) {
        !           278:       die "There is a web element with missing end tags inside - it has to be fixed by hand";
        !           279:     }
        !           280:   }
        !           281:   if ($found) {
        !           282:     $result .= '</'.$tagname.'>';
        !           283:   } elsif ($tagname eq 'p') {
        !           284:     $result .= '<p/>';
        !           285:   }
        !           286: }
        !           287: 
        !           288: sub text {
        !           289:   my($dtext) = @_;
        !           290:   $dtext =~ s/&/&amp;/g;
        !           291:   $dtext =~ s/</&lt;/g;
        !           292:   $dtext =~ s/>/&gt;/g;
        !           293:   $dtext =~ s/"/&quot;/g;
        !           294:   $result .= $dtext;
        !           295: }
        !           296: 
        !           297: sub comment {
        !           298:   my($tokens) = @_;
        !           299:   # NOTE: the HTML parser thinks this is a comment: </ br>
        !           300:   # and LON-CAPA has sometimes turned that into <![CDATA[</ br>]]>
        !           301:   foreach my $comment (@$tokens) {
        !           302:     $comment =~ s/--/- /g;
        !           303:     $comment =~ s/^-|-$/ /g;
        !           304:     $result .= '<!--'.$comment.'-->';
        !           305:   }
        !           306: }
        !           307: 
        !           308: sub declaration {
        !           309:   my($tokens) = @_;
        !           310:   # ignore them
        !           311:   #$result .= '<!';
        !           312:   #$result .= join(' ', @$tokens);
        !           313:   #$result .= '>';
        !           314: }
        !           315: 
        !           316: sub process {
        !           317:   my($token0) = @_;
        !           318:   if ($token0 ne '') {
        !           319:     $result .= '<?'.$token0.'>';
        !           320:   }
        !           321: }
        !           322: 
        !           323: sub index_of {
        !           324:   my ($array, $value) = @_;
        !           325:   for (my $i=0; $i<scalar(@{$array}); $i++) {
        !           326:     if ($array->[$i] eq $value) {
        !           327:       return $i;
        !           328:     }
        !           329:   }
        !           330:   return -1;
        !           331: }
        !           332: 
        !           333: sub last_index_of {
        !           334:   my ($array, $value) = @_;
        !           335:   for (my $i=scalar(@{$array})-1; $i>=0; $i--) {
        !           336:     if ($array->[$i] eq $value) {
        !           337:       return $i;
        !           338:     }
        !           339:   }
        !           340:   return -1;
        !           341: }
        !           342: 
        !           343: sub fix_tag {
        !           344:   my ($tag) = @_;
        !           345:   #$tag = lc($tag); this is done by default by the parser
        !           346:   if ($tag !~ /^[a-zA-Z_][a-zA-Z0-9_\-\.]*$/) {
        !           347:     if ($warnings) {
        !           348:       print "Warning: bad start tag:'".$tag."'";
        !           349:     }
        !           350:     if ($tag =~ /<[a-zA-Z]/) {
        !           351:       $tag =~ s/^[^<]*<//; # a<b -> b
        !           352:     }
        !           353:     if ($tag =~ /[a-zA-Z]=/) {
        !           354:       $tag =~ s/=.*$//; # a=b -> a
        !           355:     }
        !           356:     if ($tag =~ /[a-zA-Z]\//) {
        !           357:       $tag =~ s/\/.*$//; # a/b -> a
        !           358:     }
        !           359:     if ($tag =~ /:/) {
        !           360:       # a:b -> b except when : at the end
        !           361:       if ($tag =~ /^[^:]*:$/) {
        !           362:         $tag =~ s/://;
        !           363:       } else {
        !           364:         $tag =~ s/^.*://;
        !           365:       }
        !           366:     }
        !           367:     $tag =~ s/^[0-9\-\.]+//;
        !           368:     $tag =~ s/[^a-zA-Z0-9_\-\.]//g;
        !           369:     if ($warnings) {
        !           370:       print " (converted to $tag)\n";
        !           371:     }
        !           372:   }
        !           373:   return($tag);
        !           374: }
        !           375: 
        !           376: 
        !           377: ##
        !           378: # Tests if a string is in an array (using eq) (to avoid Smartmatch warnings with $value ~~ @array)
        !           379: # @param {Array<string>} array - reference to the array of strings
        !           380: # @param {string} value - the string to look for
        !           381: # @returns 1 if found, 0 otherwise
        !           382: ##
        !           383: sub string_in_array {
        !           384:   my ($array, $value) = @_;
        !           385:   foreach my $v (@{$array}) {
        !           386:     if ($v eq $value) {
        !           387:       return 1;
        !           388:     }
        !           389:   }
        !           390:   return 0;
        !           391: }
        !           392: 
        !           393: 
        !           394: 1;
        !           395: __END__
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>