File:  [LON-CAPA] / loncom / homework / cleanxml / pre_xml.pm
Revision 1.1: download - view: text, annotated - select for diffs
Thu Dec 3 20:40:31 2015 UTC (8 years, 4 months ago) by damieng
Branches: MAIN
CVS tags: HEAD
integrated Daxe, opening in a separate window for now

    1: # The LearningOnline Network
    2: # First step to clean a file.
    3: #
    4: # $Id: pre_xml.pm,v 1.1 2015/12/03 20:40:31 damieng Exp $
    5: #
    6: # Copyright Michigan State University Board of Trustees
    7: #
    8: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
    9: #
   10: # LON-CAPA is free software; you can redistribute it and/or modify
   11: # it under the terms of the GNU General Public License as published by
   12: # the Free Software Foundation; either version 2 of the License, or
   13: # (at your option) any later version.
   14: #
   15: # LON-CAPA is distributed in the hope that it will be useful,
   16: # but WITHOUT ANY WARRANTY; without even the implied warranty of
   17: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   18: # GNU General Public License for more details.
   19: #
   20: # You should have received a copy of the GNU General Public License
   21: # along with LON-CAPA; if not, write to the Free Software
   22: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   23: #
   24: # /home/httpd/html/adm/gpl.txt
   25: #
   26: # http://www.lon-capa.org/
   27: #
   28: ###
   29: 
   30: #!/usr/bin/perl
   31: 
   32: package Apache::pre_xml;
   33: 
   34: use strict;
   35: use utf8;
   36: 
   37: use Encode;
   38: use Encode::Byte;
   39: use Encode::Guess;
   40: 
   41: # list of elements inside which < and > might not be turned into entities
   42: # unfortunately, answer can sometimes contain the elements vector and value...
   43: my @cdata_elements = ('answer', 'm', 'display', 'parse'); # not script because the HTML parser will handle it
   44: 
   45: my $warnings; # 1 = print warnings
   46: 
   47: # Reads a LON-CAPA 2 file, guesses the encoding, fixes characters in cdata_elements, fixes HTML entities,
   48: # and returns the converted text.
   49: sub pre_xml {
   50:   my ($filepath, $warn) = @_;
   51:   $warnings = $warn;
   52:   
   53:   my $lines = guess_encoding_and_read($filepath);
   54: 
   55:   remove_control_characters($lines);
   56:   
   57:   fix_cdata_elements($lines);
   58: 
   59:   fix_html_entities($lines);
   60:   
   61:   fix_missing_quotes($lines);
   62:   
   63:   fix_empty_li($lines);
   64:   
   65:   remove_doctype($lines);
   66:   
   67:   add_root($lines, $filepath);
   68:   
   69:   return(\join('', @$lines));
   70: }
   71: 
   72: 
   73: ##
   74: # Tries to guess the character encoding, and returns the lines as decoded text.
   75: # Requires Encode::Byte.
   76: ##
   77: sub guess_encoding_and_read {
   78:   my ($fn) = @_;
   79:   no warnings "utf8";
   80:   local $/ = undef;
   81:   open(my $fh, "<", $fn) or die "cannot read $fn: $!";
   82:   binmode $fh;
   83:   my $data = <$fh>; # we need to read the whole file to test if font is a block or inline element
   84:   close $fh;
   85:   
   86:   if (index($data, '<') == -1) {
   87:     die "This file has no markup !";
   88:   }
   89:   
   90:   # try to get a charset from a meta at the beginning of the file
   91:   my $beginning = substr($data, 0, 1024); # to avoid a full match; hopefully we won't cut the charset in half
   92:   if ($beginning =~ /<meta[^>]*charset\s?=\s?([^\n>"';]*)/i) {
   93:     my $meta_charset = $1;
   94:     if ($meta_charset ne '') {
   95:       if ($meta_charset =~ /iso-?8859-?1/i) {
   96:         # usually a lie
   97:         $meta_charset = 'cp1252';
   98:       }
   99:       # now try to decode using that encoding
  100:       my $decoder = guess_encoding($data, ($meta_charset));
  101:       if (ref($decoder)) {
  102:         my $decoded = $decoder->decode($data);
  103:         my @lines = split(/^/m, $decoded);
  104:         return \@lines;
  105:       } else {
  106:         if ($warnings) {
  107:           print "Warning: decoding did not work with the charset defined by the meta ($meta_charset)\n";
  108:         }
  109:       }
  110:     }
  111:   }
  112:   
  113:   my $decoded;
  114:   if (length($data) > 0) {
  115:     # NOTE: this list is too ambigous, Encode::Guess refuses to even try a guess
  116:     #Encode::Guess->set_suspects(qw/ascii UTF-8 iso-8859-1 MacRoman cp1252/);
  117:     # by default Encode::Guess uses ascii, utf8 and UTF-16/32 with BOM
  118:     my $decoder = Encode::Guess->guess($data);
  119:     if (ref($decoder)) {
  120:       $decoded = $decoder->decode($data);
  121:       # NOTE: this seems to accept binary files sometimes (conversion will fail later because it is not really UTF-8)
  122:     } else {
  123:       if ($warnings) {
  124:         print "Warning: encoding is not UTF-8 for $fn";
  125:       }
  126:       
  127:       # let's try iso-2022-jp first
  128:       $decoder = Encode::Guess->guess($data, 'iso-2022-jp');
  129:       if (ref($decoder)) {
  130:         $decoded = $decoder->decode($data);
  131:         if ($warnings) {
  132:           print "; using iso-2022-jp\n";
  133:         }
  134:       } else {
  135:         # NOTE: cp1252 is identical to iso-8859-1 but with additionnal characters in range 128-159
  136:         # instead of control codes. We can assume that these control codes are not used, so there
  137:         # is no need to test for iso-8859-1.
  138:         # The main problem here is to distinguish between cp1252 and MacRoman.
  139:         # see http://www.alanwood.net/demos/charsetdiffs.html#f
  140:         my $decoded_windows = decode('cp1252', $data);
  141:         my $decoded_mac = decode('MacRoman', $data);
  142:         # try to use frequent non-ASCII characters to distinguish the encodings (languages: mostly German, Spanish, Portuguese)
  143:         # í has been removed because it conflicts with ’ and ’ is more frequent
  144:         # ± has been removed because it is, suprisingly, the same code in both encodings !
  145:         my $score_windows = $decoded_windows =~ tr/ßáàäâãçéèêëñóöôõúüÄÉÑÖÜ¿¡‘’“” °½–—…§//;
  146:         my $score_mac = $decoded_mac =~ tr/ßáàäâãçéèêëñóöôõúüÄÉÑÖÜ¿¡‘’“” °½–—…§//;
  147:         # check newlines too (\r on MacOS < X, \r\n on Windows)
  148:         my $ind_cr = index($data, "\r");
  149:         if ($ind_cr != -1) {
  150:           if (substr($data, $ind_cr + 1, 1) eq "\n") {
  151:             $score_windows++;
  152:           } else {
  153:             $score_mac++;
  154:           }
  155:         }
  156:         if ($score_windows >= $score_mac) {
  157:           $decoded = $decoded_windows;
  158:           if ($warnings) {
  159:             print "; guess=cp1252 ($score_windows cp1252 >= $score_mac MacRoman)\n";
  160:           }
  161:         } else {
  162:           if ($warnings) {
  163:             print "; guess=MacRoman ($score_mac MacRoman > $score_windows cp1252)\n";
  164:           }
  165:           $decoded = $decoded_mac;
  166:         }
  167:       }
  168:     }
  169:   } else {
  170:     $decoded = '';
  171:   }
  172:   my @lines = split(/^/m, $decoded);
  173:   return \@lines;
  174: }
  175: 
  176: 
  177: ##
  178: # Removes some control characters
  179: # @param {Array<string>} lines
  180: ##
  181: sub remove_control_characters {
  182:   my ($lines) = @_;
  183:   foreach my $line (@{$lines}) {
  184:     $line =~ s/[\x00-\x07\x0B\x0C\x0E-\x1F]//g;
  185:     $line =~ s/&#[0-7];//g;
  186:     $line =~ s/&#1[4-9];//g;
  187:     $line =~ s/&#2[0-9];//g;
  188:   }
  189: }
  190: 
  191: ##
  192: # Replaces < and > characters by &lt; and &gt; in cdata elements (listed in @cdata_elements).
  193: # EXCEPT for answer when it's inside numericalresponse or formularesponse.
  194: # @param {Array<string>} lines
  195: ##
  196: sub fix_cdata_elements {
  197:   my ($lines) = @_;
  198:   my $i = 0;
  199:   my $j = 0;
  200:   my $tag = '';
  201:   my $type;
  202:   my $in_numericalresponse = 0;
  203:   my $in_formularesponse = 0;
  204:   my $in_script = 0;
  205:   ($tag, $type, $i, $j) = next_tag($lines, $i, $j);
  206:   while ($tag ne '') {
  207:     if ($tag eq 'numericalresponse') {
  208:       if ($type eq 'start') {
  209:         $in_numericalresponse = 1;
  210:       } else {
  211:         $in_numericalresponse = 0;
  212:       }
  213:     } elsif ($tag eq 'formularesponse') {
  214:       if ($type eq 'start') {
  215:         $in_formularesponse = 1;
  216:       } else {
  217:         $in_formularesponse = 0;
  218:       }
  219:     } elsif ($tag eq 'script') {
  220:       if ($type eq 'start') {
  221:         $in_script = 1;
  222:       } else {
  223:         $in_script = 0;
  224:       }
  225:     }
  226:     if ($type eq 'start' && in_array_ignore_case(\@cdata_elements, $tag) && !$in_script &&
  227:         ($tag ne 'answer' || (!$in_numericalresponse && !$in_formularesponse))) {
  228:       my $cde = $tag;
  229:       my $line = $lines->[$i];
  230:       $j = index($line, '>', $j+1) + 1;
  231:       my $stop = 0;
  232:       while (!$stop && $i < scalar(@{$lines})) {
  233:         my $indinf = index($line, '<', $j);
  234:         if ($indinf != -1 && index($line, '<![CDATA[', $indinf) == $indinf) {
  235:           $i++;
  236:           $line = $lines->[$i];
  237:           $j = 0;
  238:           last;
  239:         }
  240:         my $indsup = index($line, '>', $j);
  241:         if ($indinf != -1 && $indsup != -1 && $indinf < $indsup) {
  242:           my $test = substr($line, $indinf + 1, $indsup - ($indinf + 1));
  243:           $test =~ s/^\s+|\s+$//g ;
  244:           if ($test eq '/'.$cde) {
  245:             $stop = 1;
  246:             $j = $indsup;
  247:           # this is commented because of markup like <display>&web(' ','','<p>')</display>
  248:           #} elsif ($test =~ /^[a-zA-Z\/]$/) {
  249:           #  $j = $indsup + 1;
  250:           } else {
  251:             $line = substr($line, 0, $indinf).'&lt;'.substr($line, $indinf+1);
  252:             $lines->[$i] = $line;
  253:           }
  254:         } elsif ($indinf != -1 && $indsup == -1) {
  255:           $line = substr($line, 0, $indinf).'&lt;'.substr($line, $indinf+1);
  256:           $lines->[$i] = $line;
  257:         } elsif ($indsup != -1 && ($indinf == -1 || $indsup < $indinf)) {
  258:           $line = substr($line, 0, $indsup).'&gt;'.substr($line, $indsup+1);
  259:           $lines->[$i] = $line;
  260:         } else {
  261:           $i++;
  262:           $line = $lines->[$i];
  263:           $j = 0;
  264:         }
  265:       }
  266:     }
  267:     $j++;
  268:     ($tag, $type, $i, $j) = next_tag($lines, $i, $j);
  269:   }
  270: }
  271: 
  272: 
  273: ##
  274: # Replaces HTML entities (they are not XML unless a DTD is used, which is no longer recommanded for XHTML).
  275: # @param {Array<string>} lines
  276: ##
  277: sub fix_html_entities {
  278:   my ($lines) = @_;
  279:   foreach my $line (@{$lines}) {
  280:     # html_to_xml is converting named entities before 255 (see HTML parser dtext)
  281:     # Assuming Windows encoding (Unicode entities are not before 160 and are the same between 160 and 255):
  282:     $line =~ s/&#128;|&#x80;/€/g;
  283:     $line =~ s/&#130;|&#x82;/‚/g;
  284:     $line =~ s/&#132;|&#x84;/„/g;
  285:     $line =~ s/&#133;|&#x85;/…/g;
  286:     $line =~ s/&#134;|&#x86;/†/g;
  287:     $line =~ s/&#135;|&#x87;/‡/g;
  288:     $line =~ s/&#136;|&#x88;/ˆ/g;
  289:     $line =~ s/&#137;|&#x89;/‰/g;
  290:     $line =~ s/&#139;|&#x8B;/‹/g;
  291:     $line =~ s/&#145;|&#x91;/‘/g;
  292:     $line =~ s/&#146;|&#x92;/’/g;
  293:     $line =~ s/&#147;|&#x93;/“/g;
  294:     $line =~ s/&#148;|&#x94;/”/g;
  295:     $line =~ s/&#149;|&#x95;/•/g;
  296:     $line =~ s/&#150;|&#x96;/–/g;
  297:     $line =~ s/&#151;|&#x97;/—/g;
  298:     $line =~ s/&#152;|&#x98;/˜/g;
  299:     $line =~ s/&#153;|&#x99;/™/g;
  300:     $line =~ s/&#155;|&#x9B;/›/g;
  301:     $line =~ s/&#156;|&#x9C;/œ/g;
  302:   }
  303: }
  304: 
  305: 
  306: # Tries to fix things like <font color="#990000" face="Verdana,>
  307: # without breaking <a b="c>d">
  308: # This is only fixing tags when there is a single tag in a line (it is impossible to fix in the general case).
  309: # Also transforms <a b="c> <d e=" into <a b="c"><d e=" ,
  310: # and (no markup before)<a b="c> (no quote after) into <a b="c"> .
  311: sub fix_missing_quotes {
  312:   my ($lines) = @_;
  313:   foreach my $line (@{$lines}) {
  314:     my $n_inf = $line =~ tr/<//;
  315:     my $n_sup = $line =~ tr/>//;
  316:     if ($n_inf == 1 && $n_sup == 1) {
  317:       my $ind_inf = index($line, '<');
  318:       my $ind_sup = index($line, '>');
  319:       if ($ind_inf != -1 && $ind_sup != -1 && $ind_inf < $ind_sup) {
  320:         my $n_quotes = substr($line, $ind_inf, $ind_sup) =~ tr/"//;
  321:         if ($n_quotes % 2 != 0) {
  322:           # add a quote before > when there is an odd number of quotes inside <>
  323:           $line =~ s/>/">/;
  324:         }
  325:       }
  326:     }
  327:     $line =~ s/(<[a-zA-Z]+ [a-zA-Z]+="[^"<>\s]+)(>\s*<[a-zA-Z]+ [a-zA-Z]+=")/$1"$2/;
  328:     $line =~ s/^([^"<>]*<[a-zA-Z]+ [a-zA-Z]+="[^"<>\s]+)(>[^"]*)$/$1"$2/;
  329:   }
  330: }
  331: 
  332: 
  333: # Replaces <li/> by <li> (the end tag will be added in html_to_xml
  334: sub fix_empty_li {
  335:   my ($lines) = @_;
  336:   foreach my $line (@{$lines}) {
  337:     $line =~ s/<li\s?\/>/<li>/;
  338:   }
  339: }
  340: 
  341: 
  342: # remove doctypes, without assuming they are at the beginning
  343: sub remove_doctype {
  344:   my ($lines) = @_;
  345:   foreach my $line (@{$lines}) {
  346:     $line =~ s/<!DOCTYPE[^>]*>//;
  347:   }
  348: }
  349: 
  350: 
  351: # Adds a problem, library or html root element, enclosing things outside of the problem element.
  352: # (any extra root element will be removed in post_xml, but this ensures one is added as root if missing).
  353: sub add_root {
  354:   my ($lines, $filepath) = @_;
  355:   my $root_name;
  356:   if ($filepath =~ /\.library$/i) {
  357:     $root_name = 'library';
  358:   } elsif ($filepath =~ /\.html?$/i) {
  359:     $root_name = 'html';
  360:   } else {
  361:     $root_name = 'problem';
  362:   }
  363:   if ($root_name eq 'library') {
  364:     foreach my $line (@{$lines}) {
  365:       if ($line =~ /^\s*<[a-z]/) {
  366:         last;
  367:       }
  368:       if ($line !~ /^\s*$/) {
  369:         die "this library does not start with a tag, it might be a scriptlib";
  370:       }
  371:     }
  372:   }
  373:   my $line1 = $lines->[0];
  374:   $line1 =~ s/<\?.*\?>//; # remove any PI, it would cause problems later anyway
  375:   $line1 = "<$root_name>".$line1;
  376:   $lines->[0] = $line1;
  377:   $lines->[scalar(@$lines)-1] = $lines->[scalar(@$lines)-1]."</$root_name>";
  378: }
  379: 
  380: 
  381: ##
  382: # Returns information about the next tag, starting at line number and char number.
  383: # Assumes the markup is well-formed and there is no CDATA,
  384: # which is not always true (like inside script), so results might be wrong sometimes.
  385: # It is however useful to avoid unnecessary changes in the document (using a parser to
  386: # do read/write for the whole document would mess up non well-formed documents).
  387: # @param {Array<string>} lines
  388: # @param {int} line_number - line number to start at
  389: # @param {int} char_number - char number to start at on the line
  390: # @returns (tag, type, line_number, char_number)
  391: ##
  392: sub next_tag {
  393:   my ($lines, $i, $j ) = @_;
  394:   my $i2 = $i;
  395:   my $j2 = $j;
  396:   while ($i2 < scalar(@{$lines})) {
  397:     my $line = $lines->[$i2];
  398:     $j2 = index($line, '<', $j2);
  399:     #TODO: handle comments
  400:     while ($j2 != -1) {
  401:       my $ind_slash = index($line, '/', $j2);
  402:       my $ind_sup = index($line, '>', $j2);
  403:       my $ind_space = index($line, ' ', $j2);
  404:       my $type;
  405:       my $tag;
  406:       if ($ind_slash == $j2 + 1 && $ind_sup != -1) {
  407:         $type = 'end';
  408:         $tag = substr($line, $j2 + 2, $ind_sup - ($j2 + 2));
  409:       } elsif ($ind_slash != -1 && $ind_sup != -1 && $ind_slash == $ind_sup - 1) {
  410:         $type = 'empty';
  411:         if ($ind_space != -1 && $ind_space < $ind_sup) {
  412:           $tag = substr($line, $j2 + 1, $ind_space - ($j2 + 1));
  413:         } else {
  414:           $tag = substr($line, $j2 + 1, $ind_slash - ($j2 + 1));
  415:         }
  416:       } elsif ($ind_sup != -1) {
  417:         $type = 'start';
  418:         if ($ind_space != -1 && $ind_space < $ind_sup) {
  419:           $tag = substr($line, $j2 + 1, $ind_space - ($j2 + 1));
  420:         } else {
  421:           $tag = substr($line, $j2 + 1, $ind_sup - ($j2 + 1));
  422:         }
  423:       } else {
  424:         $tag = ''
  425:       }
  426:       if ($tag ne '') {
  427:         return ($tag, $type, $i2, $j2);
  428:       }
  429:       $j2 = index($line, '<', $j2 + 1);
  430:     }
  431:     $i2++;
  432:     $j2 = 0;
  433:   }
  434:   return ('', '', 0, 0);
  435: }
  436: 
  437: ##
  438: # Tests if a string is in an array, ignoring case
  439: ##
  440: sub in_array_ignore_case {
  441:   my ($array, $value) = @_;
  442:   my $lcvalue = lc($value);
  443:   foreach my $v (@{$array}) {
  444:     if (lc($v) eq $lcvalue) {
  445:       return 1;
  446:     }
  447:   }
  448:   return 0;
  449: }
  450: 
  451: 1;
  452: __END__

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>