Annotation of loncom/homework/cleanxml/pre_xml.pm, revision 1.1

1.1     ! damieng     1: # The LearningOnline Network
        !             2: # First step to clean a file.
        !             3: #
        !             4: # $Id$
        !             5: #
        !             6: # Copyright Michigan State University Board of Trustees
        !             7: #
        !             8: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
        !             9: #
        !            10: # LON-CAPA is free software; you can redistribute it and/or modify
        !            11: # it under the terms of the GNU General Public License as published by
        !            12: # the Free Software Foundation; either version 2 of the License, or
        !            13: # (at your option) any later version.
        !            14: #
        !            15: # LON-CAPA is distributed in the hope that it will be useful,
        !            16: # but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            17: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
        !            18: # GNU General Public License for more details.
        !            19: #
        !            20: # You should have received a copy of the GNU General Public License
        !            21: # along with LON-CAPA; if not, write to the Free Software
        !            22: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
        !            23: #
        !            24: # /home/httpd/html/adm/gpl.txt
        !            25: #
        !            26: # http://www.lon-capa.org/
        !            27: #
        !            28: ###
        !            29: 
        !            30: #!/usr/bin/perl
        !            31: 
        !            32: package Apache::pre_xml;
        !            33: 
        !            34: use strict;
        !            35: use utf8;
        !            36: 
        !            37: use Encode;
        !            38: use Encode::Byte;
        !            39: use Encode::Guess;
        !            40: 
        !            41: # list of elements inside which < and > might not be turned into entities
        !            42: # unfortunately, answer can sometimes contain the elements vector and value...
        !            43: my @cdata_elements = ('answer', 'm', 'display', 'parse'); # not script because the HTML parser will handle it
        !            44: 
        !            45: my $warnings; # 1 = print warnings
        !            46: 
        !            47: # Reads a LON-CAPA 2 file, guesses the encoding, fixes characters in cdata_elements, fixes HTML entities,
        !            48: # and returns the converted text.
        !            49: sub pre_xml {
        !            50:   my ($filepath, $warn) = @_;
        !            51:   $warnings = $warn;
        !            52:   
        !            53:   my $lines = guess_encoding_and_read($filepath);
        !            54: 
        !            55:   remove_control_characters($lines);
        !            56:   
        !            57:   fix_cdata_elements($lines);
        !            58: 
        !            59:   fix_html_entities($lines);
        !            60:   
        !            61:   fix_missing_quotes($lines);
        !            62:   
        !            63:   fix_empty_li($lines);
        !            64:   
        !            65:   remove_doctype($lines);
        !            66:   
        !            67:   add_root($lines, $filepath);
        !            68:   
        !            69:   return(\join('', @$lines));
        !            70: }
        !            71: 
        !            72: 
        !            73: ##
        !            74: # Tries to guess the character encoding, and returns the lines as decoded text.
        !            75: # Requires Encode::Byte.
        !            76: ##
        !            77: sub guess_encoding_and_read {
        !            78:   my ($fn) = @_;
        !            79:   no warnings "utf8";
        !            80:   local $/ = undef;
        !            81:   open(my $fh, "<", $fn) or die "cannot read $fn: $!";
        !            82:   binmode $fh;
        !            83:   my $data = <$fh>; # we need to read the whole file to test if font is a block or inline element
        !            84:   close $fh;
        !            85:   
        !            86:   if (index($data, '<') == -1) {
        !            87:     die "This file has no markup !";
        !            88:   }
        !            89:   
        !            90:   # try to get a charset from a meta at the beginning of the file
        !            91:   my $beginning = substr($data, 0, 1024); # to avoid a full match; hopefully we won't cut the charset in half
        !            92:   if ($beginning =~ /<meta[^>]*charset\s?=\s?([^\n>"';]*)/i) {
        !            93:     my $meta_charset = $1;
        !            94:     if ($meta_charset ne '') {
        !            95:       if ($meta_charset =~ /iso-?8859-?1/i) {
        !            96:         # usually a lie
        !            97:         $meta_charset = 'cp1252';
        !            98:       }
        !            99:       # now try to decode using that encoding
        !           100:       my $decoder = guess_encoding($data, ($meta_charset));
        !           101:       if (ref($decoder)) {
        !           102:         my $decoded = $decoder->decode($data);
        !           103:         my @lines = split(/^/m, $decoded);
        !           104:         return \@lines;
        !           105:       } else {
        !           106:         if ($warnings) {
        !           107:           print "Warning: decoding did not work with the charset defined by the meta ($meta_charset)\n";
        !           108:         }
        !           109:       }
        !           110:     }
        !           111:   }
        !           112:   
        !           113:   my $decoded;
        !           114:   if (length($data) > 0) {
        !           115:     # NOTE: this list is too ambigous, Encode::Guess refuses to even try a guess
        !           116:     #Encode::Guess->set_suspects(qw/ascii UTF-8 iso-8859-1 MacRoman cp1252/);
        !           117:     # by default Encode::Guess uses ascii, utf8 and UTF-16/32 with BOM
        !           118:     my $decoder = Encode::Guess->guess($data);
        !           119:     if (ref($decoder)) {
        !           120:       $decoded = $decoder->decode($data);
        !           121:       # NOTE: this seems to accept binary files sometimes (conversion will fail later because it is not really UTF-8)
        !           122:     } else {
        !           123:       if ($warnings) {
        !           124:         print "Warning: encoding is not UTF-8 for $fn";
        !           125:       }
        !           126:       
        !           127:       # let's try iso-2022-jp first
        !           128:       $decoder = Encode::Guess->guess($data, 'iso-2022-jp');
        !           129:       if (ref($decoder)) {
        !           130:         $decoded = $decoder->decode($data);
        !           131:         if ($warnings) {
        !           132:           print "; using iso-2022-jp\n";
        !           133:         }
        !           134:       } else {
        !           135:         # NOTE: cp1252 is identical to iso-8859-1 but with additionnal characters in range 128-159
        !           136:         # instead of control codes. We can assume that these control codes are not used, so there
        !           137:         # is no need to test for iso-8859-1.
        !           138:         # The main problem here is to distinguish between cp1252 and MacRoman.
        !           139:         # see http://www.alanwood.net/demos/charsetdiffs.html#f
        !           140:         my $decoded_windows = decode('cp1252', $data);
        !           141:         my $decoded_mac = decode('MacRoman', $data);
        !           142:         # try to use frequent non-ASCII characters to distinguish the encodings (languages: mostly German, Spanish, Portuguese)
        !           143:         # í has been removed because it conflicts with ’ and ’ is more frequent
        !           144:         # ± has been removed because it is, suprisingly, the same code in both encodings !
        !           145:         my $score_windows = $decoded_windows =~ tr/ßáàäâãçéèêëñóöôõúüÄÉÑÖÜ¿¡‘’“” °½–—…§//;
        !           146:         my $score_mac = $decoded_mac =~ tr/ßáàäâãçéèêëñóöôõúüÄÉÑÖÜ¿¡‘’“” °½–—…§//;
        !           147:         # check newlines too (\r on MacOS < X, \r\n on Windows)
        !           148:         my $ind_cr = index($data, "\r");
        !           149:         if ($ind_cr != -1) {
        !           150:           if (substr($data, $ind_cr + 1, 1) eq "\n") {
        !           151:             $score_windows++;
        !           152:           } else {
        !           153:             $score_mac++;
        !           154:           }
        !           155:         }
        !           156:         if ($score_windows >= $score_mac) {
        !           157:           $decoded = $decoded_windows;
        !           158:           if ($warnings) {
        !           159:             print "; guess=cp1252 ($score_windows cp1252 >= $score_mac MacRoman)\n";
        !           160:           }
        !           161:         } else {
        !           162:           if ($warnings) {
        !           163:             print "; guess=MacRoman ($score_mac MacRoman > $score_windows cp1252)\n";
        !           164:           }
        !           165:           $decoded = $decoded_mac;
        !           166:         }
        !           167:       }
        !           168:     }
        !           169:   } else {
        !           170:     $decoded = '';
        !           171:   }
        !           172:   my @lines = split(/^/m, $decoded);
        !           173:   return \@lines;
        !           174: }
        !           175: 
        !           176: 
        !           177: ##
        !           178: # Removes some control characters
        !           179: # @param {Array<string>} lines
        !           180: ##
        !           181: sub remove_control_characters {
        !           182:   my ($lines) = @_;
        !           183:   foreach my $line (@{$lines}) {
        !           184:     $line =~ s/[\x00-\x07\x0B\x0C\x0E-\x1F]//g;
        !           185:     $line =~ s/&#[0-7];//g;
        !           186:     $line =~ s/&#1[4-9];//g;
        !           187:     $line =~ s/&#2[0-9];//g;
        !           188:   }
        !           189: }
        !           190: 
        !           191: ##
        !           192: # Replaces < and > characters by &lt; and &gt; in cdata elements (listed in @cdata_elements).
        !           193: # EXCEPT for answer when it's inside numericalresponse or formularesponse.
        !           194: # @param {Array<string>} lines
        !           195: ##
        !           196: sub fix_cdata_elements {
        !           197:   my ($lines) = @_;
        !           198:   my $i = 0;
        !           199:   my $j = 0;
        !           200:   my $tag = '';
        !           201:   my $type;
        !           202:   my $in_numericalresponse = 0;
        !           203:   my $in_formularesponse = 0;
        !           204:   my $in_script = 0;
        !           205:   ($tag, $type, $i, $j) = next_tag($lines, $i, $j);
        !           206:   while ($tag ne '') {
        !           207:     if ($tag eq 'numericalresponse') {
        !           208:       if ($type eq 'start') {
        !           209:         $in_numericalresponse = 1;
        !           210:       } else {
        !           211:         $in_numericalresponse = 0;
        !           212:       }
        !           213:     } elsif ($tag eq 'formularesponse') {
        !           214:       if ($type eq 'start') {
        !           215:         $in_formularesponse = 1;
        !           216:       } else {
        !           217:         $in_formularesponse = 0;
        !           218:       }
        !           219:     } elsif ($tag eq 'script') {
        !           220:       if ($type eq 'start') {
        !           221:         $in_script = 1;
        !           222:       } else {
        !           223:         $in_script = 0;
        !           224:       }
        !           225:     }
        !           226:     if ($type eq 'start' && in_array_ignore_case(\@cdata_elements, $tag) && !$in_script &&
        !           227:         ($tag ne 'answer' || (!$in_numericalresponse && !$in_formularesponse))) {
        !           228:       my $cde = $tag;
        !           229:       my $line = $lines->[$i];
        !           230:       $j = index($line, '>', $j+1) + 1;
        !           231:       my $stop = 0;
        !           232:       while (!$stop && $i < scalar(@{$lines})) {
        !           233:         my $indinf = index($line, '<', $j);
        !           234:         if ($indinf != -1 && index($line, '<![CDATA[', $indinf) == $indinf) {
        !           235:           $i++;
        !           236:           $line = $lines->[$i];
        !           237:           $j = 0;
        !           238:           last;
        !           239:         }
        !           240:         my $indsup = index($line, '>', $j);
        !           241:         if ($indinf != -1 && $indsup != -1 && $indinf < $indsup) {
        !           242:           my $test = substr($line, $indinf + 1, $indsup - ($indinf + 1));
        !           243:           $test =~ s/^\s+|\s+$//g ;
        !           244:           if ($test eq '/'.$cde) {
        !           245:             $stop = 1;
        !           246:             $j = $indsup;
        !           247:           # this is commented because of markup like <display>&web(' ','','<p>')</display>
        !           248:           #} elsif ($test =~ /^[a-zA-Z\/]$/) {
        !           249:           #  $j = $indsup + 1;
        !           250:           } else {
        !           251:             $line = substr($line, 0, $indinf).'&lt;'.substr($line, $indinf+1);
        !           252:             $lines->[$i] = $line;
        !           253:           }
        !           254:         } elsif ($indinf != -1 && $indsup == -1) {
        !           255:           $line = substr($line, 0, $indinf).'&lt;'.substr($line, $indinf+1);
        !           256:           $lines->[$i] = $line;
        !           257:         } elsif ($indsup != -1 && ($indinf == -1 || $indsup < $indinf)) {
        !           258:           $line = substr($line, 0, $indsup).'&gt;'.substr($line, $indsup+1);
        !           259:           $lines->[$i] = $line;
        !           260:         } else {
        !           261:           $i++;
        !           262:           $line = $lines->[$i];
        !           263:           $j = 0;
        !           264:         }
        !           265:       }
        !           266:     }
        !           267:     $j++;
        !           268:     ($tag, $type, $i, $j) = next_tag($lines, $i, $j);
        !           269:   }
        !           270: }
        !           271: 
        !           272: 
        !           273: ##
        !           274: # Replaces HTML entities (they are not XML unless a DTD is used, which is no longer recommanded for XHTML).
        !           275: # @param {Array<string>} lines
        !           276: ##
        !           277: sub fix_html_entities {
        !           278:   my ($lines) = @_;
        !           279:   foreach my $line (@{$lines}) {
        !           280:     # html_to_xml is converting named entities before 255 (see HTML parser dtext)
        !           281:     # Assuming Windows encoding (Unicode entities are not before 160 and are the same between 160 and 255):
        !           282:     $line =~ s/&#128;|&#x80;/€/g;
        !           283:     $line =~ s/&#130;|&#x82;/‚/g;
        !           284:     $line =~ s/&#132;|&#x84;/„/g;
        !           285:     $line =~ s/&#133;|&#x85;/…/g;
        !           286:     $line =~ s/&#134;|&#x86;/†/g;
        !           287:     $line =~ s/&#135;|&#x87;/‡/g;
        !           288:     $line =~ s/&#136;|&#x88;/ˆ/g;
        !           289:     $line =~ s/&#137;|&#x89;/‰/g;
        !           290:     $line =~ s/&#139;|&#x8B;/‹/g;
        !           291:     $line =~ s/&#145;|&#x91;/‘/g;
        !           292:     $line =~ s/&#146;|&#x92;/’/g;
        !           293:     $line =~ s/&#147;|&#x93;/“/g;
        !           294:     $line =~ s/&#148;|&#x94;/”/g;
        !           295:     $line =~ s/&#149;|&#x95;/•/g;
        !           296:     $line =~ s/&#150;|&#x96;/–/g;
        !           297:     $line =~ s/&#151;|&#x97;/—/g;
        !           298:     $line =~ s/&#152;|&#x98;/˜/g;
        !           299:     $line =~ s/&#153;|&#x99;/™/g;
        !           300:     $line =~ s/&#155;|&#x9B;/›/g;
        !           301:     $line =~ s/&#156;|&#x9C;/œ/g;
        !           302:   }
        !           303: }
        !           304: 
        !           305: 
        !           306: # Tries to fix things like <font color="#990000" face="Verdana,>
        !           307: # without breaking <a b="c>d">
        !           308: # This is only fixing tags when there is a single tag in a line (it is impossible to fix in the general case).
        !           309: # Also transforms <a b="c> <d e=" into <a b="c"><d e=" ,
        !           310: # and (no markup before)<a b="c> (no quote after) into <a b="c"> .
        !           311: sub fix_missing_quotes {
        !           312:   my ($lines) = @_;
        !           313:   foreach my $line (@{$lines}) {
        !           314:     my $n_inf = $line =~ tr/<//;
        !           315:     my $n_sup = $line =~ tr/>//;
        !           316:     if ($n_inf == 1 && $n_sup == 1) {
        !           317:       my $ind_inf = index($line, '<');
        !           318:       my $ind_sup = index($line, '>');
        !           319:       if ($ind_inf != -1 && $ind_sup != -1 && $ind_inf < $ind_sup) {
        !           320:         my $n_quotes = substr($line, $ind_inf, $ind_sup) =~ tr/"//;
        !           321:         if ($n_quotes % 2 != 0) {
        !           322:           # add a quote before > when there is an odd number of quotes inside <>
        !           323:           $line =~ s/>/">/;
        !           324:         }
        !           325:       }
        !           326:     }
        !           327:     $line =~ s/(<[a-zA-Z]+ [a-zA-Z]+="[^"<>\s]+)(>\s*<[a-zA-Z]+ [a-zA-Z]+=")/$1"$2/;
        !           328:     $line =~ s/^([^"<>]*<[a-zA-Z]+ [a-zA-Z]+="[^"<>\s]+)(>[^"]*)$/$1"$2/;
        !           329:   }
        !           330: }
        !           331: 
        !           332: 
        !           333: # Replaces <li/> by <li> (the end tag will be added in html_to_xml
        !           334: sub fix_empty_li {
        !           335:   my ($lines) = @_;
        !           336:   foreach my $line (@{$lines}) {
        !           337:     $line =~ s/<li\s?\/>/<li>/;
        !           338:   }
        !           339: }
        !           340: 
        !           341: 
        !           342: # remove doctypes, without assuming they are at the beginning
        !           343: sub remove_doctype {
        !           344:   my ($lines) = @_;
        !           345:   foreach my $line (@{$lines}) {
        !           346:     $line =~ s/<!DOCTYPE[^>]*>//;
        !           347:   }
        !           348: }
        !           349: 
        !           350: 
        !           351: # Adds a problem, library or html root element, enclosing things outside of the problem element.
        !           352: # (any extra root element will be removed in post_xml, but this ensures one is added as root if missing).
        !           353: sub add_root {
        !           354:   my ($lines, $filepath) = @_;
        !           355:   my $root_name;
        !           356:   if ($filepath =~ /\.library$/i) {
        !           357:     $root_name = 'library';
        !           358:   } elsif ($filepath =~ /\.html?$/i) {
        !           359:     $root_name = 'html';
        !           360:   } else {
        !           361:     $root_name = 'problem';
        !           362:   }
        !           363:   if ($root_name eq 'library') {
        !           364:     foreach my $line (@{$lines}) {
        !           365:       if ($line =~ /^\s*<[a-z]/) {
        !           366:         last;
        !           367:       }
        !           368:       if ($line !~ /^\s*$/) {
        !           369:         die "this library does not start with a tag, it might be a scriptlib";
        !           370:       }
        !           371:     }
        !           372:   }
        !           373:   my $line1 = $lines->[0];
        !           374:   $line1 =~ s/<\?.*\?>//; # remove any PI, it would cause problems later anyway
        !           375:   $line1 = "<$root_name>".$line1;
        !           376:   $lines->[0] = $line1;
        !           377:   $lines->[scalar(@$lines)-1] = $lines->[scalar(@$lines)-1]."</$root_name>";
        !           378: }
        !           379: 
        !           380: 
        !           381: ##
        !           382: # Returns information about the next tag, starting at line number and char number.
        !           383: # Assumes the markup is well-formed and there is no CDATA,
        !           384: # which is not always true (like inside script), so results might be wrong sometimes.
        !           385: # It is however useful to avoid unnecessary changes in the document (using a parser to
        !           386: # do read/write for the whole document would mess up non well-formed documents).
        !           387: # @param {Array<string>} lines
        !           388: # @param {int} line_number - line number to start at
        !           389: # @param {int} char_number - char number to start at on the line
        !           390: # @returns (tag, type, line_number, char_number)
        !           391: ##
        !           392: sub next_tag {
        !           393:   my ($lines, $i, $j ) = @_;
        !           394:   my $i2 = $i;
        !           395:   my $j2 = $j;
        !           396:   while ($i2 < scalar(@{$lines})) {
        !           397:     my $line = $lines->[$i2];
        !           398:     $j2 = index($line, '<', $j2);
        !           399:     #TODO: handle comments
        !           400:     while ($j2 != -1) {
        !           401:       my $ind_slash = index($line, '/', $j2);
        !           402:       my $ind_sup = index($line, '>', $j2);
        !           403:       my $ind_space = index($line, ' ', $j2);
        !           404:       my $type;
        !           405:       my $tag;
        !           406:       if ($ind_slash == $j2 + 1 && $ind_sup != -1) {
        !           407:         $type = 'end';
        !           408:         $tag = substr($line, $j2 + 2, $ind_sup - ($j2 + 2));
        !           409:       } elsif ($ind_slash != -1 && $ind_sup != -1 && $ind_slash == $ind_sup - 1) {
        !           410:         $type = 'empty';
        !           411:         if ($ind_space != -1 && $ind_space < $ind_sup) {
        !           412:           $tag = substr($line, $j2 + 1, $ind_space - ($j2 + 1));
        !           413:         } else {
        !           414:           $tag = substr($line, $j2 + 1, $ind_slash - ($j2 + 1));
        !           415:         }
        !           416:       } elsif ($ind_sup != -1) {
        !           417:         $type = 'start';
        !           418:         if ($ind_space != -1 && $ind_space < $ind_sup) {
        !           419:           $tag = substr($line, $j2 + 1, $ind_space - ($j2 + 1));
        !           420:         } else {
        !           421:           $tag = substr($line, $j2 + 1, $ind_sup - ($j2 + 1));
        !           422:         }
        !           423:       } else {
        !           424:         $tag = ''
        !           425:       }
        !           426:       if ($tag ne '') {
        !           427:         return ($tag, $type, $i2, $j2);
        !           428:       }
        !           429:       $j2 = index($line, '<', $j2 + 1);
        !           430:     }
        !           431:     $i2++;
        !           432:     $j2 = 0;
        !           433:   }
        !           434:   return ('', '', 0, 0);
        !           435: }
        !           436: 
        !           437: ##
        !           438: # Tests if a string is in an array, ignoring case
        !           439: ##
        !           440: sub in_array_ignore_case {
        !           441:   my ($array, $value) = @_;
        !           442:   my $lcvalue = lc($value);
        !           443:   foreach my $v (@{$array}) {
        !           444:     if (lc($v) eq $lcvalue) {
        !           445:       return 1;
        !           446:     }
        !           447:   }
        !           448:   return 0;
        !           449: }
        !           450: 
        !           451: 1;
        !           452: __END__

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>