loncom/homework/cleanxml/pre_xml.pm - view

File: [LON-CAPA] / loncom / homework / cleanxml / pre_xml.pm
Revision 1.1: download - view: text, annotated - select for diffs
Thu Dec 3 20:40:31 2015 UTC (8 years, 4 months ago) by damieng
Branches: MAIN
CVS tags: HEAD

integrated Daxe, opening in a separate window for now

1: # The LearningOnline Network 2: # First step to clean a file. 3: # 4: # $Id: pre_xml.pm,v 1.1 2015/12/03 20:40:31 damieng Exp $ 5: # 6: # Copyright Michigan State University Board of Trustees 7: # 8: # This file is part of the LearningOnline Network with CAPA (LON-CAPA). 9: # 10: # LON-CAPA is free software; you can redistribute it and/or modify 11: # it under the terms of the GNU General Public License as published by 12: # the Free Software Foundation; either version 2 of the License, or 13: # (at your option) any later version. 14: # 15: # LON-CAPA is distributed in the hope that it will be useful, 16: # but WITHOUT ANY WARRANTY; without even the implied warranty of 17: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18: # GNU General Public License for more details. 19: # 20: # You should have received a copy of the GNU General Public License 21: # along with LON-CAPA; if not, write to the Free Software 22: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 23: # 24: # /home/httpd/html/adm/gpl.txt 25: # 26: # http://www.lon-capa.org/ 27: # 28: ### 29: 30: #!/usr/bin/perl 31: 32: package Apache::pre_xml; 33: 34: use strict; 35: use utf8; 36: 37: use Encode; 38: use Encode::Byte; 39: use Encode::Guess; 40: 41: # list of elements inside which < and > might not be turned into entities 42: # unfortunately, answer can sometimes contain the elements vector and value... 43: my @cdata_elements = ('answer', 'm', 'display', 'parse'); # not script because the HTML parser will handle it 44: 45: my $warnings; # 1 = print warnings 46: 47: # Reads a LON-CAPA 2 file, guesses the encoding, fixes characters in cdata_elements, fixes HTML entities, 48: # and returns the converted text. 49: sub pre_xml { 50: my ($filepath, $warn) = @_; 51: $warnings = $warn; 52: 53: my $lines = guess_encoding_and_read($filepath); 54: 55: remove_control_characters($lines); 56: 57: fix_cdata_elements($lines); 58: 59: fix_html_entities($lines); 60: 61: fix_missing_quotes($lines); 62: 63: fix_empty_li($lines); 64: 65: remove_doctype($lines); 66: 67: add_root($lines, $filepath); 68: 69: return(\join('', @$lines)); 70: } 71: 72: 73: ## 74: # Tries to guess the character encoding, and returns the lines as decoded text. 75: # Requires Encode::Byte. 76: ## 77: sub guess_encoding_and_read { 78: my ($fn) = @_; 79: no warnings "utf8"; 80: local $/ = undef; 81: open(my $fh, "<", $fn) or die "cannot read $fn: $!"; 82: binmode $fh; 83: my $data = <$fh>; # we need to read the whole file to test if font is a block or inline element 84: close $fh; 85: 86: if (index($data, '<') == -1) { 87: die "This file has no markup !"; 88: } 89: 90: # try to get a charset from a meta at the beginning of the file 91: my $beginning = substr($data, 0, 1024); # to avoid a full match; hopefully we won't cut the charset in half 92: if ($beginning =~ /<meta[^>]*charset\s?=\s?([^\n>"';]*)/i) { 93: my $meta_charset = $1; 94: if ($meta_charset ne '') { 95: if ($meta_charset =~ /iso-?8859-?1/i) { 96: # usually a lie 97: $meta_charset = 'cp1252'; 98: } 99: # now try to decode using that encoding 100: my $decoder = guess_encoding($data, ($meta_charset)); 101: if (ref($decoder)) { 102: my $decoded = $decoder->decode($data); 103: my @lines = split(/^/m, $decoded); 104: return \@lines; 105: } else { 106: if ($warnings) { 107: print "Warning: decoding did not work with the charset defined by the meta ($meta_charset)\n"; 108: } 109: } 110: } 111: } 112: 113: my $decoded; 114: if (length($data) > 0) { 115: # NOTE: this list is too ambigous, Encode::Guess refuses to even try a guess 116: #Encode::Guess->set_suspects(qw/ascii UTF-8 iso-8859-1 MacRoman cp1252/); 117: # by default Encode::Guess uses ascii, utf8 and UTF-16/32 with BOM 118: my $decoder = Encode::Guess->guess($data); 119: if (ref($decoder)) { 120: $decoded = $decoder->decode($data); 121: # NOTE: this seems to accept binary files sometimes (conversion will fail later because it is not really UTF-8) 122: } else { 123: if ($warnings) { 124: print "Warning: encoding is not UTF-8 for $fn"; 125: } 126: 127: # let's try iso-2022-jp first 128: $decoder = Encode::Guess->guess($data, 'iso-2022-jp'); 129: if (ref($decoder)) { 130: $decoded = $decoder->decode($data); 131: if ($warnings) { 132: print "; using iso-2022-jp\n"; 133: } 134: } else { 135: # NOTE: cp1252 is identical to iso-8859-1 but with additionnal characters in range 128-159 136: # instead of control codes. We can assume that these control codes are not used, so there 137: # is no need to test for iso-8859-1. 138: # The main problem here is to distinguish between cp1252 and MacRoman. 139: # see http://www.alanwood.net/demos/charsetdiffs.html#f 140: my $decoded_windows = decode('cp1252', $data); 141: my $decoded_mac = decode('MacRoman', $data); 142: # try to use frequent non-ASCII characters to distinguish the encodings (languages: mostly German, Spanish, Portuguese) 143: # í has been removed because it conflicts with ’ and ’ is more frequent 144: # ± has been removed because it is, suprisingly, the same code in both encodings ! 145: my $score_windows = $decoded_windows =~ tr/ßáàäâãçéèêëñóöôõúüÄÉÑÖÜ¿¡‘’“” °½–—…§//; 146: my $score_mac = $decoded_mac =~ tr/ßáàäâãçéèêëñóöôõúüÄÉÑÖÜ¿¡‘’“” °½–—…§//; 147: # check newlines too (\r on MacOS < X, \r\n on Windows) 148: my $ind_cr = index($data, "\r"); 149: if ($ind_cr != -1) { 150: if (substr($data, $ind_cr + 1, 1) eq "\n") { 151: $score_windows++; 152: } else { 153: $score_mac++; 154: } 155: } 156: if ($score_windows >= $score_mac) { 157: $decoded = $decoded_windows; 158: if ($warnings) { 159: print "; guess=cp1252 ($score_windows cp1252 >= $score_mac MacRoman)\n"; 160: } 161: } else { 162: if ($warnings) { 163: print "; guess=MacRoman ($score_mac MacRoman > $score_windows cp1252)\n"; 164: } 165: $decoded = $decoded_mac; 166: } 167: } 168: } 169: } else { 170: $decoded = ''; 171: } 172: my @lines = split(/^/m, $decoded); 173: return \@lines; 174: } 175: 176: 177: ## 178: # Removes some control characters 179: # @param {Array<string>} lines 180: ## 181: sub remove_control_characters { 182: my ($lines) = @_; 183: foreach my $line (@{$lines}) { 184: $line =~ s/[\x00-\x07\x0B\x0C\x0E-\x1F]//g; 185: $line =~ s/&#[0-7];//g; 186: $line =~ s/&#1[4-9];//g; 187: $line =~ s/&#2[0-9];//g; 188: } 189: } 190: 191: ## 192: # Replaces < and > characters by < and > in cdata elements (listed in @cdata_elements). 193: # EXCEPT for answer when it's inside numericalresponse or formularesponse. 194: # @param {Array<string>} lines 195: ## 196: sub fix_cdata_elements { 197: my ($lines) = @_; 198: my $i = 0; 199: my $j = 0; 200: my $tag = ''; 201: my $type; 202: my $in_numericalresponse = 0; 203: my $in_formularesponse = 0; 204: my $in_script = 0; 205: ($tag, $type, $i, $j) = next_tag($lines, $i, $j); 206: while ($tag ne '') { 207: if ($tag eq 'numericalresponse') { 208: if ($type eq 'start') { 209: $in_numericalresponse = 1; 210: } else { 211: $in_numericalresponse = 0; 212: } 213: } elsif ($tag eq 'formularesponse') { 214: if ($type eq 'start') { 215: $in_formularesponse = 1; 216: } else { 217: $in_formularesponse = 0; 218: } 219: } elsif ($tag eq 'script') { 220: if ($type eq 'start') { 221: $in_script = 1; 222: } else { 223: $in_script = 0; 224: } 225: } 226: if ($type eq 'start' && in_array_ignore_case(\@cdata_elements, $tag) && !$in_script && 227: ($tag ne 'answer' || (!$in_numericalresponse && !$in_formularesponse))) { 228: my $cde = $tag; 229: my $line = $lines->[$i]; 230: $j = index($line, '>', $j+1) + 1; 231: my $stop = 0; 232: while (!$stop && $i < scalar(@{$lines})) { 233: my $indinf = index($line, '<', $j); 234: if ($indinf != -1 && index($line, '<![CDATA[', $indinf) == $indinf) { 235: $i++; 236: $line = $lines->[$i]; 237: $j = 0; 238: last; 239: } 240: my $indsup = index($line, '>', $j); 241: if ($indinf != -1 && $indsup != -1 && $indinf < $indsup) { 242: my $test = substr($line, $indinf + 1, $indsup - ($indinf + 1)); 243: $test =~ s/^\s+|\s+$//g ; 244: if ($test eq '/'.$cde) { 245: $stop = 1; 246: $j = $indsup; 247: # this is commented because of markup like <display>&web(' ','','<p>')</display> 248: #} elsif ($test =~ /^[a-zA-Z\/]$/) { 249: # $j = $indsup + 1; 250: } else { 251: $line = substr($line, 0, $indinf).'<'.substr($line, $indinf+1); 252: $lines->[$i] = $line; 253: } 254: } elsif ($indinf != -1 && $indsup == -1) { 255: $line = substr($line, 0, $indinf).'<'.substr($line, $indinf+1); 256: $lines->[$i] = $line; 257: } elsif ($indsup != -1 && ($indinf == -1 || $indsup < $indinf)) { 258: $line = substr($line, 0, $indsup).'>'.substr($line, $indsup+1); 259: $lines->[$i] = $line; 260: } else { 261: $i++; 262: $line = $lines->[$i]; 263: $j = 0; 264: } 265: } 266: } 267: $j++; 268: ($tag, $type, $i, $j) = next_tag($lines, $i, $j); 269: } 270: } 271: 272: 273: ## 274: # Replaces HTML entities (they are not XML unless a DTD is used, which is no longer recommanded for XHTML). 275: # @param {Array<string>} lines 276: ## 277: sub fix_html_entities { 278: my ($lines) = @_; 279: foreach my $line (@{$lines}) { 280: # html_to_xml is converting named entities before 255 (see HTML parser dtext) 281: # Assuming Windows encoding (Unicode entities are not before 160 and are the same between 160 and 255): 282: $line =~ s/|/€/g; 283: $line =~ s/|/‚/g; 284: $line =~ s/|/„/g; 285: $line =~ s/|/…/g; 286: $line =~ s/|/†/g; 287: $line =~ s/|/‡/g; 288: $line =~ s/|/ˆ/g; 289: $line =~ s/|/‰/g; 290: $line =~ s/|/‹/g; 291: $line =~ s/|/‘/g; 292: $line =~ s/|/’/g; 293: $line =~ s/|/“/g; 294: $line =~ s/|/”/g; 295: $line =~ s/|/•/g; 296: $line =~ s/|/–/g; 297: $line =~ s/|/—/g; 298: $line =~ s/|/˜/g; 299: $line =~ s/|/™/g; 300: $line =~ s/|/›/g; 301: $line =~ s/|/œ/g; 302: } 303: } 304: 305: 306: # Tries to fix things like <font color="#990000" face="Verdana,> 307: # without breaking <a b="c>d"> 308: # This is only fixing tags when there is a single tag in a line (it is impossible to fix in the general case). 309: # Also transforms <a b="c> <d e=" into <a b="c"><d e=" , 310: # and (no markup before)<a b="c> (no quote after) into <a b="c"> . 311: sub fix_missing_quotes { 312: my ($lines) = @_; 313: foreach my $line (@{$lines}) { 314: my $n_inf = $line =~ tr/<//; 315: my $n_sup = $line =~ tr/>//; 316: if ($n_inf == 1 && $n_sup == 1) { 317: my $ind_inf = index($line, '<'); 318: my $ind_sup = index($line, '>'); 319: if ($ind_inf != -1 && $ind_sup != -1 && $ind_inf < $ind_sup) { 320: my $n_quotes = substr($line, $ind_inf, $ind_sup) =~ tr/"//; 321: if ($n_quotes % 2 != 0) { 322: # add a quote before > when there is an odd number of quotes inside <> 323: $line =~ s/>/">/; 324: } 325: } 326: } 327: $line =~ s/(<[a-zA-Z]+ [a-zA-Z]+="[^"<>\s]+)(>\s*<[a-zA-Z]+ [a-zA-Z]+=")/$1"$2/; 328: $line =~ s/^([^"<>]*<[a-zA-Z]+ [a-zA-Z]+="[^"<>\s]+)(>[^"]*)$/$1"$2/; 329: } 330: } 331: 332: 333: # Replaces <li/> by <li> (the end tag will be added in html_to_xml 334: sub fix_empty_li { 335: my ($lines) = @_; 336: foreach my $line (@{$lines}) { 337: $line =~ s/<li\s?\/>/<li>/; 338: } 339: } 340: 341: 342: # remove doctypes, without assuming they are at the beginning 343: sub remove_doctype { 344: my ($lines) = @_; 345: foreach my $line (@{$lines}) { 346: $line =~ s/<!DOCTYPE[^>]*>//; 347: } 348: } 349: 350: 351: # Adds a problem, library or html root element, enclosing things outside of the problem element. 352: # (any extra root element will be removed in post_xml, but this ensures one is added as root if missing). 353: sub add_root { 354: my ($lines, $filepath) = @_; 355: my $root_name; 356: if ($filepath =~ /\.library$/i) { 357: $root_name = 'library'; 358: } elsif ($filepath =~ /\.html?$/i) { 359: $root_name = 'html'; 360: } else { 361: $root_name = 'problem'; 362: } 363: if ($root_name eq 'library') { 364: foreach my $line (@{$lines}) { 365: if ($line =~ /^\s*<[a-z]/) { 366: last; 367: } 368: if ($line !~ /^\s*$/) { 369: die "this library does not start with a tag, it might be a scriptlib"; 370: } 371: } 372: } 373: my $line1 = $lines->[0]; 374: $line1 =~ s/<\?.*\?>//; # remove any PI, it would cause problems later anyway 375: $line1 = "<$root_name>".$line1; 376: $lines->[0] = $line1; 377: $lines->[scalar(@$lines)-1] = $lines->[scalar(@$lines)-1]."</$root_name>"; 378: } 379: 380: 381: ## 382: # Returns information about the next tag, starting at line number and char number. 383: # Assumes the markup is well-formed and there is no CDATA, 384: # which is not always true (like inside script), so results might be wrong sometimes. 385: # It is however useful to avoid unnecessary changes in the document (using a parser to 386: # do read/write for the whole document would mess up non well-formed documents). 387: # @param {Array<string>} lines 388: # @param {int} line_number - line number to start at 389: # @param {int} char_number - char number to start at on the line 390: # @returns (tag, type, line_number, char_number) 391: ## 392: sub next_tag { 393: my ($lines, $i, $j ) = @_; 394: my $i2 = $i; 395: my $j2 = $j; 396: while ($i2 < scalar(@{$lines})) { 397: my $line = $lines->[$i2]; 398: $j2 = index($line, '<', $j2); 399: #TODO: handle comments 400: while ($j2 != -1) { 401: my $ind_slash = index($line, '/', $j2); 402: my $ind_sup = index($line, '>', $j2); 403: my $ind_space = index($line, ' ', $j2); 404: my $type; 405: my $tag; 406: if ($ind_slash == $j2 + 1 && $ind_sup != -1) { 407: $type = 'end'; 408: $tag = substr($line, $j2 + 2, $ind_sup - ($j2 + 2)); 409: } elsif ($ind_slash != -1 && $ind_sup != -1 && $ind_slash == $ind_sup - 1) { 410: $type = 'empty'; 411: if ($ind_space != -1 && $ind_space < $ind_sup) { 412: $tag = substr($line, $j2 + 1, $ind_space - ($j2 + 1)); 413: } else { 414: $tag = substr($line, $j2 + 1, $ind_slash - ($j2 + 1)); 415: } 416: } elsif ($ind_sup != -1) { 417: $type = 'start'; 418: if ($ind_space != -1 && $ind_space < $ind_sup) { 419: $tag = substr($line, $j2 + 1, $ind_space - ($j2 + 1)); 420: } else { 421: $tag = substr($line, $j2 + 1, $ind_sup - ($j2 + 1)); 422: } 423: } else { 424: $tag = '' 425: } 426: if ($tag ne '') { 427: return ($tag, $type, $i2, $j2); 428: } 429: $j2 = index($line, '<', $j2 + 1); 430: } 431: $i2++; 432: $j2 = 0; 433: } 434: return ('', '', 0, 0); 435: } 436: 437: ## 438: # Tests if a string is in an array, ignoring case 439: ## 440: sub in_array_ignore_case { 441: my ($array, $value) = @_; 442: my $lcvalue = lc($value); 443: foreach my $v (@{$array}) { 444: if (lc($v) eq $lcvalue) { 445: return 1; 446: } 447: } 448: return 0; 449: } 450: 451: 1; 452: __END__