loncom/homework/cleanxml/html_to_xml.pm - view

File: [LON-CAPA] / loncom / homework / cleanxml / html_to_xml.pm
Revision 1.2: download - view: text, annotated - select for diffs
Wed Jan 6 16:44:32 2016 UTC (9 years, 6 months ago) by damieng
Branches: MAIN
CVS tags: version_2_12_X, version_2_11_6_msu, version_2_11_5_msu, version_2_11_4_msu, HEAD

Added Daxe editing for HTML documents, fixed issues with case mixing in task documents

1: # The LearningOnline Network 2: # Second step to clean a file. 3: # 4: # $Id: html_to_xml.pm,v 1.2 2016/01/06 16:44:32 damieng Exp $ 5: # 6: # Copyright Michigan State University Board of Trustees 7: # 8: # This file is part of the LearningOnline Network with CAPA (LON-CAPA). 9: # 10: # LON-CAPA is free software; you can redistribute it and/or modify 11: # it under the terms of the GNU General Public License as published by 12: # the Free Software Foundation; either version 2 of the License, or 13: # (at your option) any later version. 14: # 15: # LON-CAPA is distributed in the hope that it will be useful, 16: # but WITHOUT ANY WARRANTY; without even the implied warranty of 17: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18: # GNU General Public License for more details. 19: # 20: # You should have received a copy of the GNU General Public License 21: # along with LON-CAPA; if not, write to the Free Software 22: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 23: # 24: # /home/httpd/html/adm/gpl.txt 25: # 26: # http://www.lon-capa.org/ 27: # 28: ### 29: 30: #!/usr/bin/perl 31: 32: 33: package Apache::html_to_xml; 34: 35: use strict; 36: use utf8; 37: use warnings; 38: use HTML::Parser (); 39: 40: # always closing, end tags are ignored: 41: my @empty = ('base','br','col','hr','img','input','keygen','link','meta','param','source','track','wbr', 'frame', 'embed','startouttext','endouttext'); 42: 43: #my @block_html = ('html','body','h1','h2','h3','h4','h5','h6','div','p','ul','ol','table','tbody','tr','td','th','dl','pre','noscript','blockquote','object','applet','embed','map','form','fieldset','iframe'); 44: 45: 46: my $result; 47: my @stack; 48: my $close_warning; 49: my $warnings; # 1 = print warnings 50: 51: 52: # This takes non-well-formed UTF-8 LC+HTML and returns well-formed but non-valid XML LC+XHTML. 53: sub html_to_xml { 54: my($textref, $warn, $case_sensitive) = @_; 55: $warnings = $warn; 56: if (!defined $case_sensitive) { 57: $case_sensitive = 0; 58: } 59: $result = ''; 60: @stack = (); 61: $close_warning = ''; 62: my $p = HTML::Parser->new( api_version => 3, 63: start_h => [\&start, "tagname, attr, attrseq"], 64: end_h => [\&end, "tagname"], 65: text_h => [\&text, "dtext"], 66: comment_h => [\&comment, "tokens"], 67: declaration_h => [\&declaration, "tokens"], 68: process_h => [\&process, "token0"], 69: ); 70: # NOTE: by default, the HTML parser turns all attribute and elements names to lowercase 71: # This is a problem with the Task elements, so it is disabled in that case 72: if ($case_sensitive) { 73: $p->case_sensitive(1); 74: } 75: $p->empty_element_tags(1); 76: $result .= "<?xml version='1.0' encoding='UTF-8'?>\n"; 77: $p->parse($$textref); 78: for (my $i=scalar(@stack)-1; $i>=0; $i--) { 79: if ($close_warning ne '') { 80: $close_warning .= ', '; 81: } 82: $close_warning .= $stack[$i]; 83: $result .= '</'.$stack[$i].'>'; 84: } 85: if ($warnings && $close_warning ne '') { 86: print "Warning: the parser had to add closing tags to understand the document ($close_warning)\n"; 87: } 88: return \$result; 89: } 90: 91: sub start { 92: my($tagname, $attr, $attrseq) = @_; 93: 94: # NOTE: we could do things more like web browsers, but I'm nore sure the result would be better with LON-CAPA files 95: # (in problem files there are not so many missing tags) 96: # See http://www.w3.org/TR/html5/syntax.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser 97: 98: if ($tagname eq 'o:p') { 99: return; 100: } 101: 102: if ($tagname =~ /@.*\.[a-z]{2,3}$/) { # email <name@hostname> 103: $result .= "<$tagname>"; 104: return; 105: } 106: 107: #$tagname = lc($tagname); this is done by default by the parser 108: $tagname = fix_tag($tagname); 109: if (scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'tr' && $tagname ne 'tr' && $tagname ne 'td' && $tagname ne 'th' && 110: !string_in_array(['part','block','comment','endouttext','problemtype','standalone','startouttext','tex','translated','web','while','randomlist','font','b','form'], $tagname)) { 111: # NOTE: a 'part' or 'block' element between tr and td will not be valid, but changing tag order would make things worse 112: # font and b will be removed in post_xml, so we can leave it for now, to handle things like <TR><TD... 113: # form is to avoid an empty form in some cases (it might not work anyway, but it is better to keep this bug the way it is) 114: if ($warnings) { 115: print "Warning: a <td> tag was added because a $tagname element was directly under a tr\n"; 116: } 117: start('td'); 118: } 119: if ($tagname eq 'p' && scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'p') { 120: end('p'); 121: } elsif ($tagname eq 'li') { 122: my $ind_li = last_index_of(\@stack, 'li'); 123: my $ind_ul = last_index_of(\@stack, 'ul'); 124: my $ind_ol = last_index_of(\@stack, 'ol'); 125: if ($ind_li != -1 && ($ind_ul == -1 || $ind_ul < $ind_li) && ($ind_ol == -1 || $ind_ol < $ind_li)) { 126: end('li'); 127: } 128: } elsif ($tagname eq 'tr') { 129: my $ind_table = last_index_of(\@stack, 'table'); 130: my $ind_tr = last_index_of(\@stack, 'tr'); 131: if ($ind_tr != -1 && ($ind_table == -1 || $ind_table < $ind_tr)) { 132: end('tr'); 133: } 134: } elsif ($tagname eq 'td' || $tagname eq 'th') { 135: my $ind_table = last_index_of(\@stack, 'table'); 136: my $ind_td = last_index_of(\@stack, 'td'); 137: my $ind_th = last_index_of(\@stack, 'th'); 138: my $ind_tr = last_index_of(\@stack, 'tr'); 139: if ($ind_tr == -1 || ($ind_table != -1 && $ind_table > $ind_tr)) { 140: start('tr'); 141: $ind_tr = last_index_of(\@stack, 'tr'); 142: } 143: if ($ind_td != -1 && $ind_tr < $ind_td) { 144: end('td'); 145: } elsif ($ind_th != -1 && $ind_tr < $ind_th) { 146: end('th'); 147: } 148: } elsif ($tagname eq 'dd' || $tagname eq 'dt') { 149: my $ind_dd = last_index_of(\@stack, 'dd'); 150: my $ind_dt = last_index_of(\@stack, 'dt'); 151: my $ind_dl = last_index_of(\@stack, 'dl'); 152: if ($ind_dl == -1) { 153: start('dl'); 154: $ind_dl = last_index_of(\@stack, 'dl'); 155: } 156: if ($ind_dd != -1 && ($ind_dl == -1 || $ind_dl < $ind_dd)) { 157: end('dd'); 158: } elsif ($ind_dt != -1 && ($ind_dl == -1 || $ind_dl < $ind_dt)) { 159: end('dt'); 160: } 161: } elsif ($tagname eq 'option') { 162: my $ind_option = last_index_of(\@stack, 'option'); 163: if ($ind_option != -1) { 164: end('option'); 165: } 166: } elsif ($tagname eq 'area') { 167: my $ind_area = last_index_of(\@stack, 'area'); 168: if ($ind_area != -1) { 169: end('area'); 170: } 171: } elsif ($tagname eq 'a') { 172: my $ind_a = last_index_of(\@stack, 'a'); 173: if ($ind_a != -1) { 174: end('a'); 175: } 176: } elsif ($tagname eq 'num') { 177: my $ind_num = last_index_of(\@stack, 'num'); 178: if ($ind_num != -1) { 179: end('num'); 180: } 181: } 182: 183: # HTML interpretation of non-closing elements and style is too complex (and error-prone, anyway). 184: # Since LON-CAPA elements are all supposed to be closed, this interpretation is SGML-like instead. 185: # Paragraphs inside paragraphs will be fixed later. 186: 187: # my @styles = (); 188: # if ($tagname eq 'p') { 189: # for (my $i=scalar(@stack)-1; $i>=0; $i--) { 190: # if ($stack[$i] eq 'p') { 191: # # save the styles 192: # for (my $j=$i+1; $j<scalar(@stack); $j++) { 193: # if (index_of(['b','i','em','strong','sub','sup'], $stack[$j]) != -1) { 194: # push(@styles, $stack[$j]); 195: # } 196: # } 197: # # close the p 198: # end('p'); 199: # last; 200: # } elsif (index_of(\@block_html, $stack[$i]) != -1) { 201: # # stop looking 202: # last; 203: # } 204: # } 205: # } 206: $result .= '<'.$tagname; 207: my %seen = (); 208: foreach my $att_name (@$attrseq) { 209: my $att_name_modified = $att_name; 210: $att_name_modified =~ s/[^\-a-zA-Z0-9_:.]//g; 211: $att_name_modified =~ s/^[\-.0-9]*//; 212: if ($att_name_modified ne '' && index($att_name_modified, ':') == -1) { 213: if ($seen{$att_name_modified}) { 214: if ($warnings) { 215: print "Warning: Ignoring duplicate attribute: $att_name\n"; 216: } 217: next; 218: } 219: $seen{$att_name_modified}++; 220: my $att_value = $attr->{$att_name}; 221: $att_value =~ s/^[“”]|[“”]$//g; 222: $att_value =~ s/&/&/g; 223: $att_value =~ s/</</g; 224: $att_value =~ s/>/>/g; 225: $att_value =~ s/"/"/g; 226: if ($tagname eq 'embed' && $att_name_modified eq 'script') { 227: # newlines are encoded to preserve Protein Explorer scripts in embed script attributes: 228: $att_value =~ s/\x0A/
/g; 229: $att_value =~ s/\x0D//g; 230: } 231: if ($att_name_modified eq 'xmlns' && ($att_value eq 'http://www.w3.org/1999/xhtml' || 232: $att_value eq 'http://www.w3.org/TR/REC-html40')) { 233: next; 234: } 235: $result .= ' '.$att_name_modified.'="'.$att_value.'"'; 236: } 237: } 238: if (index_of(\@empty, $tagname) != -1) { 239: $result .= '/>'; 240: } else { 241: $result .= '>'; 242: push(@stack, $tagname); 243: if (scalar(@stack) > 500) { 244: die "This document has a crazy depth - I'm out !"; 245: } 246: } 247: # reopen the styles, if any 248: #for (my $j=0; $j<scalar(@styles); $j++) { 249: # start($styles[$j], {}, ()); 250: #} 251: } 252: 253: sub end { 254: my($tagname) = @_; 255: 256: if ($tagname eq 'o:p') { 257: return; 258: } 259: 260: $tagname = fix_tag($tagname); 261: if (index_of(\@empty, $tagname) != -1) { 262: return; 263: } 264: if ($tagname eq 'td' && scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'th') { 265: # handle <th>text</td> as if it was <th>text</th> 266: $tagname = 'th'; 267: } elsif ($tagname eq 'th' && scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'td') { 268: # handle <td>text</th> as if it was <td>text</td> 269: $tagname = 'td'; 270: } 271: my $found = 0; 272: for (my $i=scalar(@stack)-1; $i>=0; $i--) { 273: if ($stack[$i] eq $tagname) { 274: for (my $j=scalar(@stack)-1; $j>$i; $j--) { 275: if ($close_warning ne '') { 276: $close_warning .= ', '; 277: } 278: $close_warning .= $stack[$j]; 279: $result .= '</'.$stack[$j].'>'; 280: } 281: splice(@stack, $i, scalar(@stack)-$i); 282: $found = 1; 283: last; 284: } elsif (index_of(\@stack, 'web') != -1) { 285: die "There is a web element with missing end tags inside - it has to be fixed by hand"; 286: } 287: } 288: if ($found) { 289: $result .= '</'.$tagname.'>'; 290: } elsif ($tagname eq 'p') { 291: $result .= ''; 292: } 293: } 294: 295: sub text { 296: my($dtext) = @_; 297: $dtext =~ s/&/&/g; 298: $dtext =~ s/</</g; 299: $dtext =~ s/>/>/g; 300: $dtext =~ s/"/"/g; 301: $result .= $dtext; 302: } 303: 304: sub comment { 305: my($tokens) = @_; 306: # NOTE: the HTML parser thinks this is a comment: 307: # and LON-CAPA has sometimes turned that into <![CDATA[]]> 308: foreach my $comment (@$tokens) { 309: $comment =~ s/--/- /g; 310: $comment =~ s/^-|-$/ /g; 311: $result .= ''; 312: } 313: } 314: 315: sub declaration { 316: my($tokens) = @_; 317: # ignore them 318: #$result .= '<!'; 319: #$result .= join(' ', @$tokens); 320: #$result .= '>'; 321: } 322: 323: sub process { 324: my($token0) = @_; 325: if ($token0 ne '') { 326: $result .= '<?'.$token0.'>'; 327: } 328: } 329: 330: sub index_of { 331: my ($array, $value) = @_; 332: for (my $i=0; $i<scalar(@{$array}); $i++) { 333: if ($array->[$i] eq $value) { 334: return $i; 335: } 336: } 337: return -1; 338: } 339: 340: sub last_index_of { 341: my ($array, $value) = @_; 342: for (my $i=scalar(@{$array})-1; $i>=0; $i--) { 343: if ($array->[$i] eq $value) { 344: return $i; 345: } 346: } 347: return -1; 348: } 349: 350: sub fix_tag { 351: my ($tag) = @_; 352: #$tag = lc($tag); this is done by default by the parser 353: if ($tag !~ /^[a-zA-Z_][a-zA-Z0-9_\-\.]*$/) { 354: if ($warnings) { 355: print "Warning: bad start tag:'".$tag."'"; 356: } 357: if ($tag =~ /<[a-zA-Z]/) { 358: $tag =~ s/^[^<]*<//; # a b 359: } 360: if ($tag =~ /[a-zA-Z]=/) { 361: $tag =~ s/=.*$//; # a=b -> a 362: } 363: if ($tag =~ /[a-zA-Z]\//) { 364: $tag =~ s/\/.*$//; # a/b -> a 365: } 366: if ($tag =~ /:/) { 367: # a:b -> b except when : at the end 368: if ($tag =~ /^[^:]*:$/) { 369: $tag =~ s/://; 370: } else { 371: $tag =~ s/^.*://; 372: } 373: } 374: $tag =~ s/^[0-9\-\.]+//; 375: $tag =~ s/[^a-zA-Z0-9_\-\.]//g; 376: if ($warnings) { 377: print " (converted to $tag)\n"; 378: } 379: } 380: return($tag); 381: } 382: 383: 384: ## 385: # Tests if a string is in an array (using eq) (to avoid Smartmatch warnings with $value ~~ @array) 386: # @param {Array<string>} array - reference to the array of strings 387: # @param {string} value - the string to look for 388: # @returns 1 if found, 0 otherwise 389: ## 390: sub string_in_array { 391: my ($array, $value) = @_; 392: foreach my $v (@{$array}) { 393: if ($v eq $value) { 394: return 1; 395: } 396: } 397: return 0; 398: } 399: 400: 401: 1; 402: __END__