loncom/homework/cleanxml/html_to_xml.pm - view

File: [LON-CAPA] / loncom / homework / cleanxml / html_to_xml.pm
Revision 1.1: download - view: text, annotated - select for diffs
Thu Dec 3 20:40:31 2015 UTC (9 years, 7 months ago) by damieng
Branches: MAIN
CVS tags: HEAD

integrated Daxe, opening in a separate window for now

# The LearningOnline Network
# Second step to clean a file.
#
# $Id: html_to_xml.pm,v 1.1 2015/12/03 20:40:31 damieng Exp $
#
# Copyright Michigan State University Board of Trustees
#
# This file is part of the LearningOnline Network with CAPA (LON-CAPA).
#
# LON-CAPA is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# LON-CAPA is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with LON-CAPA; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# /home/httpd/html/adm/gpl.txt
#
# http://www.lon-capa.org/
#
###

#!/usr/bin/perl


package Apache::html_to_xml;

use strict;
use utf8;
use warnings;
use HTML::Parser ();

# always closing, end tags are ignored:
my @empty = ('base','br','col','hr','img','input','keygen','link','meta','param','source','track','wbr', 'frame', 'embed','startouttext','endouttext');

#my @block_html = ('html','body','h1','h2','h3','h4','h5','h6','div','p','ul','ol','table','tbody','tr','td','th','dl','pre','noscript','blockquote','object','applet','embed','map','form','fieldset','iframe');


# State shared between html_to_xml() and the parser callbacks.
# It is reset at the start of every html_to_xml() call.
my $result;         # XML text built so far
my @stack;          # names of the currently open elements, outermost first
my $close_warning;  # comma-separated list of elements that had to be closed automatically
my $warnings;       # 1 = print warnings


##
# Escapes the characters that are special in XML text and attribute values.
# The ampersand is replaced first so that the entities added afterwards
# are not escaped a second time.
# @param {string} text - decoded text
# @returns {string} the text with & < > " replaced by predefined entities
##
sub _escape_xml {
    my ($text) = @_;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}


##
# This takes non-well-formed UTF-8 LC+HTML and returns well-formed but non-valid XML LC+XHTML.
# @param {reference} textref - reference to the text of the document
# @param {boolean} warn - 1 to print warnings, 0 otherwise
# @returns {reference} reference to the resulting XML text
#   (this is the module-level buffer, which is overwritten by the next call)
# Dies if the document is nested deeper than 500 elements, or if end tags
# are missing inside a web element.
##
sub html_to_xml {
    my ($textref, $warn) = @_;
    $warnings = $warn;
    $result = '';
    @stack = ();
    $close_warning = '';
    my $p = HTML::Parser->new( api_version => 3,
                               start_h => [\&start, "tagname, attr, attrseq"],
                               end_h   => [\&end,   "tagname"],
                               text_h  => [\&text,  "dtext"],
                               comment_h => [\&comment, "tokens"],
                               declaration_h => [\&declaration, "tokens"],
                               process_h => [\&process, "token0"],
                             );
    # NOTE: by default, the HTML parser turns all attribute and elements names to lowercase
    $p->empty_element_tags(1);
    $result .= "<?xml version='1.0' encoding='UTF-8'?>\n";
    $p->parse($$textref);
    # The parser buffers trailing text in case more input follows;
    # eof flushes it so that the end of the document is not lost.
    $p->eof;
    # close everything that is still open, innermost element first
    for my $open (reverse @stack) {
        if ($close_warning ne '') {
            $close_warning .= ', ';
        }
        $close_warning .= $open;
        $result .= '</'.$open.'>';
    }
    if ($warnings && $close_warning ne '') {
        print "Warning: the parser had to add closing tags to understand the document ($close_warning)\n";
    }
    return \$result;
}

##
# Parser callback for a start tag. Closes or opens surrounding elements
# where HTML allows omitted tags (p, li, tr, td, th, dd, dt, option, area,
# a, num), then writes the start tag with cleaned-up attributes.
# @param {string} tagname - element name, as reported by the parser
# @param {Hash} attr - reference to the attribute name -> decoded value hash
#   (optional, internal recursive calls pass only the tag name)
# @param {Array<string>} attrseq - reference to the attribute names in document order (optional)
##
sub start {
    my ($tagname, $attr, $attrseq) = @_;
    $attr    ||= {};
    $attrseq ||= [];

    # NOTE: we could do things more like web browsers, but I'm not sure the result would be better with LON-CAPA files
    # (in problem files there are not so many missing tags)
    # See http://www.w3.org/TR/html5/syntax.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser

    if ($tagname eq 'o:p') {
        return;
    }

    if ($tagname =~ /@.*\.[a-z]{2,3}$/) { # email <name@hostname>
        # not an element: write it as escaped text
        $result .= "&lt;$tagname&gt;";
        return;
    }

    #$tagname = lc($tagname); this is done by default by the parser
    $tagname = fix_tag($tagname);
    if (@stack && $stack[-1] eq 'tr' && $tagname ne 'tr' && $tagname ne 'td' && $tagname ne 'th' &&
        !string_in_array(['part','block','comment','endouttext','problemtype','standalone','startouttext','tex','translated','web','while','randomlist','font','b','form'], $tagname)) {
        # NOTE: a 'part' or 'block' element between tr and td will not be valid, but changing tag order would make things worse
        # font and b will be removed in post_xml, so we can leave it for now, to handle things like <TR><TD...
        # form is to avoid an empty form in some cases (it might not work anyway, but it is better to keep this bug the way it is)
        if ($warnings) {
            print "Warning: a <td> tag was added because a $tagname element was directly under a tr\n";
        }
        start('td');
    }
    if ($tagname eq 'p' && @stack && $stack[-1] eq 'p') {
        end('p');
    } elsif ($tagname eq 'li') {
        # close the previous li, unless a list was opened inside it
        my $ind_li = last_index_of(\@stack, 'li');
        my $ind_ul = last_index_of(\@stack, 'ul');
        my $ind_ol = last_index_of(\@stack, 'ol');
        if ($ind_li != -1 && ($ind_ul == -1 || $ind_ul < $ind_li) && ($ind_ol == -1 || $ind_ol < $ind_li)) {
            end('li');
        }
    } elsif ($tagname eq 'tr') {
        # close the previous tr, unless a table was opened inside it
        my $ind_table = last_index_of(\@stack, 'table');
        my $ind_tr = last_index_of(\@stack, 'tr');
        if ($ind_tr != -1 && ($ind_table == -1 || $ind_table < $ind_tr)) {
            end('tr');
        }
    } elsif ($tagname eq 'td' || $tagname eq 'th') {
        my $ind_table = last_index_of(\@stack, 'table');
        my $ind_td = last_index_of(\@stack, 'td');
        my $ind_th = last_index_of(\@stack, 'th');
        my $ind_tr = last_index_of(\@stack, 'tr');
        # a cell needs a row in the innermost table
        if ($ind_tr == -1 || ($ind_table != -1 && $ind_table > $ind_tr)) {
            start('tr');
            $ind_tr = last_index_of(\@stack, 'tr');
        }
        # close the previous cell of the same row
        if ($ind_td != -1 && $ind_tr < $ind_td) {
            end('td');
        } elsif ($ind_th != -1 && $ind_tr < $ind_th) {
            end('th');
        }
    } elsif ($tagname eq 'dd' || $tagname eq 'dt') {
        my $ind_dd = last_index_of(\@stack, 'dd');
        my $ind_dt = last_index_of(\@stack, 'dt');
        my $ind_dl = last_index_of(\@stack, 'dl');
        if ($ind_dl == -1) {
            start('dl');
            $ind_dl = last_index_of(\@stack, 'dl');
        }
        if ($ind_dd != -1 && ($ind_dl == -1 || $ind_dl < $ind_dd)) {
            end('dd');
        } elsif ($ind_dt != -1 && ($ind_dl == -1 || $ind_dl < $ind_dt)) {
            end('dt');
        }
    } elsif ($tagname eq 'option') {
        my $ind_option = last_index_of(\@stack, 'option');
        if ($ind_option != -1) {
            end('option');
        }
    } elsif ($tagname eq 'area') {
        my $ind_area = last_index_of(\@stack, 'area');
        if ($ind_area != -1) {
            end('area');
        }
    } elsif ($tagname eq 'a') {
        my $ind_a = last_index_of(\@stack, 'a');
        if ($ind_a != -1) {
            end('a');
        }
    } elsif ($tagname eq 'num') {
        my $ind_num = last_index_of(\@stack, 'num');
        if ($ind_num != -1) {
            end('num');
        }
    }

    # HTML interpretation of non-closing elements and style is too complex (and error-prone, anyway).
    # Since LON-CAPA elements are all supposed to be closed, this interpretation is SGML-like instead.
    # Paragraphs inside paragraphs will be fixed later.

    # my @styles = ();
    # if ($tagname eq 'p') {
    #   for (my $i=scalar(@stack)-1; $i>=0; $i--) {
    #     if ($stack[$i] eq 'p') {
    #       # save the styles
    #       for (my $j=$i+1; $j<scalar(@stack); $j++) {
    #         if (index_of(['b','i','em','strong','sub','sup'], $stack[$j]) != -1) {
    #           push(@styles, $stack[$j]);
    #         }
    #       }
    #       # close the p
    #       end('p');
    #       last;
    #     } elsif (index_of(\@block_html, $stack[$i]) != -1) {
    #       # stop looking
    #       last;
    #     }
    #   }
    # }
    $result .= '<'.$tagname;
    my %seen = ();
    foreach my $att_name (@$attrseq) {
        # turn the attribute name into a valid XML name without namespace prefix
        my $att_name_modified = $att_name;
        $att_name_modified =~ s/[^\-a-zA-Z0-9_:.]//g;
        $att_name_modified =~ s/^[\-.0-9]*//;
        if ($att_name_modified ne '' && index($att_name_modified, ':') == -1) {
            if ($seen{$att_name_modified}) {
                if ($warnings) {
                    print "Warning: Ignoring duplicate attribute: $att_name\n";
                }
                next;
            }
            $seen{$att_name_modified}++;
            my $att_value = $attr->{$att_name};
            if (!defined $att_value) {
                $att_value = '';
            }
            # remove typographic quotes around the value
            $att_value =~ s/^[“”]|[“”]$//g;
            $att_value = _escape_xml($att_value);
            if ($tagname eq 'embed' && $att_name_modified eq 'script') {
                # newlines are encoded to preserve Protein Explorer scripts in embed script attributes:
                $att_value =~ s/\x0A/&#xA;/g;
                $att_value =~ s/\x0D//g;
            }
            if ($att_name_modified eq 'xmlns' && ($att_value eq 'http://www.w3.org/1999/xhtml' ||
                $att_value eq 'http://www.w3.org/TR/REC-html40')) {
                next;
            }
            $result .= ' '.$att_name_modified.'="'.$att_value.'"';
        }
    }
    if (index_of(\@empty, $tagname) != -1) {
        $result .= '/>';
    } else {
        $result .= '>';
        push(@stack, $tagname);
        if (scalar(@stack) > 500) {
            die "This document has a crazy depth - I'm out !";
        }
    }
    # reopen the styles, if any
    #for (my $j=0; $j<scalar(@styles); $j++) {
    #  start($styles[$j], {}, ());
    #}
    return;
}

##
# Parser callback for an end tag. Closes the matching open element and all
# the elements opened inside it. An end tag with no matching start tag is
# ignored, except for p, which becomes an empty paragraph.
# Dies when the end tag does not match the innermost open element while a
# web element is open.
# @param {string} tagname - element name, as reported by the parser
##
sub end {
    my ($tagname) = @_;

    if ($tagname eq 'o:p') {
        return;
    }

    $tagname = fix_tag($tagname);
    if (index_of(\@empty, $tagname) != -1) {
        return;
    }
    if ($tagname eq 'td' && @stack && $stack[-1] eq 'th') {
        # handle <th>text</td> as if it was <th>text</th>
        $tagname = 'th';
    } elsif ($tagname eq 'th' && @stack && $stack[-1] eq 'td') {
        # handle <td>text</th> as if it was <td>text</td>
        $tagname = 'td';
    }
    my $found = 0;
    for (my $i = $#stack; $i >= 0; $i--) {
        if ($stack[$i] eq $tagname) {
            # close everything opened inside the element, innermost first
            for (my $j = $#stack; $j > $i; $j--) {
                if ($close_warning ne '') {
                    $close_warning .= ', ';
                }
                $close_warning .= $stack[$j];
                $result .= '</'.$stack[$j].'>';
            }
            splice(@stack, $i);
            $found = 1;
            last;
        } elsif (index_of(\@stack, 'web') != -1) {
            die "There is a web element with missing end tags inside - it has to be fixed by hand";
        }
    }
    if ($found) {
        $result .= '</'.$tagname.'>';
    } elsif ($tagname eq 'p') {
        # HTML parsing rules turn an end tag for p with no matching
        # start tag into an empty paragraph
        $result .= '<p/>';
    }
    return;
}

##
# Parser callback for text. Appends the text, escaped for XML.
# @param {string} dtext - text with the entities already decoded by the parser
##
sub text {
    my ($dtext) = @_;
    $result .= _escape_xml($dtext);
    return;
}

##
# Parser callback for comments. Appends each comment as an XML comment,
# after removing the character sequences XML does not allow in a comment.
# @param {Array<string>} tokens - reference to the array of comment texts
##
sub comment {
    my ($tokens) = @_;
    # NOTE: the HTML parser reports some constructs that are not real comments as comments,
    # and LON-CAPA has sometimes turned that into <![CDATA[]]>
    foreach my $comment (@$tokens) {
        # "--" is not allowed inside an XML comment, and it can not start or end with "-"
        $comment =~ s/--/- /g;
        $comment =~ s/^-|-$/ /g;
        $result .= '<!--'.$comment.'-->';
    }
    return;
}

##
# Parser callback for declarations (such as a doctype). They are ignored.
# @param {Array<string>} tokens - reference to the declaration tokens (unused)
##
sub declaration {
    my ($tokens) = @_;
    # ignore them
    #$result .= '<!';
    #$result .= join(' ', @$tokens);
    #$result .= '>';
    return;
}

##
# Parser callback for processing instructions. Non-empty ones are copied.
# @param {string} token0 - content of the processing instruction
##
sub process {
    my ($token0) = @_;
    if ($token0 ne '') {
        $result .= '<?'.$token0.'>';
    }
    return;
}

##
# Returns the index of the first occurrence of a string in an array (using eq).
# @param {Array<string>} array - reference to the array of strings
# @param {string} value - the string to look for
# @returns the index, or -1 if the string was not found
##
sub index_of {
    my ($array, $value) = @_;
    for my $i (0 .. $#{$array}) {
        if ($array->[$i] eq $value) {
            return $i;
        }
    }
    return -1;
}

##
# Returns the index of the last occurrence of a string in an array (using eq).
# @param {Array<string>} array - reference to the array of strings
# @param {string} value - the string to look for
# @returns the index, or -1 if the string was not found
##
sub last_index_of {
    my ($array, $value) = @_;
    for my $i (reverse 0 .. $#{$array}) {
        if ($array->[$i] eq $value) {
            return $i;
        }
    }
    return -1;
}

##
# Turns a tag name reported by the parser into a name usable in XML.
# Names that are already fine are returned unchanged.
# @param {string} tag - the tag name
# @returns {string} the fixed tag name
##
sub fix_tag {
    my ($tag) = @_;
    #$tag = lc($tag); this is done by default by the parser
    if ($tag !~ /^[a-zA-Z_][a-zA-Z0-9_\-\.]*$/) {
        if ($warnings) {
            print "Warning: bad start tag:'".$tag."'";
        }
        if ($tag =~ /<[a-zA-Z]/) {
            $tag =~ s/^[^<]*<//; # a<b -> b
        }
        if ($tag =~ /[a-zA-Z]=/) {
            $tag =~ s/=.*$//; # a=b -> a
        }
        if ($tag =~ /[a-zA-Z]\//) {
            $tag =~ s/\/.*$//; # a/b -> a
        }
        if ($tag =~ /:/) {
            # a:b -> b except when : at the end
            if ($tag =~ /^[^:]*:$/) {
                $tag =~ s/://;
            } else {
                $tag =~ s/^.*://;
            }
        }
        # a name can not start with a digit, - or .
        $tag =~ s/^[0-9\-\.]+//;
        $tag =~ s/[^a-zA-Z0-9_\-\.]//g;
        if ($warnings) {
            print " (converted to $tag)\n";
        }
    }
    return($tag);
}


##
# Tests if a string is in an array (using eq) (to avoid Smartmatch warnings with $value ~~ @array)
# @param {Array<string>} array - reference to the array of strings
# @param {string} value - the string to look for
# @returns 1 if found, 0 otherwise
##
sub string_in_array {
    my ($array, $value) = @_;
    foreach my $v (@{$array}) {
        if ($v eq $value) {
            return 1;
        }
    }
    return 0;
}


1;
__END__