File:  [LON-CAPA] / loncom / homework / cleanxml / post_xml.pm
Revision 1.12: download - view: text, annotated - select for diffs
Tue Jan 17 20:29:06 2017 UTC (7 years, 3 months ago) by damieng
Branches: MAIN
CVS tags: version_2_12_X, version_2_11_4_msu, HEAD
preserving m/@display, preserving m/@eval in complex cases

    1: # The LearningOnline Network
    2: # Third step to clean a file.
    3: #
    4: # $Id: post_xml.pm,v 1.12 2017/01/17 20:29:06 damieng Exp $
    5: #
    6: # Copyright Michigan State University Board of Trustees
    7: #
    8: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
    9: #
   10: # LON-CAPA is free software; you can redistribute it and/or modify
   11: # it under the terms of the GNU General Public License as published by
   12: # the Free Software Foundation; either version 2 of the License, or
   13: # (at your option) any later version.
   14: #
   15: # LON-CAPA is distributed in the hope that it will be useful,
   16: # but WITHOUT ANY WARRANTY; without even the implied warranty of
   17: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   18: # GNU General Public License for more details.
   19: #
   20: # You should have received a copy of the GNU General Public License
   21: # along with LON-CAPA; if not, write to the Free Software
   22: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   23: #
   24: # /home/httpd/html/adm/gpl.txt
   25: #
   26: # http://www.lon-capa.org/
   27: #
   28: ###
   29: 
   30: #!/usr/bin/perl
   31: 
   32: package Apache::post_xml;
   33: 
   34: use strict;
   35: use utf8;
   36: use warnings;
   37: 
   38: use File::Basename;
   39: use File::Temp qw/ tempfile /;
   40: use Cwd 'abs_path';
   41: use XML::LibXML;
   42: use HTML::TokeParser; # used to parse sty files
   43: use Tie::IxHash; # for ordered hashes
   44: use tth;
   45: use Apache::html_to_xml;
   46: 
   47: no warnings 'recursion'; # yes, fix_paragraph is using heavy recursion, I know
   48: 
   49: # these are constants
   50: my @block_elements = ('parameter','location','answer','foil','image','polygon','rectangle','text','conceptgroup','itemgroup','item','label','data','function','array','unit','answergroup','functionplotresponse','functionplotruleset','functionplotelements','functionplotcustomrule','essayresponse','hintpart','formulahint','numericalhint','reactionhint','organichint','optionhint','radiobuttonhint','stringhint','customhint','mathhint','formulahintcondition','numericalhintcondition','reactionhintcondition','organichintcondition','optionhintcondition','radiobuttonhintcondition','stringhintcondition','customhintcondition','mathhintcondition','imageresponse','foilgroup','datasubmission','textfield','hiddensubmission','radiobuttonresponse','rankresponse','matchresponse','import','style','script','window','block','library','notsolved','part','postanswerdate','preduedate','problem','problemtype','randomlabel','bgimg','labelgroup','randomlist','solved','while','tex','print','web','gnuplot','curve','Task','IntroParagraph','ClosingParagraph','Question','QuestionText','Setup','Instance','InstanceText','Criteria','CriteriaText','GraderNote','languageblock','instructorcomment','dataresponse','togglebox','standalone','comment','drawimage','allow','displayduedate','displaytitle','responseparam','organicstructure','scriptlib','parserlib','drawoptionlist','spline','backgroundplot','plotobject','plotvector','drawvectorsum','functionplotrule','functionplotvectorrule','functionplotvectorsumrule','axis','key','xtics','ytics','title','xlabel','ylabel','hiddenline','dtm');
   51: my @inline_like_block = ('stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse', 'hint', 'hintgroup','translated','lang'); # inline elements treated like blocks for pretty print and some other things
   52: my @responses = ('stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse','essayresponse','radiobuttonresponse','matchresponse','rankresponse','imageresponse','functionplotresponse');
   53: my @block_html = ('html','head','body','section','h1','h2','h3','h4','h5','h6','div','p','ul','ol','li','table','tbody','tr','td','th','dl','dt','dd','pre','noscript','hr','address','blockquote','object','applet','embed','map','form','fieldset','iframe','center','frameset');
   54: my @no_newline_inside = ('import','parserlib','scriptlib','data','function','label','xlabel','ylabel','tic','text','rectangle','image','title','h1','h2','h3','h4','h5','h6','li','td','p');
   55: my @preserve_elements = ('script','answer','pre','style');
   56: my @accepting_style = ('section','h1','h2','h3','h4','h5','h6','div','p','li','td','th','dt','dd','pre','blockquote');
   57: my @latex_math = ('\alpha', '\theta', '\omicron', '\tau', '\beta', '\vartheta', '\pi', '\upsilon', '\gamma', '\gamma', '\varpi', '\phi', '\delta', '\kappa', '\rho', '\varphi', '\epsilon', '\lambda', '\varrho', '\chi', '\varepsilon', '\mu', '\sigma', '\psi', '\zeta', '\nu', '\varsigma', '\omega', '\eta', '\xi',
   58:   '\Gamma', '\Lambda', '\Sigma', '\Psi', '\Delta', '\Xi', '\Upsilon', '\Omega', '\Theta', '\Pi', '\Phi',
   59:   '\pm', '\cap', '\diamond', '\oplus', '\mp', '\cup', '\bigtriangleup', '\ominus', '\times', '\uplus', '\bigtriangledown', '\otimes', '\div', '\sqcap', '\triangleleft', '\oslash', '\ast', '\sqcup', '\triangleright', '\odot', '\star', '\vee', '\lhd$', '\bigcirc', '\circ', '\wedge', '\rhd$', '\dagger', '\bullet', '\setminus', '\unlhd$', '\ddagger', '\cdot', '\wr', '\unrhd$', '\amalg', '+', '-',
   60:   '\leq', '\geq', '\equiv', '\models', '\prec', '\succ', '\sim', '\perp', '\preceq', '\succeq', '\simeq', '\mid', '\ll', '\gg', '\asymp', '\parallel', '\subset', '\supset', '\approx', '\bowtie', '\subseteq', '\supseteq', '\cong', '\Join$', '\sqsubset$', '\sqsupset$', '\neq', '\smile', '\sqsubseteq', '\sqsupseteq', '\doteq', '\frown', '\in', '\ni', '\propto', '\vdash', '\dashv',
   61:   '\colon', '\ldotp', '\cdotp',
   62:   '\leftarrow', '\longleftarrow', '\uparrow', '\Leftarrow', '\Longleftarrow', '\Uparrow', '\rightarrow', '\longrightarrow', '\downarrow', '\Rightarrow', '\Longrightarrow', '\Downarrow', '\leftrightarrow', '\longleftrightarrow', '\updownarrow', '\Leftrightarrow', '\Longleftrightarrow', '\Updownarrow', '\mapsto', '\longmapsto', '\nearrow', '\hookleftarrow', '\hookrightarrow', '\searrow', '\leftharpoonup', '\rightharpoonup', '\swarrow', '\leftharpoondown', '\rightharpoondown', '\nwarrow', '\rightleftharpoons', '\leadsto$',
   63:   '\ldots', '\cdots', '\vdots', '\ddots', '\aleph', '\prime', '\forall', '\infty', '\hbar', '\emptyset', '\exists', '\Box$', '\imath', '\nabla', '\neg', '\Diamond$', '\jmath', '\surd', '\flat', '\triangle', '\ell', '\top', '\natural', '\clubsuit', '\wp', '\bot', '\sharp', '\diamondsuit', '\Re', '\|', '\backslash', '\heartsuit', '\Im', '\angle', '\partial', '\spadesuit', '\mho$',
   64:   '\sum', '\bigcap', '\bigodot', '\prod', '\bigcup', '\bigotimes', '\coprod', '\bigsqcup', '\bigoplus', '\int', '\bigvee', '\biguplus', '\oint', '\bigwedge',
   65:   '\arccos', '\cos', '\csc', '\exp', '\ker', '\limsup', '\min', '\sinh', '\arcsin', '\cosh', '\deg', '\gcd', '\lg', '\ln', '\Pr', '\sup', '\arctan', '\cot', '\det', '\hom', '\lim', '\log', '\sec', '\tan', '\arg', '\coth', '\dim', '\inf', '\liminf', '\max', '\sin', '\tanh',
   66:   '\uparrow', '\Uparrow', '\downarrow', '\Downarrow', '\updownarrow', '\Updownarrow', '\lfloor', '\rfloor', '\lceil', '\rceil', '\langle', '\rangle', '\backslash',
   67:   '\rmoustache', '\lmoustache', '\rgroup', '\lgroup', '\arrowvert', '\Arrowvert', '\bracevert',
   68:   '\hat{', '\acute{', '\bar{', '\dot{', '\breve{', '\check{', '\grave{', '\vec{', '\ddot{', '\tilde{',
   69:   '\widetilde{', '\widehat{', '\overleftarrow{', '\overrightarrow{', '\overline{', '\underline{', '\overbrace{', '\underbrace{', '\sqrt{', '\sqrt[', '\frac{'
   70: );
   71: # list of elements that can contain style elements:
   72: my @containing_styles = ('library','problem',@responses,'foil','item','text','hintgroup','hintpart','label','part','preduedate','postanswerdate','solved','notsolved','block','while','web','standalone','problemtype','languageblock','translated','lang','window','windowlink','togglebox','instructorcomment','body','section','div','p','li','dd','td','th','blockquote','object','applet','video','audio','canvas','fieldset','button',
   73: 'span','strong','em','b','i','sup','sub','code','kbd','samp','tt','ins','del','var','small','big','u','font');
   74: my @html_styles = ('span', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'tt', 'var', 'small', 'big', 'u');
   75: 
   76: my $warnings; # 1 = print warnings
   77: 
   78: 
   79: # Parses the XML document and fixes many things to turn it into a document matching the schema.
   80: # @param {reference} textref - reference to the text of the document
   81: # @param {string} file_path - path of the file being processed (we only extract the directory path from it)
   82: # @param {string} res_dir - path of res directory parent (without the / at the end)
   83: # @param {boolean} warn - 1 to print warnings, 0 otherwise
   84: # @returns the text of the document as a byte string.
   85: sub post_xml {
   86:   my ($textref, $file_path, $res_dir, $warn) = @_;
   87:   $warnings = $warn;
   88:   
   89:   my $dom_doc = XML::LibXML->load_xml(string => $textref);
   90: 
   91:   my $root = fix_structure($dom_doc);
   92: 
   93:   remove_elements($root, ['startouttext','startoutext','startottext','startouttex','startouttect','atartouttext','starouttext','starttextout','starttext','starttextarea','endouttext','endoutext','endoutttext','endouttxt','endouutext','ednouttext','endouttex','endoouttext','endouttest','endtextout','endtextarea','startpartmarker','endpartmarker','basefont','x-claris-tagview','x-claris-window','x-sas-window']);
   94:   
   95:   remove_empty_attributes($root);
   96:   
   97:   fix_attribute_case($root);
   98:   
   99:   replace_m($root);
  100:   
  101:   my @all_block = (@block_elements, @block_html);
  102:   add_sty_blocks($file_path, $res_dir, $root, \@all_block); # must come before the subs using @all_block
  103: 
  104:   fix_block_styles($root, \@all_block);
  105:   $root->normalize();
  106:   
  107:   fix_fonts($root, \@all_block);
  108:   
  109:   replace_u($root);
  110: 
  111:   remove_bad_cdata_sections($root);
  112:   
  113:   add_cdata_sections($root);
  114:   
  115:   fix_style_element($root);
  116:   
  117:   fix_tables($root);
  118: 
  119:   fix_lists($root);
  120:   
  121:   fix_wrong_name_for_img($root); # should be before replace_deprecated_attributes_by_css
  122: 
  123:   replace_deprecated_attributes_by_css($root);
  124:   
  125:   replace_center($root, \@all_block); # must come after replace_deprecated_attributes_by_css
  126:   
  127:   replace_nobr($root);
  128:   
  129:   remove_useless_notsolved($root);
  130:   
  131:   fix_comments($root);
  132:   
  133:   fix_paragraphs_inside($root, \@all_block);
  134: 
  135:   remove_empty_style($root);
  136:   
  137:   fix_empty_lc_elements($root);
  138:   
  139:   reduce_empty_p($root);
  140:   
  141:   lowercase_attribute_values($root);
  142:   
  143:   replace_numericalresponse_unit_attribute($root);
  144:   
  145:   replace_functions_by_elements($root);
  146:   
  147:   pretty($root, \@all_block);
  148: 
  149:   replace_tm_dtm($root);
  150:   
  151:   return $dom_doc->toString(); # byte string !
  152: }
  153: 
  154: sub fix_structure {
  155:   my ($doc) = @_;
  156:   # the root element has already been added in pre_xml
  157:   my $root = $doc->documentElement;
  158:   # inside the root, replace html, problem and library elements by their content
  159:   my @toreplace = ('html','problem','library','Task');
  160:   foreach my $name (@toreplace) {
  161:     my @elements = $root->getElementsByTagName($name);
  162:     foreach my $element (@elements) {
  163:       replace_by_children($element);
  164:     }
  165:   }
  166:   # insert all link and style elements inside a new head element
  167:   my $current_node = undef;
  168:   my @heads = $doc->getElementsByTagName('head');
  169:   my @links = $doc->getElementsByTagName('link');
  170:   my @styles = $doc->getElementsByTagName('style');
  171:   my @titles = $doc->getElementsByTagName('title');
  172:   if (scalar(@titles) > 0) {
  173:     # NOTE: there is a title element in gnuplot, not to be confused with the one inside HTML head
  174:     for (my $i=0; $i<scalar(@titles); $i++) {
  175:       my $title = $titles[$i];
  176:       my $found_gnuplot = 0;
  177:       my $ancestor = $title->parentNode;
  178:       while (defined $ancestor) {
  179:         if ($ancestor->nodeName eq 'gnuplot') {
  180:           $found_gnuplot = 1;
  181:           last;
  182:         }
  183:         $ancestor = $ancestor->parentNode;
  184:       }
  185:       if ($found_gnuplot) {
  186:         splice(@titles, $i, 1);
  187:         $i--;
  188:       }
  189:     }
  190:   }
  191:   if (scalar(@heads) > 0 || scalar(@titles) > 0 || scalar(@links) > 0 || scalar(@styles) > 0) {
  192:     my $htmlhead = $doc->createElement('head');
  193:     foreach my $head (@heads) {
  194:       my $next;
  195:       for (my $child=$head->firstChild; defined $child; $child=$next) {
  196:         $next = $child->nextSibling;
  197:         $head->removeChild($child);
  198:         if ($child->nodeType != XML_ELEMENT_NODE ||
  199:             string_in_array(['title','script','style','meta','link','import','base'], $child->nodeName)) {
  200:           $htmlhead->appendChild($child);
  201:         } else {
  202:           # this should not be in head
  203:           insert_after_or_first($root, $child, $current_node);
  204:         }
  205:       }
  206:       $head->parentNode->removeChild($head);
  207:     }
  208:     foreach my $child (@titles, @links, @styles) {
  209:       $child->parentNode->removeChild($child);
  210:       $htmlhead->appendChild($child);
  211:     }
  212:     insert_after_or_first($root, $htmlhead, $current_node);
  213:     $current_node = $htmlhead;
  214:   }
  215:   # body
  216:   my $htmlbody = undef;
  217:   my @bodies = $doc->getElementsByTagName('body');
  218:   if (scalar(@bodies) > 0) {
  219:     # TODO: fix content and position of body elements
  220:     if ($root->nodeName eq 'problem') {
  221:       foreach my $body (@bodies) {
  222:         replace_by_children($body);
  223:       }
  224:     }
  225:   }
  226:   # add all the meta elements afterwards when they are LON-CAPA meta. Remove all HTML meta.
  227:   my @meta_names = ('abstract','author','authorspace','avetries','avetries_list','clear','comefrom','comefrom_list','copyright','correct','count','course','course_list','courserestricted','creationdate','dependencies','depth','difficulty','difficulty_list','disc','disc_list','domain','end','field','firstname','generation','goto','goto_list','groupname','helpful','highestgradelevel','hostname','id','keynum','keywords','language','lastname','lastrevisiondate','lowestgradelevel','middlename','mime','modifyinguser','notes','owner','permanentemail','scope','sequsage','sequsage_list','standards','start','stdno','stdno_list','subject','technical','title','url','username','value','version');
  228:   my @metas = $doc->getElementsByTagName('meta');
  229:   foreach my $meta (@metas) {
  230:     $meta->parentNode->removeChild($meta);
  231:     my $name = $meta->getAttribute('name');
  232:     my $content = $meta->getAttribute('content');
  233:     if (defined $name && defined $content && string_in_array(\@meta_names, lc($name))) {
  234:       my $lcmeta = $doc->createElement('meta');
  235:       $lcmeta->setAttribute('name', lc($name));
  236:       $lcmeta->setAttribute('content', $content);
  237:       insert_after_or_first($root, $lcmeta, $current_node);
  238:       $current_node = $lcmeta;
  239:     }
  240:   }
  241:   return($root);
  242: }
  243: 
  244: # insert the new child under parent after the reference child, or as the first child if the reference child is not defined
  245: sub insert_after_or_first {
  246:   my ($parent, $newchild, $refchild) = @_;
  247:   if (defined $refchild) {
  248:     $parent->insertAfter($newchild, $refchild);
  249:   } elsif (defined $parent->firstChild) {
  250:     $parent->insertBefore($newchild, $parent->firstChild);
  251:   } else {
  252:     $parent->appendChild($newchild);
  253:   }
  254: }
  255: 
  256: # removes all elements with given names inside the node, but keep the content
  257: sub remove_elements {
  258:   my ($node, $to_remove) = @_;
  259:   my $nextChild;
  260:   for (my $child=$node->firstChild; defined $child; $child=$nextChild) {
  261:     $nextChild = $child->nextSibling;
  262:     my $type = $node->nodeType;
  263:     if ($type == XML_ELEMENT_NODE) {
  264:       if (string_in_array($to_remove, $child->nodeName)) {
  265:         my $first_non_white = $child->firstChild;
  266:         if (defined $first_non_white && $first_non_white->nodeType == XML_TEXT_NODE &&
  267:             $first_non_white->nodeValue =~ /^\s*$/) {
  268:           $first_non_white = $first_non_white->nextSibling;
  269:         }
  270:         if (defined $first_non_white) {
  271:           $nextChild = $first_non_white;
  272:           replace_by_children($child);
  273:         } else {
  274:           $node->removeChild($child);
  275:         }
  276:       } else {
  277:         remove_elements($child, $to_remove);
  278:       }
  279:     }
  280:   }
  281: }
  282: 
  283: # removes some attributes that have an invalid empty value
  284: sub remove_empty_attributes {
  285:   my ($root) = @_;
  286:   my $doc = $root->ownerDocument;
  287:   # this list is based on validation errors in the MSU subset (it could be more complete if it was based on the schema)
  288:   my @attributes = (
  289:     ['curve', ['pointsize']],
  290:     ['foil', ['location']],
  291:     ['foilgroup', ['checkboxoptions', 'options', 'texoptions']],
  292:     ['gnuplot', ['pattern', 'texwidth']],
  293:     ['img', ['height', 'texheight', 'texwidth', 'texwrap', 'width']],
  294:     ['import', ['importmode']],
  295:     ['optionresponse', ['max']],
  296:     ['organicstructure', ['options']],
  297:     ['radiobuttonresponse', ['max']],
  298:     ['randomlabel', ['height', 'texwidth', 'width']],
  299:     ['stringresponse', ['type']],
  300:     ['textline', ['size']],
  301:   );
  302:   foreach my $element_attributes (@attributes) {
  303:     my $element_name = $element_attributes->[0];
  304:     my $attribute_names = $element_attributes->[1];
  305:     my @elements = $doc->getElementsByTagName($element_name);
  306:     foreach my $element (@elements) {
  307:       foreach my $attribute_name (@$attribute_names) {
  308:         my $value = $element->getAttribute($attribute_name);
  309:         if (defined $value && $value =~ /^\s*$/) {
  310:           $element->removeAttribute($attribute_name);
  311:         }
  312:       }
  313:     }
  314:   }
  315: }
  316: 
  317: # fixes the case for a few attributes that are not all lowercase
  318: # (the HTML parser used in html_to_xml turns everything lowercase, which is a good thing in general)
  319: sub fix_attribute_case {
  320:   my ($root) = @_;
  321:   my $doc = $root->ownerDocument;
  322:   my @attributes = (
  323:     ['labelgroup', ['TeXsize']],
  324:     ['h1', ['TeXsize']],
  325:     ['h2', ['TeXsize']],
  326:     ['h3', ['TeXsize']],
  327:     ['h4', ['TeXsize']],
  328:     ['h5', ['TeXsize']],
  329:     ['h6', ['TeXsize']],
  330:     # font and basefont have a TeXsize but will be removed
  331:     ['optionresponse', ['TeXlayout']],
  332:     ['itemgroup', ['TeXitemgroupwidth']],
  333:     ['Task', ['OptionalRequired']],
  334:     ['Question', ['OptionalRequired','Mandatory']],
  335:     ['Instance', ['OptionalRequired','Disabled']],
  336:     ['Criteria', ['Mandatory']],
  337:     ['table', ['TeXwidth','TeXtheme']],
  338:     ['td', ['TeXwidth']],
  339:     ['th', ['TeXwidth']],
  340:     ['img', ['TeXwidth','TeXheight','TeXwrap']],
  341:   );
  342:   foreach my $element_attributes (@attributes) {
  343:     my $element_name = $element_attributes->[0];
  344:     my $attribute_names = $element_attributes->[1];
  345:     my @elements = $doc->getElementsByTagName($element_name);
  346:     foreach my $element (@elements) {
  347:       foreach my $attribute_name (@$attribute_names) {
  348:         my $value = $element->getAttribute(lc($attribute_name));
  349:         if (defined $value) {
  350:           $element->removeAttribute(lc($attribute_name));
  351:           $element->setAttribute($attribute_name, $value);
  352:         }
  353:       }
  354:     }
  355:   }
  356: }
  357: 
  358: # Replaces m by HTML, tm and/or dtm (which will be replaced by <m> later, but they are useful
  359: #   to know if the element is a block element or not).
  360: # m might contain non-math LaTeX, while tm and dtm may only contain math.
  361: sub replace_m {
  362:   my ($root) = @_;
  363:   my $doc = $root->ownerDocument;
  364:   # search for variable declarations
  365:   my @variables = ();
  366:   my @scripts = $root->getElementsByTagName('script');
  367:   foreach my $script (@scripts) {
  368:     my $type = $script->getAttribute('type');
  369:     if (defined $type && $type eq 'loncapa/perl') {
  370:       if (defined $script->firstChild && $script->firstChild->nodeType == XML_TEXT_NODE) {
  371:         my $text = $script->firstChild->nodeValue;
  372:         # NOTE: we are not interested in replacing "@value", only "$value"
  373:         # this regexp is for "  $a = ..." and "  $a[...] = ..."
  374:         while ($text =~ /^[ \t]*\$([a-zA-Z_0-9]+)(?:\[[^\]]+\])?[ \t]*=/gm) {
  375:           if (!string_in_array(\@variables, $1)) {
  376:             push(@variables, $1);
  377:           }
  378:         }
  379:         # this regexp is for "...;  $a = ..." and "...;  $a[...] = ..."
  380:         while ($text =~ /^[^'"\/;]+;[ \t]*\$([a-zA-Z_0-9]+)(?:\[[^\]]+\])?[ \t]*=/gm) {
  381:           if (!string_in_array(\@variables, $1)) {
  382:             push(@variables, $1);
  383:           }
  384:         }
  385:         # this regexp is for "  @a = ..."
  386:         while ($text =~ /^[ \t]*\@([a-zA-Z_0-9]+)[ \t]*=/gm) {
  387:           if (!string_in_array(\@variables, $1)) {
  388:             push(@variables, $1);
  389:           }
  390:         }
  391:         # this regexp is for "  ($a, $b, $c) = ..."
  392:         my @matches = ($text =~ /^[ \t]*\([ \t]*\$([a-zA-Z_0-9]+)(?:[ \t]*,[ \t]*\$([a-zA-Z_0-9]+))*[ \t]*\)[ \t]*=/gm);
  393:         foreach my $match (@matches) {
  394:           if (!defined $match) {
  395:             next; # not sure why it happens, but it does
  396:           }
  397:           if (!string_in_array(\@variables, $match)) {
  398:             push(@variables, $match);
  399:           }
  400:         }
  401:         # and this one is for "push @a"
  402:         while ($text =~ /^[ \t]*push @([a-zA-Z_0-9]+)[ \t,]*/gm) {
  403:           if (!string_in_array(\@variables, $1)) {
  404:             push(@variables, $1);
  405:           }
  406:         }
  407:         # use the opportunity to report usage of <m> in Perl scripts
  408:         if ($text =~ /^[^#].*<m[ >]/m) {
  409:           if ($warnings) {
  410:             print "WARNING: <m> is used in a script, it should be converted by hand\n";
  411:           }
  412:         }
  413:       }
  414:     }
  415:   }
  416:   my @ms = $root->getElementsByTagName('m');
  417:   foreach my $m (@ms) {
  418:     if (!defined $m->firstChild) {
  419:       $m->parentNode->removeChild($m);
  420:       next;
  421:     }
  422:     if (defined $m->firstChild->nextSibling || $m->firstChild->nodeType != XML_TEXT_NODE) {
  423:       if ($warnings) {
  424:         print "WARNING: m value is not simple text\n";
  425:       }
  426:       next;
  427:     }
  428:     my $text = $m->firstChild->nodeValue;
  429:     my $text_before_variable_replacement = $text;
  430:     my $var_key1 = 'dfhg3df54hg65hg4';
  431:     my $var_key2 = 'dfhg654d6f5g4h5f';
  432:     my $eval = defined $m->getAttribute('eval') && $m->getAttribute('eval') eq 'on';
  433:     my $display = $m->getAttribute('display');
  434:     if (defined $display) {
  435:         if ($display eq '') {
  436:             $display = undef;
  437:         }
  438:         if (lc($display) eq 'jsmath') {
  439:             $display = 'mathjax';
  440:         }
  441:     }
  442:     if ($eval) {
  443:       # replace variables
  444:       foreach my $variable (@variables) {
  445:         my $replacement = $var_key1.$variable.$var_key2;
  446:         $text =~ s/\$$variable(?![a-zA-Z])/$replacement/ge;
  447:         $text =~ s/\$\{$variable\}/$replacement/ge;
  448:       }
  449:     }
  450:     # check if the expression is enclosed in math separators: $ $$ \( \) \[ \]
  451:     # if so, replace the whole node by dtm or tm
  452:     my $new_text;
  453:     my $new_node_name;
  454:     if ($text =~ /^\s*\$\$([^\$]*)\$\$\s*$/) {
  455:       $new_node_name = 'dtm';
  456:       $new_text = $1;
  457:     } elsif ($text =~ /^\s*\\\[(.*)\\\]\s*$/s) {
  458:       $new_node_name = 'dtm';
  459:       $new_text = $1;
  460:     } elsif ($text =~ /^\s*\$([^\$]*)\$\s*$/) {
  461:       $new_node_name = 'tm';
  462:       $new_text = $1;
  463:     } elsif ($text =~ /^\s*\\\((.*)\\\)\s*$/s) {
  464:       $new_node_name = 'tm';
  465:       $new_text = $1;
  466:     }
  467:     if (defined $new_node_name) {
  468:       if ($eval) {
  469:         foreach my $variable (@variables) {
  470:           my $replacement = $var_key1.$variable.$var_key2;
  471:           $new_text =~ s/$replacement([a-zA-Z])/\${$variable}$1/g;
  472:           $new_text =~ s/$replacement/\$$variable/g;
  473:         }
  474:       }
  475:       my $new_node = $doc->createElement($new_node_name);
  476:       if ($eval) {
  477:         $new_node->setAttribute('eval', 'on');
  478:       }
  479:       if (defined $display) {
  480:         $new_node->setAttribute('display', $display);
  481:       }
  482:       $new_node->appendChild($doc->createTextNode($new_text));
  483:       $m->parentNode->replaceChild($new_node, $m);
  484:       next;
  485:     }
  486:     if ($text !~ /\$|\\\(|\\\)|\\\[|\\\]/) {
  487:       # there are no math separators inside
  488:       # try to guess if this is meant as math
  489:       my $found_math = 0;
  490:       foreach my $symbol (@latex_math) {
  491:         if (index($text, $symbol) != -1) {
  492:           $found_math = 1;
  493:           last;
  494:         }
  495:       }
  496:       if ($found_math) {
  497:         # interpret the whole text as LaTeX inline math
  498:         my $new_node = $doc->createElement('tm');
  499:         if ($eval) {
  500:           $new_node->setAttribute('eval', 'on');
  501:         }
  502:         $new_node->appendChild($doc->createTextNode($text_before_variable_replacement));
  503:         $m->parentNode->replaceChild($new_node, $m);
  504:         next;
  505:       }
  506:       # no math symbol found, we will convert the text with tth
  507:     }
  508:     
  509:     # there are math separators inside, even after hiding variables, or there was no math symbol
  510:     
  511:     # hide math parts inside before running tth
  512:     my $math_key1 = '#5752398247516385';
  513:     my $math_key2 = '#';
  514:     my @maths = ();
  515:     my @separators = (['$$','$$'], ['\\(','\\)'], ['\\[','\\]'], ['$','$']);
  516:     foreach my $seps (@separators) {
  517:       my $sep1 = $seps->[0];
  518:       my $sep2 = $seps->[1];
  519:       my $pos1 = index($text, $sep1);
  520:       if ($pos1 == -1) {
  521:         next;
  522:       }
  523:       my $pos2 = index($text, $sep2, $pos1+length($sep1));
  524:       while ($pos1 != -1 && $pos2 != -1) {
  525:         my $replace = substr($text, $pos1, $pos2+length($sep2)-$pos1);
  526:         push(@maths, $replace);
  527:         my $by = $math_key1.scalar(@maths).$math_key2;
  528:         $text = substr($text, 0, $pos1).$by.substr($text, $pos2+length($sep2));
  529:         $pos1 = index($text, $sep1);
  530:         if ($pos1 != -1) {
  531:           $pos2 = index($text, $sep2, $pos1+length($sep1));
  532:         }
  533:       }
  534:     }
  535:     # get HTML as text from tth
  536:     my $html_text = tth($text);
  537:     # replace math by replacements
  538:     for (my $i=0; $i < scalar(@maths); $i++) {
  539:       my $math = $maths[$i];
  540:       $math =~ s/&/&amp;/g;
  541:       $math =~ s/</&lt;/g;
  542:       $math =~ s/>/&gt;/g;
  543:       my ($mel, $inside);
  544:       if ($math =~ /^\$\$(.*)\$\$$/s) {
  545:         $mel = 'dtm';
  546:         $inside = $1;
  547:       } elsif ($math =~ /^\\\[(.*)\\\]$/s) {
  548:         $mel = 'dtm';
  549:         $inside = $1;
  550:       } elsif ($math =~ /^\\\((.*)\\\)$/s) {
  551:         $mel = 'tm';
  552:         $inside = $1;
  553:       } elsif ($math =~ /^\$(.*)\$$/s) {
  554:         $mel = 'tm';
  555:         $inside = $1;
  556:       }
  557:       if (defined $inside) {
  558:         if ($inside =~ /^\s*$/) {
  559:           $math = '';
  560:         } else {
  561:           $math = '<'.$mel;
  562:           if ($eval && $inside =~ /$var_key1/) {
  563:             $math .= ' eval="on"';
  564:           }
  565:           $math .= '>'.$inside.'</'.$mel.'>';
  566:         }
  567:       }
  568:       my $replace = $math_key1.($i+1).$math_key2;
  569:       $html_text =~ s/$replace/$math/;
  570:     }
  571:     # replace variables if necessary
  572:     if ($eval) {
  573:       foreach my $variable (@variables) {
  574:         my $replacement = $var_key1.$variable.$var_key2;
  575:         $html_text =~ s/$replacement([a-zA-Z])/\${$variable}$1/g;
  576:         $html_text =~ s/$replacement/\$$variable/g;
  577:       }
  578:     }
  579:     my $fragment = html_to_dom($html_text);
  580:     $doc->adoptNode($fragment);
  581:     $m->parentNode->replaceChild($fragment, $m);
  582:     
  583:   }
  584: }
  585: 
  586: # Returns the HTML equivalent of LaTeX input, using tth
  587: sub tth {
  588:   my ($text) = @_;
  589:   my $output = &tth::tth($text);
  590:   my $errorstring = &tth::ttherror();
  591:   if ($errorstring) {
  592:     die $errorstring;
  593:   }
  594:   # hopefully the temp file will not be removed before this point (otherwise we should use unlink_on_destroy 0)
  595:   $output =~ s/^\s*|\s*$//;
  596:   $output =~ s/<div class="p"><!----><\/div>/<br\/>/; # why is tth using such ugly markup for \newline ?
  597:   return $output;
  598: }
  599: 
  600: # transform simple HTML into a DOM fragment (which will need to be adopted by the document)
  601: sub html_to_dom {
  602:   my ($text) = @_;
  603:   $text = '<root>'.$text.'</root>';
  604:   my $textref = Apache::html_to_xml::html_to_xml(\$text);
  605:   utf8::upgrade($$textref); # otherwise the XML parser fails when the HTML parser turns &nbsp; into a character
  606:   my $dom_doc = XML::LibXML->load_xml(string => $textref);
  607:   my $root = $dom_doc->documentElement;
  608:   remove_empty_style($root);
  609:   my $fragment = $dom_doc->createDocumentFragment();
  610:   my $next;
  611:   for (my $n=$root->firstChild; defined $n; $n=$next) {
  612:     $next = $n->nextSibling;
  613:     $root->removeChild($n);
  614:     $fragment->appendChild($n);
  615:   }
  616:   return($fragment);
  617: }
  618: 
  619: # Use the linked sty files to guess which newly defined elements should be considered blocks.
  620: # Also adds to @containing_styles the sty elements that contain styles.
  621: # @param {string} fn - the file path (we only extract the directory path from it)
  622: # @param {string} res_dir - path of res directory parent (without the / at the end)
  623: # @param {Element} root - the root element
  624: sub add_sty_blocks {
  625:   my ($fn, $res_dir, $root, $all_block) = @_;
  626:   my $doc = $root->ownerDocument;
  627:   my @parserlibs = $doc->getElementsByTagName('parserlib');
  628:   my @libs = ();
  629:   foreach my $parserlib (@parserlibs) {
  630:     if (defined $parserlib->firstChild && $parserlib->firstChild->nodeType == XML_TEXT_NODE) {
  631:       my $value = $parserlib->firstChild->nodeValue;
  632:       $value =~ s/^\s+|\s+$//g;
  633:       if ($value ne '') {
  634:         push(@libs, $value);
  635:       }
  636:     }
  637:   }
  638:   my ($name, $path, $suffix) = fileparse($fn);
  639:   foreach my $sty (@libs) {
  640:     if (substr($sty, 0, 1) eq '/') {
  641:       $sty = $res_dir.$sty;
  642:     } else {
  643:       $sty = $path.$sty;
  644:     }
  645:     my $new_elements = parse_sty($sty, $all_block);
  646:     better_guess($root, $new_elements, $all_block);
  647:     my $new_blocks = $new_elements->{'block'};
  648:     my $new_inlines = $new_elements->{'inline'};
  649:     push(@$all_block, @{$new_blocks});
  650:     #push(@inlines, @{$new_inlines}); # we are not using a list of inline elements at this point
  651:   }
  652: }
  653: 
  654: ##
  655: # Parses a sty file and returns lists of block and inline elements.
  656: # @param {string} fn - the file path
  657: ##
  658: sub parse_sty {
  659:   my ($fn, $all_block) = @_;
  660:   my @blocks = ();
  661:   my @inlines = ();
  662:   my $p = HTML::TokeParser->new($fn);
  663:   if (! $p) {
  664:     die "post_xml.pl: parse_sty: Error reading $fn\n";
  665:   }
  666:   $p->empty_element_tags(1);
  667:   my $in_definetag = 0;
  668:   my $in_render = 0;
  669:   my %newtags = ();
  670:   my $newtag = '';
  671:   my $is_block = 0;
  672:   while (my $token = $p->get_token) {
  673:     if ($token->[0] eq 'S') {
  674:       my $tag = lc($token->[1]);
  675:       if ($tag eq 'definetag') {
  676:         $in_definetag = 1;
  677:         $is_block = 0;
  678:         my $attributes = $token->[2];
  679:         $newtag = $attributes->{'name'};
  680:         if (substr($newtag, 0, 1) eq '/') {
  681:           $newtag = substr($newtag, 1);
  682:         }
  683:       } elsif ($in_definetag && $tag eq 'render') {
  684:         $in_render = 1;
  685:         $is_block = 0;
  686:       } elsif ($in_render) {
  687:         if (string_in_array($all_block, $tag)) {
  688:           $is_block = 1;
  689:         }
  690:       }
  691:     } elsif ($token->[0] eq 'E') {
  692:       my $tag = lc($token->[1]);
  693:       if ($tag eq 'definetag') {
  694:         $in_definetag = 0;
  695:         if (defined $newtags{$newtag}) {
  696:           $newtags{$newtag} = $newtags{$newtag} || $is_block;
  697:         } else {
  698:           $newtags{$newtag} = $is_block;
  699:         }
  700:       } elsif ($in_definetag && $tag eq 'render') {
  701:         $in_render = 0;
  702:       }
  703:     }
  704:   }
  705:   foreach $newtag (keys(%newtags)) {
  706:     if ($newtags{$newtag} == 1) {
  707:       push(@blocks, $newtag);
  708:     } else {
  709:       push(@inlines, $newtag);
  710:     }
  711:   }
  712:   return {'block'=>\@blocks, 'inline'=>\@inlines};
  713: }
  714: 
  715: ##
  716: # Marks as block the elements that contain block elements in the input file.
  717: # Also adds to @containing_styles the sty elements that contain styles.
  718: # @param {string} fn - the file path
  719: # @param {Hash<string,Array>} new_elements - contains arrays in 'block' and 'inline'
  720: ##
  721: sub better_guess {
  722:   my ($root, $new_elements, $all_block) = @_;
  723:   my $new_blocks = $new_elements->{'block'};
  724:   my $new_inlines = $new_elements->{'inline'};
  725:   
  726:   my @change = (); # change these elements from inline to block
  727:   foreach my $tag (@{$new_inlines}) {
  728:     my @nodes = $root->getElementsByTagName($tag);
  729:     NODE_LOOP: foreach my $node (@nodes) {
  730:       for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
  731:         if ($child->nodeType == XML_ELEMENT_NODE) {
  732:           if (string_in_array($all_block, $child->nodeName) || string_in_array($new_blocks, $child->nodeName)) {
  733:             push(@change, $tag);
  734:             last NODE_LOOP;
  735:           }
  736:         }
  737:       }
  738:     }
  739:   }
  740:   foreach my $inline (@change) {
  741:     my $index = 0;
  742:     $index++ until $new_inlines->[$index] eq $inline;
  743:     splice(@{$new_inlines}, $index, 1);
  744:     push(@{$new_blocks}, $inline);
  745:   }
  746:   # add to @containing_styles when a style is used inside
  747:   # NOTE: some sty elements will be added even though they should not, but if we don't do that
  748:   # all style will be removed in the sty elements.
  749:   foreach my $tag ((@{$new_blocks}, @{$new_inlines})) {
  750:     my @nodes = $root->getElementsByTagName($tag);
  751:     NODE_LOOP: foreach my $node (@nodes) {
  752:       for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
  753:         if ($child->nodeType == XML_ELEMENT_NODE) {
  754:           if (string_in_array(\@html_styles, $child->nodeName)) {
  755:             push(@containing_styles, $tag);
  756:             last NODE_LOOP;
  757:           }
  758:         }
  759:       }
  760:     }
  761:   }
  762: }
  763: 
  764: # When a style element contains a block, move the style inside the block where it is allowed.
  765: # style/block/other -> block/style/other
  766: # When a style is used where it is not allowed, move it inside its children or remove it (unless it contains only text)
  767: # element_not_containing_styles/style/other -> element_not_containing_styles/other/style (except if other is a style)
  768: # The fix is not perfect in the case of element_not_containing_styles/style1/style2/block/text (style1 will be lost):
  769: # element_not_containing_styles/style1/style2/block/text -> element_not_containing_styles/block/style2/text
  770: # (a solution to this problem would be to merge the styles in a span)
  771: # NOTE: .sty defined elements might have been added to @containing_styles by better_guess().
  772: sub fix_block_styles {
  773:   my ($element, $all_block) = @_;
  774:   my $doc = $element->ownerDocument;
  775:   if (string_in_array(\@html_styles, $element->nodeName)) {
  776:     # move spaces out of the style element
  777:     if (defined $element->firstChild && $element->firstChild->nodeType == XML_TEXT_NODE) {
  778:       my $child = $element->firstChild;
  779:       if ($child->nodeValue =~ /^(\s+)(\S.*)$/s) {
  780:         $element->parentNode->insertBefore($doc->createTextNode($1), $element);
  781:         $child->setData($2);
  782:       }
  783:     }
  784:     if (defined $element->lastChild && $element->lastChild->nodeType == XML_TEXT_NODE) {
  785:       my $child = $element->lastChild;
  786:       if ($child->nodeValue =~ /^(.*\S)(\s+)$/s) {
  787:         $element->parentNode->insertAfter($doc->createTextNode($2), $element);
  788:         $child->setData($1);
  789:       }
  790:     }
  791:     
  792:     my $found_block = 0;
  793:     for (my $child=$element->firstChild; defined $child; $child=$child->nextSibling) {
  794:       if ($child->nodeType == XML_ELEMENT_NODE && string_in_array($all_block, $child->nodeName)) {
  795:         $found_block = 1;
  796:         last;
  797:       }
  798:     }
  799:     my $no_style_here = !string_in_array(\@containing_styles, $element->parentNode->nodeName);
  800:     if ($found_block || $no_style_here) {
  801:       # there is a block or the style is not allowed here,
  802:       # the style element has to be replaced by its modified children
  803:       my $s; # a clone of the style
  804:       my $next;
  805:       for (my $child=$element->firstChild; defined $child; $child=$next) {
  806:         $next = $child->nextSibling;
  807:         if ($child->nodeType == XML_ELEMENT_NODE && (string_in_array($all_block, $child->nodeName) ||
  808:             $child->nodeName eq 'br' || $no_style_here)) {
  809:           # avoid inverting a style with a style with $no_style_here (that would cause endless recursion)
  810:           if (!$no_style_here || (!string_in_array(\@html_styles, $child->nodeName) &&
  811:               string_in_array(\@containing_styles, $child->nodeName))) {
  812:             # block node or inline node when the style is not allowed:
  813:             # move all children inside the style, and make the style the only child
  814:             $s = $element->cloneNode();
  815:             my $next2;
  816:             for (my $child2=$child->firstChild; defined $child2; $child2=$next2) {
  817:               $next2 = $child2->nextSibling;
  818:               $child->removeChild($child2);
  819:               $s->appendChild($child2);
  820:             }
  821:             $child->appendChild($s);
  822:           }
  823:           $s = undef;
  824:         } elsif (($child->nodeType == XML_TEXT_NODE && $child->nodeValue !~ /^\s*$/) ||
  825:             $child->nodeType == XML_ELEMENT_NODE) {
  826:           # if the style is allowed, move text and inline nodes inside the style
  827:           if (!$no_style_here) {
  828:             if (!defined $s) {
  829:               $s = $element->cloneNode();
  830:               $element->insertBefore($s, $child);
  831:             }
  832:             $element->removeChild($child);
  833:             $s->appendChild($child);
  834:           }
  835:         } else {
  836:           # do not put other nodes inside the style
  837:           $s = undef;
  838:         }
  839:       }
  840:       # now replace by children and fix them
  841:       my $parent = $element->parentNode;
  842:       for (my $child=$element->firstChild; defined $child; $child=$next) {
  843:         $next = $child->nextSibling;
  844:         $element->removeChild($child);
  845:         $parent->insertBefore($child, $element);
  846:         if ($child->nodeType == XML_ELEMENT_NODE) {
  847:           fix_block_styles($child, $all_block);
  848:         }
  849:       }
  850:       $parent->removeChild($element);
  851:       return;
  852:     }
  853:   }
  854:   # otherwise fix all children
  855:   my $next;
  856:   for (my $child=$element->firstChild; defined $child; $child=$next) {
  857:     $next = $child->nextSibling;
  858:     if ($child->nodeType == XML_ELEMENT_NODE) {
  859:       fix_block_styles($child, $all_block);
  860:     }
  861:   }
  862: }
  863: 
  864: # removes empty font elements and font elements that contain at least one block element
  865: # replaces other font elements by equivalent span
  866: sub fix_fonts {
  867:   my ($root, $all_block) = @_;
  868:   my $doc = $root->ownerDocument;
  869:   my @fonts = $root->getElementsByTagName('font');
  870:   @fonts = reverse(@fonts); # to deal with the ancestor last in the case of font/font
  871:   foreach my $font (@fonts) {
  872:     my $block = 0;
  873:     for (my $child=$font->firstChild; defined $child; $child=$child->nextSibling) {
  874:       if (string_in_array($all_block, $child->nodeName) || string_in_array(\@inline_like_block, $child->nodeName)) {
  875:         $block = 1;
  876:         last;
  877:       }
  878:     }
  879:     if (!defined $font->firstChild || $block) {
  880:       # empty font or font containing block elements
  881:       # replace this node by its content
  882:       replace_by_children($font);
  883:     } else {
  884:       # replace by equivalent span
  885:       my $color = get_non_empty_attribute($font, 'color');
  886:       my $size = get_non_empty_attribute($font, 'size');
  887:       my $face = get_non_empty_attribute($font, 'face');
  888:       if (defined $face) {
  889:         $face =~ s/^,|,$//;
  890:       }
  891:       if (!defined $color && !defined $size && !defined $face) {
  892:         # useless font element: replace this node by its content
  893:         replace_by_children($font);
  894:         next;
  895:       }
  896:       my $replacement;
  897:       tie (my %properties, 'Tie::IxHash', ());
  898:       if (!defined $color && !defined $size && defined $face && lc($face) eq 'symbol') {
  899:         $replacement = $doc->createDocumentFragment();
  900:       } else {
  901:         $replacement = $doc->createElement('span');
  902:         my $css = '';
  903:         if (defined $color) {
  904:           $color =~ s/^x/#/;
  905:           $properties{'color'} = $color;
  906:         }
  907:         if (defined $size) {
  908:           my %hash = (
  909:             '1' => 'x-small',
  910:             '2' => 'small',
  911:             '3' => 'medium',
  912:             '4' => 'large',
  913:             '5' => 'x-large',
  914:             '6' => 'xx-large',
  915:             '7' => '300%',
  916:             '-1' => 'small',
  917:             '-2' => 'x-small',
  918:             '+1' => 'large',
  919:             '+2' => 'x-large',
  920:             '+3' => 'xx-large',
  921:             '+4' => '300%',
  922:           );
  923:           my $value = $hash{$size};
  924:           if (!defined $value) {
  925:             $value = 'medium';
  926:           }
  927:           $properties{'font-size'} = $value;
  928:         }
  929:         if (defined $face) {
  930:           if (lc($face) ne 'symbol' && lc($face) ne 'bold') {
  931:             $properties{'font-family'} = $face;
  932:           }
  933:         }
  934:         set_css_properties($replacement, \%properties);
  935:       }
  936:       if (defined $face && lc($face) eq 'symbol') {
  937:         # convert all content to unicode
  938:         my $next;
  939:         for (my $child=$font->firstChild; defined $child; $child=$next) {
  940:           $next = $child->nextSibling;
  941:           if ($child->nodeType == XML_TEXT_NODE) {
  942:             my $value = $child->nodeValue;
  943:             $value =~ tr/ABGDEZHQIKLMNXOPRSTUFCYWabgdezhqiklmnxoprVstufcywJjv¡«¬®/ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩαβγδεζηθικλμνξοπρςστυφχψωϑϕϖϒ↔←→/;
  944:             $child->setData($value);
  945:           }
  946:         }
  947:       }
  948:       # replace the font node
  949:       if ($replacement->nodeType == XML_ELEMENT_NODE && !defined $font->previousSibling &&
  950:           !defined $font->nextSibling && string_in_array(\@accepting_style, $font->parentNode->nodeName)) {
  951:         # use CSS on the parent block and replace font by its children instead of using a new element
  952:         set_css_properties($font->parentNode, \%properties);
  953:         replace_by_children($font);
  954:       } else {
  955:         # move all font children inside the replacement (span or fragment)
  956:         my $next;
  957:         for (my $child=$font->firstChild; defined $child; $child=$next) {
  958:           $next = $child->nextSibling;
  959:           $font->removeChild($child);
  960:           $replacement->appendChild($child);
  961:         }
  962:         # replace font
  963:         $font->parentNode->replaceChild($replacement, $font);
  964:       }
  965:     }
  966:   }
  967:   $root->normalize();
  968: }
  969: 
  970: # replaces u by <span style="text-decoration: underline">
  971: sub replace_u {
  972:   my ($root) = @_;
  973:   my $doc = $root->ownerDocument;
  974:   my @us = $root->getElementsByTagName('u');
  975:   foreach my $u (@us) {
  976:     my $span = $doc->createElement('span');
  977:     $span->setAttribute('style', 'text-decoration: underline');
  978:     my $next;
  979:     for (my $child=$u->firstChild; defined $child; $child=$next) {
  980:       $next = $child->nextSibling;
  981:       $u->removeChild($child);
  982:       $span->appendChild($child);
  983:     }
  984:     $u->parentNode->replaceChild($span, $u);
  985:   }
  986: }
  987: 
  988: # removes CDATA sections tags that have not been parsed correcty by the HTML parser
  989: # also removes bad comments in script elements
  990: sub remove_bad_cdata_sections {
  991:   my ($root) = @_;
  992:   my $doc = $root->ownerDocument;
  993:   foreach my $name (@preserve_elements) {
  994:     my @nodes = $root->getElementsByTagName($name);
  995:     foreach my $node (@nodes) {
  996:       if (defined $node->firstChild && $node->firstChild->nodeType == XML_TEXT_NODE) {
  997:         my $value = $node->firstChild->nodeValue;
  998:         if ($name eq 'script' && (!defined $node->getAttribute('type') || $node->getAttribute('type') ne 'loncapa/perl') &&
  999:             !defined $node->firstChild->nextSibling && $value =~ /^(\s*)<!--(.*)-->(\s*)$/) {
 1000:           # web browsers interpret that as a real comment when it is on 1 line, but the Perl HTML parser thinks it is the script
 1001:           # -> turning it back into a comment
 1002:           # (this is only true for Javascript script elements, since LON-CAPA does not parse loncapa/perl scripts in the same way)
 1003:           $node->removeChild($node->firstChild);
 1004:           $node->appendChild($doc->createComment($2));
 1005:           next;
 1006:         }
 1007:         # at the beginning:
 1008:         $value =~ s/^(\s*)<!\[CDATA\[/$1/; # <![CDATA[
 1009:         $value =~ s/^(\s*)\/\*\s*<!\[CDATA\[\s*\*\//$1/; # /* <![CDATA[ */
 1010:         $value =~ s/^(\s*)\/\/\s*<!\[CDATA\[/$1/; # // <![CDATA[
 1011:         $value =~ s/^(\s*)(\/\/)?\s*<!--/$1/; # // <!--
 1012:         # at the end:
 1013:         $value =~ s/\/\/\s*\]\]>(\s*)$/$1/; # // ]]>
 1014:         $value =~ s/\]\]>(\s*)$/$1/; # ]]>
 1015:         $value =~ s/(\/\/)?\s*-->(\s*)$/$2/; # // -->
 1016:         $value =~ s/\/\*\s*\]\]>\s*\*\/(\s*)$/$1/; # /* ]]> */
 1017:         
 1018:         $value = "\n".$value."\n";
 1019:         $value =~ s/\s*(\n[ \t]*)/$1/;
 1020:         $value =~ s/\s+$/\n/;
 1021:         $node->firstChild->setData($value);
 1022:       }
 1023:     }
 1024:   }
 1025: }
 1026: 
 1027: # adds CDATA sections to scripts
 1028: sub add_cdata_sections {
 1029:   my ($root) = @_;
 1030:   my $doc = $root->ownerDocument;
 1031:   my @scripts = $root->getElementsByTagName('script');
 1032:   my @answers = $root->getElementsByTagName('answer');
 1033:   foreach my $answer (@answers) {
 1034:     my $ancestor = $answer->parentNode;
 1035:     my $found_capa_response = 0;
 1036:     while (defined $ancestor) {
 1037:       if ($ancestor->nodeName eq 'numericalresponse' || $ancestor->nodeName eq 'formularesponse') {
 1038:         $found_capa_response = 1;
 1039:         last;
 1040:       }
 1041:       $ancestor = $ancestor->parentNode;
 1042:     }
 1043:     if (!$found_capa_response) {
 1044:       push(@scripts, $answer);
 1045:     }
 1046:   }
 1047:   foreach my $script (@scripts) {
 1048:     # use a CDATA section in the normal situation, for any script
 1049:     my $first = $script->firstChild;
 1050:     if (defined $first && $first->nodeType == XML_TEXT_NODE && !defined $first->nextSibling) {
 1051:       my $cdata = $doc->createCDATASection($first->nodeValue);
 1052:       $script->replaceChild($cdata, $first);
 1053:     }
 1054:   }
 1055: }
 1056: 
 1057: # removes "<!--" and "-->" at the beginning and end of style elements
 1058: sub fix_style_element {
 1059:   my ($root) = @_;
 1060:   my @styles = $root->getElementsByTagName('style');
 1061:   foreach my $style (@styles) {
 1062:     if (defined $style->firstChild && $style->firstChild->nodeType == XML_TEXT_NODE &&
 1063:         !defined $style->firstChild->nextSibling) {
 1064:       my $text = $style->firstChild->nodeValue;
 1065:       if ($text =~ /^\s*<!--(.*)-->\s*$/s) {
 1066:         $style->firstChild->setData($1);
 1067:       }
 1068:     }
 1069:   }
 1070: }
 1071: 
 1072: # create missing cells at the end of table rows
 1073: sub fix_tables {
 1074:   my ($root) = @_;
 1075:   my @tables = $root->getElementsByTagName('table');
 1076:   foreach my $table (@tables) {
 1077:     fix_cells($table);
 1078:     foreach my $tbody ($table->getChildrenByTagName('tbody')) {
 1079:       fix_cells($tbody);
 1080:     }
 1081:     foreach my $thead ($table->getChildrenByTagName('thead')) {
 1082:       fix_cells($thead);
 1083:     }
 1084:     foreach my $tfoot ($table->getChildrenByTagName('tfoot')) {
 1085:       fix_cells($tfoot);
 1086:     }
 1087:   }
 1088: }
 1089: 
 1090: # create missing cells at the end of table rows
 1091: sub fix_cells {
 1092:   my ($table) = @_; # could actually be table, tbody, thead or tfoot
 1093:   my $doc = $table->ownerDocument;
 1094:   my @nb_cells = ();
 1095:   my $max_nb_cells = 0;
 1096:   my @rowspans = ();
 1097:   my @trs = $table->getChildrenByTagName('tr');
 1098:   foreach my $tr (@trs) {
 1099:     my $nb_cells;
 1100:     if (defined $rowspans[0]) {
 1101:       $nb_cells = shift(@rowspans);
 1102:     } else {
 1103:       $nb_cells = 0;
 1104:     }
 1105:     for (my $cell=$tr->firstChild; defined $cell; $cell=$cell->nextSibling) {
 1106:       if ($cell->nodeName eq 'td' || $cell->nodeName eq 'th') {
 1107:         my $colspan = $cell->getAttribute('colspan');
 1108:         if (defined $colspan && $colspan =~ /^\s*[0-9]+\s*$/) {
 1109:           $nb_cells += $colspan;
 1110:         } else {
 1111:           $nb_cells++;
 1112:         }
 1113:         my $rowspan = $cell->getAttribute('rowspan');
 1114:         if (defined $rowspan && $rowspan =~ /^\s*[0-9]+\s*$/) {
 1115:           for (my $i=0; $i < $rowspan-1; $i++) {
 1116:             if (!defined $rowspans[$i]) {
 1117:               $rowspans[$i] = 1;
 1118:             } else {
 1119:               $rowspans[$i]++;
 1120:             }
 1121:           }
 1122:         }
 1123:       }
 1124:     }
 1125:     push(@nb_cells, $nb_cells);
 1126:     if ($nb_cells > $max_nb_cells) {
 1127:       $max_nb_cells = $nb_cells;
 1128:     }
 1129:   }
 1130:   foreach my $tr (@trs) {
 1131:     my $nb_cells = shift(@nb_cells);
 1132:     if ($nb_cells < $max_nb_cells) {
 1133:       for (1..($max_nb_cells - $nb_cells)) {
 1134:         $tr->appendChild($doc->createElement('td'));
 1135:       }
 1136:     }
 1137:   }
 1138: }
 1139: 
 1140: # replaces ul/ul by ul/li/ul and the same for ol (using the previous li if possible)
 1141: # also adds a ul element when a li has no ul/ol ancestor
 1142: sub fix_lists {
 1143:   my ($root) = @_;
 1144:   my $doc = $root->ownerDocument;
 1145:   my @uls = $root->getElementsByTagName('ul');
 1146:   my @ols = $root->getElementsByTagName('ol');
 1147:   my @lists = (@uls, @ols);
 1148:   foreach my $list (@lists) {
 1149:     my $next;
 1150:     for (my $child=$list->firstChild; defined $child; $child=$next) {
 1151:       $next = $child->nextSibling;
 1152:       if ($child->nodeType == XML_ELEMENT_NODE && string_in_array(['ul','ol'], $child->nodeName)) {
 1153:         my $previous = $child->previousNonBlankSibling(); # note: non-DOM method
 1154:         $list->removeChild($child);
 1155:         if (defined $previous && $previous->nodeType == XML_ELEMENT_NODE && $previous->nodeName eq 'li') {
 1156:           $previous->appendChild($child);
 1157:         } else {
 1158:           my $li = $doc->createElement('li');
 1159:           $li->appendChild($child);
 1160:           if (!defined $next) {
 1161:             $list->appendChild($li);
 1162:           } else {
 1163:             $list->insertBefore($li, $next);
 1164:           }
 1165:         }
 1166:       }
 1167:     }
 1168:   }
 1169:   my @lis = $root->getElementsByTagName('li');
 1170:   foreach my $li (@lis) {
 1171:     my $found_list_ancestor = 0;
 1172:     my $ancestor = $li->parentNode;
 1173:     while (defined $ancestor) {
 1174:       if ($ancestor->nodeName eq 'ul' || $ancestor->nodeName eq 'ol') {
 1175:         $found_list_ancestor = 1;
 1176:         last;
 1177:       }
 1178:       $ancestor = $ancestor->parentNode;
 1179:     }
 1180:     if (!$found_list_ancestor) {
 1181:       # replace li by ul and add li under ul
 1182:       my $ul = $doc->createElement('ul');
 1183:       $li->parentNode->insertBefore($ul, $li);
 1184:       $li->parentNode->removeChild($li);
 1185:       $ul->appendChild($li);
 1186:       # add all other li afterwards inside ul (there might be text nodes in-between)
 1187:       my $next = $ul->nextSibling;
 1188:       while (defined $next) {
 1189:         my $next_next = $next->nextSibling;
 1190:         if ($next->nodeType == XML_TEXT_NODE && $next->nodeValue =~ /^\s*$/ &&
 1191:             defined $next_next && $next_next->nodeType == XML_ELEMENT_NODE && $next_next->nodeName eq 'li') {
 1192:           $next->parentNode->removeChild($next);
 1193:           $ul->appendChild($next);
 1194:           $next = $next_next;
 1195:           $next_next = $next_next->nextSibling;
 1196:         }
 1197:         if ($next->nodeType == XML_ELEMENT_NODE && $next->nodeName eq 'li') {
 1198:           $next->parentNode->removeChild($next);
 1199:           $ul->appendChild($next);
 1200:         } else {
 1201:           last;
 1202:         }
 1203:         $next = $next_next;
 1204:       }
 1205:     }
 1206:   }
 1207: }
 1208: 
 1209: # Some "image" elements are actually img element with a wrong name. This renames them.
 1210: # Amazingly enough, "<image src=..." displays an image in some browsers
 1211: # ("image" has existed at some point as an experimental HTML element).
 1212: sub fix_wrong_name_for_img {
 1213:   my ($root) = @_;
 1214:   my @images = $root->getElementsByTagName('image');
 1215:   foreach my $image (@images) {
 1216:     if (!defined $image->getAttribute('src')) {
 1217:       next;
 1218:     }
 1219:     my $found_correct_ancestor = 0;
 1220:     my $ancestor = $image->parentNode;
 1221:     while (defined $ancestor) {
 1222:       if ($ancestor->nodeName eq 'drawimage' || $ancestor->nodeName eq 'imageresponse') {
 1223:         $found_correct_ancestor = 1;
 1224:         last;
 1225:       }
 1226:       $ancestor = $ancestor->parentNode;
 1227:     }
 1228:     if ($found_correct_ancestor) {
 1229:       next;
 1230:     }
 1231:     # this really has to be renamed "img"
 1232:     $image->setNodeName('img');
 1233:   }
 1234: }
 1235: 
 1236: # Replaces many deprecated attributes and replaces them by equivalent CSS when possible
 1237: sub replace_deprecated_attributes_by_css {
 1238:   my ($root) = @_;
 1239:   
 1240:   fix_deprecated_in_tables($root);
 1241:   
 1242:   fix_deprecated_in_table_rows($root);
 1243:   
 1244:   fix_deprecated_in_table_cells($root);
 1245:   
 1246:   fix_deprecated_in_lists($root);
 1247:   
 1248:   fix_deprecated_in_list_items($root);
 1249:   
 1250:   fix_deprecated_in_hr($root);
 1251:   
 1252:   fix_deprecated_in_img($root);
 1253:   
 1254:   fix_deprecated_in_body($root);
 1255:   
 1256:   fix_align_attribute($root);
 1257: }
 1258: 
 1259: # Replaces deprecated attributes in tables
 1260: sub fix_deprecated_in_tables {
 1261:   my ($root) = @_;
 1262:   my @tables = $root->getElementsByTagName('table');
 1263:   foreach my $table (@tables) {
 1264:     tie (my %new_properties, 'Tie::IxHash', ());
 1265:     my $align = $table->getAttribute('align');
 1266:     if (defined $align) {
 1267:       $table->removeAttribute('align');
 1268:       $align = lc(trim($align));
 1269:     }
 1270:     if ($table->parentNode->nodeName eq 'center' || (defined $align && $align eq 'center') ||
 1271:         (defined $table->parentNode->getAttribute('align') && $table->parentNode->getAttribute('align') eq 'center')) {
 1272:       $new_properties{'margin-left'} = 'auto';
 1273:       $new_properties{'margin-right'} = 'auto';
 1274:     }
 1275:     if (defined $align && ($align eq 'left' || $align eq 'right')) {
 1276:       $new_properties{'float'} = $align;
 1277:     }
 1278:     my $width = $table->getAttribute('width');
 1279:     if (defined $width) {
 1280:       $table->removeAttribute('width');
 1281:       $width = trim($width);
 1282:       if ($width =~ /^[0-9]+$/) {
 1283:         $width .= 'px';
 1284:       }
 1285:       if ($width ne '') {
 1286:         $new_properties{'width'} = $width;
 1287:       }
 1288:     }
 1289:     my $height = $table->getAttribute('height');
 1290:     if (defined $height) {
 1291:       $table->removeAttribute('height');
 1292:       # no replacement for table height
 1293:     }
 1294:     my $bgcolor = $table->getAttribute('bgcolor');
 1295:     if (defined $bgcolor) {
 1296:       $table->removeAttribute('bgcolor');
 1297:       $bgcolor = trim($bgcolor);
 1298:       $bgcolor =~ s/^x\s*//;
 1299:       if ($bgcolor ne '') {
 1300:         $new_properties{'background-color'} = $bgcolor;
 1301:       }
 1302:     }
 1303:     my $frame = $table->getAttribute('frame');
 1304:     if (defined $frame) {
 1305:       $table->removeAttribute('frame');
 1306:       $frame = lc(trim($frame));
 1307:       if ($frame eq 'void') {
 1308:         $new_properties{'border'} = 'none';
 1309:       } elsif ($frame eq 'above') {
 1310:         $new_properties{'border-top'} = '1px solid black';
 1311:       } elsif ($frame eq 'below') {
 1312:         $new_properties{'border-bottom'} = '1px solid black';
 1313:       } elsif ($frame eq 'hsides') {
 1314:         $new_properties{'border-top'} = '1px solid black';
 1315:         $new_properties{'border-bottom'} = '1px solid black';
 1316:       } elsif ($frame eq 'vsides') {
 1317:         $new_properties{'border-left'} = '1px solid black';
 1318:         $new_properties{'border-right'} = '1px solid black';
 1319:       } elsif ($frame eq 'lhs') {
 1320:         $new_properties{'border-left'} = '1px solid black';
 1321:       } elsif ($frame eq 'rhs') {
 1322:         $new_properties{'border-right'} = '1px solid black';
 1323:       } elsif ($frame eq 'box') {
 1324:         $new_properties{'border'} = '1px solid black';
 1325:       } elsif ($frame eq 'border') {
 1326:         $new_properties{'border'} = '1px solid black';
 1327:       }
 1328:     }
 1329:     if (scalar(keys %new_properties) > 0) {
 1330:       set_css_properties($table, \%new_properties);
 1331:     }
 1332:     # we can't replace the border attribute without creating a style block, but we can improve things like border="BORDER"
 1333:     my $border = $table->getAttribute('border');
 1334:     if (defined $border) {
 1335:       $border = trim($border);
 1336:       if ($border !~ /^\s*[0-9]+\s*(px)?\s*$/) {
 1337:         $table->setAttribute('border', '1');
 1338:       }
 1339:     }
 1340:   }
 1341:   
 1342: }
 1343: 
 1344: # Replaces deprecated attributes in tr elements
 1345: sub fix_deprecated_in_table_rows {
 1346:   my ($root) = @_;
 1347:   my @trs = $root->getElementsByTagName('tr');
 1348:   foreach my $tr (@trs) {
 1349:     my $old_properties = get_css_properties($tr);
 1350:     tie (my %new_properties, 'Tie::IxHash', ());
 1351:     my $bgcolor = $tr->getAttribute('bgcolor');
 1352:     if (defined $bgcolor) {
 1353:       $tr->removeAttribute('bgcolor');
 1354:       if (!defined $old_properties->{'background-color'}) {
 1355:         $bgcolor = trim($bgcolor);
 1356:         $bgcolor =~ s/^x\s*//;
 1357:         if ($bgcolor ne '') {
 1358:           $new_properties{'background-color'} = $bgcolor;
 1359:         }
 1360:       }
 1361:     }
 1362:     my $align = $tr->getAttribute('align');
 1363:     if (defined $align && $align !~ /\s*char\s*/i) {
 1364:       $tr->removeAttribute('align');
 1365:       if (!defined $old_properties->{'text-align'}) {
 1366:         $align = lc(trim($align));
 1367:         if ($align ne '') {
 1368:           $new_properties{'text-align'} = $align;
 1369:         }
 1370:       }
 1371:     }
 1372:     my $valign = $tr->getAttribute('valign');
 1373:     if (defined $valign) {
 1374:       $tr->removeAttribute('valign');
 1375:       if (!defined $old_properties->{'vertical-align'}) {
 1376:         $valign = lc(trim($valign));
 1377:         if ($valign ne '') {
 1378:           $new_properties{'vertical-align'} = $valign;
 1379:         }
 1380:       }
 1381:     }
 1382:     if (scalar(keys %new_properties) > 0) {
 1383:       set_css_properties($tr, \%new_properties);
 1384:     }
 1385:   }
 1386: }
 1387: 
 1388: # Replaces deprecated attributes in table cells (td and th)
 1389: sub fix_deprecated_in_table_cells {
 1390:   my ($root) = @_;
 1391:   my @tds = $root->getElementsByTagName('td');
 1392:   my @ths = $root->getElementsByTagName('th');
 1393:   my @cells = (@tds, @ths);
 1394:   foreach my $cell (@cells) {
 1395:     my $old_properties = get_css_properties($cell);
 1396:     tie (my %new_properties, 'Tie::IxHash', ());
 1397:     my $width = $cell->getAttribute('width');
 1398:     if (defined $width) {
 1399:       $cell->removeAttribute('width');
 1400:       if (!defined $old_properties->{'width'}) {
 1401:         $width = trim($width);
 1402:         if ($width =~ /^[0-9]+$/) {
 1403:           $width .= 'px';
 1404:         }
 1405:         if ($width ne '') {
 1406:           $new_properties{'width'} = $width;
 1407:         }
 1408:       }
 1409:     }
 1410:     my $height = $cell->getAttribute('height');
 1411:     if (defined $height) {
 1412:       $cell->removeAttribute('height');
 1413:       if (!defined $old_properties->{'height'}) {
 1414:         $height = trim($height);
 1415:         if ($height =~ /^[0-9]+$/) {
 1416:           $height .= 'px';
 1417:         }
 1418:         if ($height ne '') {
 1419:           $new_properties{'height'} = $height;
 1420:         }
 1421:       }
 1422:     }
 1423:     my $bgcolor = $cell->getAttribute('bgcolor');
 1424:     if (defined $bgcolor) {
 1425:       $cell->removeAttribute('bgcolor');
 1426:       if (!defined $old_properties->{'background-color'}) {
 1427:         $bgcolor = trim($bgcolor);
 1428:         $bgcolor =~ s/^x\s*//;
 1429:         if ($bgcolor ne '') {
 1430:           $new_properties{'background-color'} = $bgcolor;
 1431:         }
 1432:       }
 1433:     }
 1434:     my $align = $cell->getAttribute('align');
 1435:     if (defined $align && $align !~ /\s*char\s*/i) {
 1436:       $cell->removeAttribute('align');
 1437:       if (!defined $old_properties->{'text-align'}) {
 1438:         $align = lc(trim($align));
 1439:         if ($align ne '') {
 1440:           $new_properties{'text-align'} = $align;
 1441:         }
 1442:       }
 1443:     }
 1444:     my $valign = $cell->getAttribute('valign');
 1445:     if (defined $valign) {
 1446:       $cell->removeAttribute('valign');
 1447:       if (!defined $old_properties->{'vertical-align'}) {
 1448:         $valign = lc(trim($valign));
 1449:         if ($valign ne '') {
 1450:           $new_properties{'vertical-align'} = $valign;
 1451:         }
 1452:       }
 1453:     }
 1454:     if (scalar(keys %new_properties) > 0) {
 1455:       set_css_properties($cell, \%new_properties);
 1456:     }
 1457:   }
 1458: }
 1459: 
 1460: # Replaces deprecated attributes in lists (ul and ol)
 1461: sub fix_deprecated_in_lists {
 1462:   my ($root) = @_;
 1463:   my @uls = $root->getElementsByTagName('ul');
 1464:   my @ols = $root->getElementsByTagName('ol');
 1465:   my @lists = (@uls, @ols);
 1466:   foreach my $list (@lists) {
 1467:     my $type = $list->getAttribute('type');
 1468:     if (defined $type) {
 1469:       my $lst = list_style_type($type);
 1470:       if (defined $lst) {
 1471:         $list->removeAttribute('type');
 1472:         if (!defined get_css_property($list, 'list-style-type')) {
 1473:           set_css_property($list, 'list-style-type', $lst);
 1474:         }
 1475:       }
 1476:     }
 1477:   }
 1478: }
 1479: 
 1480: # Replaces deprecated attributes in list items (li)
 1481: sub fix_deprecated_in_list_items {
 1482:   my ($root) = @_;
 1483:   my @lis = $root->getElementsByTagName('li');
 1484:   foreach my $li (@lis) {
 1485:     my $type = $li->getAttribute('type');
 1486:     if (defined $type) {
 1487:       my $lst = list_style_type($type);
 1488:       if (defined $lst) {
 1489:         $li->removeAttribute('type');
 1490:         if (!defined get_css_property($li, 'list-style-type')) {
 1491:           set_css_property($li, 'list-style-type', $lst);
 1492:         }
 1493:       }
 1494:     }
 1495:   }
 1496: }
 1497: 
 1498: # returns the CSS list-style-type value equivalent to the given type attribute for a list or list item
 1499: sub list_style_type {
 1500:   my ($type) = @_;
 1501:   my $value;
 1502:   $type = trim($type);
 1503:   if (lc($type) eq 'circle') {
 1504:     $value = 'circle';
 1505:   } elsif (lc($type) eq 'disc') {
 1506:     $value = 'disc';
 1507:   } elsif (lc($type) eq 'square') {
 1508:     $value = 'square';
 1509:   } elsif ($type eq 'a') {
 1510:     $value = 'lower-latin';
 1511:   } elsif ($type eq 'A') {
 1512:     $value = 'upper-latin';
 1513:   } elsif ($type eq 'i') {
 1514:     $value = 'lower-roman';
 1515:   } elsif ($type eq 'I') {
 1516:     $value = 'upper-roman';
 1517:   } elsif ($type eq '1') {
 1518:     $value = 'decimal';
 1519:   }
 1520:   return $value;
 1521: }
 1522: 
 1523: # Replaces deprecated attributes in hr
 1524: sub fix_deprecated_in_hr {
 1525:   my ($root) = @_;
 1526:   my @hrs = $root->getElementsByTagName('hr');
 1527:   foreach my $hr (@hrs) {
 1528:     tie (my %new_properties, 'Tie::IxHash', ());
 1529:     my $align = $hr->getAttribute('align');
 1530:     if (defined $align) {
 1531:       $align = lc(trim($align));
 1532:       if ($align eq 'left') {
 1533:         $new_properties{'text-align'} = 'left';
 1534:         $new_properties{'margin-left'} = '0';
 1535:       } elsif ($align eq 'right') {
 1536:         $new_properties{'text-align'} = 'right';
 1537:         $new_properties{'margin-right'} = '0';
 1538:       }
 1539:       $hr->removeAttribute('align');
 1540:     }
 1541:     my $color = $hr->getAttribute('color');
 1542:     if (defined $color) {
 1543:       $color = trim($color);
 1544:       $color =~ s/^x\s*//;
 1545:       if ($color ne '') {
 1546:         $new_properties{'color'} = $color;
 1547:         $new_properties{'background-color'} = $color;
 1548:       }
 1549:       $hr->removeAttribute('color');
 1550:     }
 1551:     my $noshade = $hr->getAttribute('noshade');
 1552:     my $size = $hr->getAttribute('size');
 1553:     if (defined $noshade) {
 1554:       $new_properties{'border-width'} = '0';
 1555:       if (!defined $color) {
 1556:         $new_properties{'color'} = 'gray';
 1557:         $new_properties{'background-color'} = 'gray';
 1558:       }
 1559:       if (!defined $size) {
 1560:         $size = '2';
 1561:       }
 1562:       $hr->removeAttribute('noshade');
 1563:     }
 1564:     if (defined $size) {
 1565:       $size = trim($size);
 1566:       if ($size ne '') {
 1567:         $new_properties{'height'} = $size.'px';
 1568:       }
 1569:       if (defined $hr->getAttribute('size')) {
 1570:         $hr->removeAttribute('size');
 1571:       }
 1572:     }
 1573:     my $width = $hr->getAttribute('width');
 1574:     if (defined $width) {
 1575:       $width = trim($width);
 1576:       if ($width ne '') {
 1577:         if ($width !~ /\%$/) {
 1578:           $width .= 'px';
 1579:         }
 1580:         $new_properties{'width'} = $width;
 1581:       }
 1582:       $hr->removeAttribute('width');
 1583:     }
 1584:     if (scalar(keys %new_properties) > 0) {
 1585:       set_css_properties($hr, \%new_properties);
 1586:     }
 1587:   }
 1588: }
 1589: 
 1590: # Replaces deprecated attributes in img
 1591: sub fix_deprecated_in_img {
 1592:   my ($root) = @_;
 1593:   my @imgs = $root->getElementsByTagName('img');
 1594:   foreach my $img (@imgs) {
 1595:     my $old_properties = get_css_properties($img);
 1596:     tie (my %new_properties, 'Tie::IxHash', ());
 1597:     my $align = $img->getAttribute('align');
 1598:     if (defined $align) {
 1599:       $align = lc(trim($align));
 1600:       if ($align eq 'middle' || $align eq 'top' || $align eq 'bottom') {
 1601:         $img->removeAttribute('align');
 1602:         if (!defined $old_properties->{'vertical-align'}) {
 1603:           $new_properties{'vertical-align'} = $align;
 1604:         }
 1605:       } elsif ($align eq 'left' || $align eq 'right') {
 1606:         $img->removeAttribute('align');
 1607:         if (!defined $old_properties->{'float'}) {
 1608:           $new_properties{'float'} = $align;
 1609:         }
 1610:       } elsif ($align eq 'center' || $align eq '') {
 1611:         $img->removeAttribute('align');
 1612:       }
 1613:     }
 1614:     my $border = $img->getAttribute('border');
 1615:     if (defined $border) {
 1616:       $border = lc(trim($border));
 1617:       if ($border =~ /^[0-9]+\s*(px)?$/) {
 1618:         $img->removeAttribute('border');
 1619:         if (!defined $old_properties->{'border'}) {
 1620:           if ($border !~ /px$/) {
 1621:             $border .= 'px';
 1622:           }
 1623:           $new_properties{'border'} = $border.' solid black';
 1624:         }
 1625:       }
 1626:     }
 1627:     my $hspace = $img->getAttribute('hspace');
 1628:     if (defined $hspace) {
 1629:       $hspace = lc(trim($hspace));
 1630:       if ($hspace =~ /^[0-9]+\s*(px)?$/) {
 1631:         $img->removeAttribute('hspace');
 1632:         if (!defined $old_properties->{'margin-left'} || !defined $old_properties->{'margin-right'}) {
 1633:           if ($hspace !~ /px$/) {
 1634:             $hspace .= 'px';
 1635:           }
 1636:           $new_properties{'margin-left'} = $hspace;
 1637:           $new_properties{'margin-right'} = $hspace;
 1638:         }
 1639:       }
 1640:     }
 1641:     if (scalar(keys %new_properties) > 0) {
 1642:       set_css_properties($img, \%new_properties);
 1643:     }
 1644:   }
 1645: }
 1646: 
 1647: # Replaces deprecated attributes in htmlbody (the style attribute could be used in a div for output)
 1648: sub fix_deprecated_in_body {
 1649:   my ($root) = @_;
 1650:   my $doc = $root->ownerDocument;
 1651:   my @bodies = $root->getElementsByTagName('htmlbody');
 1652:   foreach my $body (@bodies) {
 1653:     my $old_properties = get_css_properties($body);
 1654:     tie (my %new_properties, 'Tie::IxHash', ());
 1655:     my $bgcolor = $body->getAttribute('bgcolor');
 1656:     if (defined $bgcolor) {
 1657:       $body->removeAttribute('bgcolor');
 1658:       if (!defined $old_properties->{'background-color'}) {
 1659:         $bgcolor = trim($bgcolor);
 1660:         $bgcolor =~ s/^x\s*//;
 1661:         if ($bgcolor ne '') {
 1662:           $new_properties{'background-color'} = $bgcolor;
 1663:         }
 1664:       }
 1665:     }
 1666:     my $color = $body->getAttribute('text');
 1667:     if (defined $color) {
 1668:       $body->removeAttribute('text');
 1669:       if (!defined $old_properties->{'color'}) {
 1670:         $color = trim($color);
 1671:         $color =~ s/^x\s*//;
 1672:         if ($color ne '') {
 1673:           $new_properties{'color'} = $color;
 1674:         }
 1675:       }
 1676:     }
 1677:     my $background = $body->getAttribute('background');
 1678:     if (defined $background && ($background =~ /\.jpe?g$|\.gif|\.png/i)) {
 1679:       $body->removeAttribute('background');
 1680:       if (!defined $old_properties->{'background-image'}) {
 1681:         $background = trim($background);
 1682:         if ($background ne '') {
 1683:           $new_properties{'background-image'} = 'url('.$background.')';
 1684:         }
 1685:       }
 1686:     }
 1687:     # NOTE: these attributes have never been standard and are better removed with no replacement
 1688:     foreach my $bad ('bottommargin', 'leftmargin', 'rightmargin', 'topmargin', 'marginheight', 'marginwidth') {
 1689:       if ($body->hasAttribute($bad)) {
 1690:         $body->removeAttribute($bad);
 1691:       }
 1692:     }
 1693:     # NOTE: link alink and vlink require a <style> block to be converted
 1694:     my $link = $body->getAttribute('link');
 1695:     my $alink = $body->getAttribute('alink');
 1696:     my $vlink = $body->getAttribute('vlink');
 1697:     if (defined $link || defined $alink || defined $vlink) {
 1698:       my $head;
 1699:       my @heads = $root->getElementsByTagName('htmlhead');
 1700:       if (scalar(@heads) > 0) {
 1701:         $head = $heads[0];
 1702:       } else {
 1703:         $head = $doc->createElement('htmlhead');
 1704:         $root->insertBefore($head, $root->firstChild);
 1705:       }
 1706:       my $style = $doc->createElement('style');
 1707:       $head->appendChild($style);
 1708:       my $css = "\n";
 1709:       if (defined $link) {
 1710:         $body->removeAttribute('link');
 1711:         $link = trim($link);
 1712:         $link =~ s/^x\s*//;
 1713:         $css .= '      a:link { color:'.$link.' }';
 1714:         $css .= "\n";
 1715:       }
 1716:       if (defined $alink) {
 1717:         $body->removeAttribute('alink');
 1718:         $alink = trim($alink);
 1719:         $alink =~ s/^x\s*//;
 1720:         $css .= '      a:active { color:'.$alink.' }';
 1721:         $css .= "\n";
 1722:       }
 1723:       if (defined $vlink) {
 1724:         $body->removeAttribute('vlink');
 1725:         $vlink = trim($vlink);
 1726:         $vlink =~ s/^x\s*//;
 1727:         $css .= '      a:visited { color:'.$vlink.' }';
 1728:         $css .= "\n";
 1729:       }
 1730:       $css .= '    ';
 1731:       $style->appendChild($doc->createTextNode($css));
 1732:     }
 1733:     if (scalar(keys %new_properties) > 0) {
 1734:       set_css_properties($body, \%new_properties);
 1735:     } elsif (!$body->hasAttributes) {
 1736:       $body->parentNode->removeChild($body);
 1737:     }
 1738:   }
 1739: }
 1740: 
 1741: # replaces <div align="center"> by <div style="text-align:center;">
 1742: # also for p and h1..h6
 1743: sub fix_align_attribute {
 1744:   my ($root) = @_;
 1745:   my @nodes = $root->getElementsByTagName('div');
 1746:   push(@nodes, $root->getElementsByTagName('p'));
 1747:   for (my $i=1; $i<=6; $i++) {
 1748:     push(@nodes, $root->getElementsByTagName('h'.$i));
 1749:   }
 1750:   foreach my $node (@nodes) {
 1751:     my $align = $node->getAttribute('align');
 1752:     if (defined $align) {
 1753:       $node->removeAttribute('align');
 1754:       $align = trim($align);
 1755:       if ($align ne '' && !defined get_css_property($node, 'text-align')) {
 1756:         set_css_property($node, 'text-align', lc($align));
 1757:       }
 1758:     }
 1759:   }
 1760: }
 1761: 
 1762: # replace center by a div or remove it if there is a table inside
 1763: sub replace_center {
 1764:   my ($root, $all_block) = @_;
 1765:   my $doc = $root->ownerDocument;
 1766:   my @centers = $root->getElementsByTagName('center');
 1767:   foreach my $center (@centers) {
 1768:     if ($center->getChildrenByTagName('table')->size() > 0) { # note: getChildrenByTagName is not DOM (LibXML specific)
 1769:       replace_by_children($center);
 1770:     } else {
 1771:       if ((!defined $center->previousSibling ||
 1772:           ($center->previousSibling->nodeType == XML_TEXT_NODE && $center->previousSibling->nodeValue =~ /^\s*$/ && !defined $center->previousSibling->previousSibling)) &&
 1773:           (!defined $center->nextSibling ||
 1774:           ($center->nextSibling->nodeType == XML_TEXT_NODE && $center->nextSibling->nodeValue =~ /^\s*$/ && !defined $center->nextSibling->nextSibling)) &&
 1775:           string_in_array(\@accepting_style, $center->parentNode->nodeName)) {
 1776:         # use CSS on the parent block and replace center by its children
 1777:         set_css_property($center->parentNode, 'text-align', 'center');
 1778:         replace_by_children($center);
 1779:       } else {
 1780:         # use p or div ? check if there is a block inside
 1781:         my $found_block = 0;
 1782:         for (my $child=$center->firstChild; defined $child; $child=$child->nextSibling) {
 1783:           if ($child->nodeType == XML_ELEMENT_NODE && string_in_array($all_block, $child->nodeName)) {
 1784:             $found_block = 1;
 1785:             last;
 1786:           }
 1787:         }
 1788:         my $new_node;
 1789:         if ($found_block) {
 1790:           $new_node = $doc->createElement('div');
 1791:           $new_node->setAttribute('style', 'text-align: center; margin: 0 auto');
 1792:         } else {
 1793:           $new_node = $doc->createElement('p');
 1794:           $new_node->setAttribute('style', 'text-align: center');
 1795:         }
 1796:         my $next;
 1797:         for (my $child=$center->firstChild; defined $child; $child=$next) {
 1798:           $next = $child->nextSibling;
 1799:           $center->removeChild($child);
 1800:           $new_node->appendChild($child);
 1801:         }
 1802:         $center->parentNode->replaceChild($new_node, $center);
 1803:       }
 1804:     }
 1805:   }
 1806: }
 1807: 
 1808: # replaces <nobr> by <span style="white-space:nowrap">
 1809: sub replace_nobr {
 1810:   my ($root) = @_;
 1811:   my @nobrs = $root->getElementsByTagName('nobr');
 1812:   foreach my $nobr (@nobrs) {
 1813:     if (!defined $nobr->previousSibling && !defined $nobr->nextSibling &&
 1814:         string_in_array(\@accepting_style, $nobr->parentNode->nodeName)) {
 1815:       # use CSS on the parent block
 1816:       set_css_property($nobr->parentNode, 'white-space', 'nowrap');
 1817:       replace_by_children($nobr);
 1818:     } else {
 1819:       $nobr->setNodeName('span');
 1820:       $nobr->setAttribute('style', 'white-space:nowrap');
 1821:     }
 1822:   }
 1823: }
 1824: 
 1825: # removes notsolved tags in the case <hintgroup showoncorrect="no"><notsolved>...</notsolved></hintgroup>
 1826: # and in the case <notsolved><hintgroup showoncorrect="no">...</hintgroup></notsolved>
 1827: sub remove_useless_notsolved {
 1828:   my ($root) = @_;
 1829:   my @hintgroups = $root->getElementsByTagName('hintgroup');
 1830:   foreach my $hintgroup (@hintgroups) {
 1831:     my $showoncorrect = get_non_empty_attribute($hintgroup, 'showoncorrect');
 1832:     if (!defined $showoncorrect || $showoncorrect eq 'no') {
 1833:       my @notsolveds = $hintgroup->getElementsByTagName('notsolved');
 1834:       foreach my $notsolved (@notsolveds) {
 1835:         replace_by_children($notsolved);
 1836:       }
 1837:     }
 1838:     my $parent = $hintgroup->parentNode;
 1839:     if ($parent->nodeName eq 'notsolved' && scalar(@{$parent->nonBlankChildNodes()}) == 1) {
 1840:       replace_by_children($parent);
 1841:     }
 1842:   }
 1843: }
 1844: 
 1845: # Use <pre> for multi-line comments without elements.
 1846: sub fix_comments {
 1847:   my ($root) = @_;
 1848:   my $doc = $root->ownerDocument;
 1849:   my @comments = $root->getElementsByTagName('comment');
 1850:   foreach my $comment (@comments) {
 1851:     my $first = $comment->firstChild;
 1852:     if (defined $first) {
 1853:       if ($first->nodeType == XML_TEXT_NODE && $first->nodeValue =~ /\n/ &&
 1854:           !defined $first->nextSibling) {
 1855:         my $pre = $doc->createElement('pre');
 1856:         $comment->removeChild($first);
 1857:         $comment->appendChild($pre);
 1858:         $pre->appendChild($first);
 1859:       }
 1860:     }
 1861:   }
 1862: }
 1863: 
 1864: # adds a paragraph inside if needed and calls fix_paragraph for all paragraphs (including new ones)
 1865: sub fix_paragraphs_inside {
 1866:   my ($node, $all_block) = @_;
 1867:   # blocks in which paragrahs will be added:
 1868:   my @blocks_with_p = ('loncapa','library','problem','part','problemtype','window','block','while','postanswerdate','preduedate','languageblock','instructorcomment','togglebox','standalone','body','form');
 1869:   my @fix_p_if_br_or_p = (@responses,'foil','item','text','label','hintgroup','hintpart','hint','web','windowlink','div','li','dd','td','th','blockquote','solved','notsolved');
 1870:   if ((string_in_array(\@blocks_with_p, $node->nodeName) && paragraph_needed($node)) ||
 1871:       (string_in_array(\@fix_p_if_br_or_p, $node->nodeName) && paragraph_inside($node))) {
 1872:     # if non-empty, add paragraphs where needed between all br and remove br
 1873:     # (it would be easier to just put everything in a p and fix it afterwards, but there are performance issues
 1874:     #  when a paragraph has many blocks directly inside)
 1875:     my $doc = $node->ownerDocument;
 1876:     my $p = undef;
 1877:     my @new_children = ();
 1878:     my $next;
 1879:     for (my $child=$node->firstChild; defined $child; $child=$next) {
 1880:       $next = $child->nextSibling;
 1881:       $node->removeChild($child);
 1882:       if ($child->nodeType == XML_ELEMENT_NODE && $child->nodeName eq 'br') {
 1883:         if (defined $p) {
 1884:           push(@new_children, $p);
 1885:         } else {
 1886:           push(@new_children, $doc->createElement('p'));
 1887:         }
 1888:         $p = undef;
 1889:         # ignore the next node if it is a br (the paragraph default margin will take as much space)
 1890:         # (ignoring whitespace)
 1891:         while (defined $next && $next->nodeType == XML_TEXT_NODE && $next->nodeValue =~ /^[ \t\f\n\r]*$/) {
 1892:           my $next2 = $next->nextSibling;
 1893:           $node->removeChild($next);
 1894:           $next = $next2;
 1895:         }
 1896:         if (defined $next && $next->nodeType == XML_ELEMENT_NODE && $next->nodeName eq 'br') {
 1897:           my $next2 = $next->nextSibling;
 1898:           $node->removeChild($next);
 1899:           $next = $next2;
 1900:         }
 1901:       } elsif ($child->nodeType == XML_ELEMENT_NODE && string_in_array(\@inline_like_block, $child->nodeName)) {
 1902:         # inline_like_block: use the paragraph if there is one, otherwise do not create one
 1903:         if (defined $p) {
 1904:           $p->appendChild($child);
 1905:         } else {
 1906:           push(@new_children, $child);
 1907:         }
 1908:       } elsif ($child->nodeType == XML_ELEMENT_NODE && string_in_array($all_block, $child->nodeName)) {
 1909:         # these children are blocks and should not be in a paragraph
 1910:         if (defined $p) {
 1911:           push(@new_children, $p);
 1912:           $p = undef;
 1913:         }
 1914:         push(@new_children, $child);
 1915:       } elsif ($child->nodeType == XML_TEXT_NODE && $child->nodeValue =~ /^[ \t\f\n\r]*$/) {
 1916:         # blank text: add to paragraph if there is one and there is a next node, otherwise keep out of the paragraph
 1917:         if (defined $p) {
 1918:           if (defined $next) {
 1919:             $p->appendChild($child);
 1920:           } else {
 1921:             push(@new_children, $p);
 1922:             $p = undef;
 1923:             push(@new_children, $child);
 1924:           }
 1925:         } else {
 1926:           push(@new_children, $child);
 1927:         }
 1928:       } elsif ($child->nodeType == XML_TEXT_NODE ||
 1929:             $child->nodeType == XML_ELEMENT_NODE || $child->nodeType == XML_CDATA_SECTION_NODE ||
 1930:             $child->nodeType == XML_ENTITY_NODE || $child->nodeType == XML_ENTITY_REF_NODE) {
 1931:         # these children require a paragraph
 1932:         if (!defined $p) {
 1933:           $p = $doc->createElement('p');
 1934:         }
 1935:         $p->appendChild($child);
 1936:       } else {
 1937:         # these children do not require a paragraph (XML comments, PI)
 1938:         # -> do not move them in a new paragraph
 1939:         if (defined $p) {
 1940:           push(@new_children, $p);
 1941:           $p = undef;
 1942:         }
 1943:         push(@new_children, $child);
 1944:       }
 1945:     }
 1946:     if (defined $p) {
 1947:       push(@new_children, $p);
 1948:     }
 1949:     foreach my $child (@new_children) {
 1950:       $node->appendChild($child);
 1951:     }
 1952:   }
 1953:   # now fix the paragraphs everywhere, so that all inline nodes are inside a paragraph, and block nodes are outside
 1954:   my $next;
 1955:   for (my $child=$node->firstChild; defined $child; $child=$next) {
 1956:     $next = $child->nextSibling;
 1957:     if ($child->nodeType == XML_ELEMENT_NODE && defined $child->firstChild) {
 1958:       if ($child->nodeName eq 'p') {
 1959:         fix_paragraph($child, $all_block);
 1960:       } else {
 1961:         fix_paragraphs_inside($child, $all_block);
 1962:       }
 1963:     }
 1964:   }
 1965: }
 1966: 
 1967: # returns 1 if a paragraph is needed inside this node (assuming the parent can have paragraphs)
 1968: sub paragraph_needed {
 1969:   my ($node) = @_;
 1970:   for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
 1971:     if (($child->nodeType == XML_TEXT_NODE && $child->nodeValue !~ /^\s*$/) ||
 1972:         ($child->nodeType == XML_ELEMENT_NODE && !string_in_array(\@inline_like_block, $child->nodeName)) ||
 1973:         $child->nodeType == XML_CDATA_SECTION_NODE ||
 1974:         $child->nodeType == XML_ENTITY_NODE || $child->nodeType == XML_ENTITY_REF_NODE) {
 1975:       return(1);
 1976:     }
 1977:   }
 1978:   return(0);
 1979: }
 1980: 
 1981: # returns 1 if there is a paragraph or br in a child of this node, or inside an inline child
 1982: sub paragraph_inside {
 1983:   my ($node) = @_;
 1984:   # inline elements that can be split in half if there is a paragraph inside (currently all HTML):
 1985:   # (also used in first_block below)
 1986:   my @splitable_inline = ('span', 'a', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'code', 'kbd', 'samp', 'tt', 'ins', 'del', 'var', 'small', 'big', 'font', 'u');
 1987:   for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
 1988:     if ($child->nodeType == XML_ELEMENT_NODE) {
 1989:       my $name = $child->nodeName;
 1990:       if ($name eq 'p' || $name eq 'br') {
 1991:         return(1);
 1992:       } elsif (string_in_array(\@splitable_inline, $name)) {
 1993:         if (paragraph_inside($child)) {
 1994:           return(1);
 1995:         }
 1996:       }
 1997:     }
 1998:   }
 1999:   return(0);
 2000: }
 2001: 
 2002: # fixes paragraphs inside paragraphs (without a block in-between)
 2003: sub fix_paragraph {
 2004:   my ($p, $all_block) = @_;
 2005:   my $loop_right = 1; # this loops is to avoid out of memory errors with recurse, see below
 2006:   while ($loop_right) {
 2007:     $loop_right = 0;
 2008:     my $block = find_first_block($p, $all_block);
 2009:     if (defined $block) {
 2010:       my $trees = clone_ancestor_around_node($p, $block);
 2011:       my $doc = $p->ownerDocument;
 2012:       my $replacement = $doc->createDocumentFragment();
 2013:       my $left = $trees->{'left'};
 2014:       my $middle = $trees->{'middle'};
 2015:       my $right = $trees->{'right'};
 2016:       my $left_needs_p = 0; # 1 if it needs a paragraph (used to replace br later)
 2017:       
 2018:       if (defined $left) {
 2019:         # fix paragraphs inside, in case one of the descendants can have paragraphs inside (like numericalresponse/hintgroup):
 2020:         for (my $child=$left->firstChild; defined $child; $child=$child->nextSibling) {
 2021:           if ($child->nodeType == XML_ELEMENT_NODE) {
 2022:             fix_paragraphs_inside($child, $all_block);
 2023:           }
 2024:         }
 2025:         if (!paragraph_needed($left)) {
 2026:           # this was just blank text, comments or inline responses, it should not create a new paragraph
 2027:           my $next;
 2028:           for (my $child=$left->firstChild; defined $child; $child=$next) {
 2029:             $next = $child->nextSibling;
 2030:             $left->removeChild($child);
 2031:             $replacement->appendChild($child);
 2032:           }
 2033:         } else {
 2034:           $left_needs_p = 1;
 2035:           $replacement->appendChild($left);
 2036:         }
 2037:       }
 2038:       
 2039:       my $n = $middle->firstChild;
 2040:       while (defined $n) {
 2041:         if ($n->nodeType == XML_ELEMENT_NODE && (string_in_array($all_block, $n->nodeName) || $n->nodeName eq 'br')) {
 2042:           if ($n->nodeName eq 'p') {
 2043:             my $parent = $n->parentNode;
 2044:             # first apply recursion
 2045:             fix_paragraph($n, $all_block);
 2046:             # now the p might have been replaced by several nodes, which should replace the initial p
 2047:             my $next_block;
 2048:             for (my $block=$parent->firstChild; defined $block; $block=$next_block) {
 2049:               $next_block = $block->nextSibling;
 2050:               if ($block->nodeName eq 'p') {
 2051:                 $parent->removeChild($block);
 2052:                 # for each parent before $middle, clone in-between the p and its children (to preserve the styles)
 2053:                 if (defined $block->firstChild) {
 2054:                   for (my $p=$parent; $p!=$middle; $p=$p->parentNode) {
 2055:                     my $newp = $p->cloneNode(0);
 2056:                     my $next;
 2057:                     for (my $child=$block->firstChild; defined $child; $child=$next) {
 2058:                       $next = $child->nextSibling;
 2059:                       $block->removeChild($child);
 2060:                       $newp->appendChild($child);
 2061:                     }
 2062:                     $block->appendChild($newp);
 2063:                   }
 2064:                 }
 2065:               }
 2066:               $replacement->appendChild($block);
 2067:             }
 2068:           } else {
 2069:             # replace the whole p by this block, forgetting about intermediate inline elements
 2070:             $n->parentNode->removeChild($n);
 2071:             if ($n->nodeName eq 'br') {
 2072:               # replace a br by a paragraph if there was nothing before in the paragraph,
 2073:               # otherwise remove it because it already broke the paragraph in half
 2074:               if (!defined $left || !$left_needs_p) {
 2075:                 $replacement->appendChild($middle);
 2076:               }
 2077:               # ignore the next node if it is a br (the paragraph default margin will take as much space)
 2078:               my $first_right;
 2079:               if (defined $right) {
 2080:                 $first_right = $right->firstChild;
 2081:                 # ignore non-nbsp whitespace
 2082:                 while (defined $first_right && $first_right->nodeType == XML_TEXT_NODE &&
 2083:                     $first_right->nodeValue =~ /^[ \t\f\n\r]*$/) {
 2084:                   $first_right = $first_right->nextSibling;
 2085:                 }
 2086:               }
 2087:               if (defined $first_right && $first_right->nodeType == XML_ELEMENT_NODE &&
 2088:                   $first_right->nodeName eq 'br') {
 2089:                 $right->removeChild($first_right);
 2090:               }
 2091:             } else {
 2092:               fix_paragraphs_inside($n, $all_block);
 2093:               $replacement->appendChild($n);
 2094:             }
 2095:           }
 2096:           last;
 2097:         }
 2098:         $n = $n->firstChild;
 2099:         if (defined $n && defined $n->nextSibling) {
 2100:           die "Error in post_xml.fix_paragraph: block not found";
 2101:         }
 2102:       }
 2103:       
 2104:       if (defined $right) {
 2105:         if ($block->nodeName eq 'p') {
 2106:           # remove attributes on the right paragraph
 2107:           my @attributelist = $right->attributes();
 2108:           foreach my $att (@attributelist) {
 2109:             $right->removeAttribute($att->nodeName);
 2110:           }
 2111:         }
 2112:         if ($right->firstChild->nodeType == XML_TEXT_NODE && $right->firstChild->nodeValue =~ /^[ \t\f\n\r]*$/) {
 2113:           # remove the first text node with whitespace only from the p, it should not trigger the creation of a p
 2114:           # (but take nbsp into account, so we should not use \s here)
 2115:           my $first = $right->firstChild;
 2116:           $right->removeChild($first);
 2117:           $replacement->appendChild($first);
 2118:         }
 2119:         if (defined $right->firstChild) {
 2120:           if (paragraph_needed($right)) {
 2121:             $replacement->appendChild($right);
 2122:             #fix_paragraph($right, $all_block); This is taking way too much memory for blocks with many children
 2123:             # -> loop instead of recurse
 2124:             $loop_right = 1;
 2125:           } else {
 2126:             # this was just blank text, comments or inline responses, it should not create a new paragraph
 2127:             my $next;
 2128:             for (my $child=$right->firstChild; defined $child; $child=$next) {
 2129:               $next = $child->nextSibling;
 2130:               $right->removeChild($child);
 2131:               $replacement->appendChild($child);
 2132:               # fix paragraphs inside, in case one of the descendants can have paragraphs inside (like numericalresponse/hintgroup):
 2133:               if ($child->nodeType == XML_ELEMENT_NODE) {
 2134:                 fix_paragraphs_inside($child, $all_block);
 2135:               }
 2136:             }
 2137:           }
 2138:         }
 2139:       }
 2140:       
 2141:       $p->parentNode->replaceChild($replacement, $p);
 2142:       
 2143:       if ($loop_right) {
 2144:         $p = $right;
 2145:       }
 2146:       
 2147:     } else {
 2148:       # fix paragraphs inside, in case one of the descendants can have paragraphs inside (like numericalresponse/hintgroup):
 2149:       my $next;
 2150:       for (my $child=$p->firstChild; defined $child; $child=$next) {
 2151:         $next = $child->nextSibling;
 2152:         if ($child->nodeType == XML_ELEMENT_NODE) {
 2153:           fix_paragraphs_inside($child, $all_block);
 2154:         }
 2155:       }
 2156:     }
 2157:   }
 2158: }
 2159: 
 2160: sub find_first_block {
 2161:   my ($node, $all_block) = @_;
 2162:   # inline elements that can be split in half if there is a paragraph inside (currently all HTML):
 2163:   my @splitable_inline = ('span', 'a', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'code', 'kbd', 'samp', 'tt', 'ins', 'del', 'var', 'small', 'big', 'font', 'u');
 2164:   for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
 2165:     if ($child->nodeType == XML_ELEMENT_NODE) {
 2166:       if (string_in_array($all_block, $child->nodeName) || $child->nodeName eq 'br') {
 2167:         return($child);
 2168:       }
 2169:       if (string_in_array(\@splitable_inline, $child->nodeName)) {
 2170:         my $block = find_first_block($child, $all_block);
 2171:         if (defined $block) {
 2172:           return($block);
 2173:         }
 2174:       }
 2175:     }
 2176:   }
 2177:   return(undef);
 2178: }
 2179: 
 2180: # Creates clones of the ancestor containing the descendants before the node, at the node, and after the node.
 2181: # returns a hash with: left, middle, right (left and right can be undef)
 2182: sub clone_ancestor_around_node {
 2183:   my ($ancestor, $node) = @_;
 2184:   my $middle_node;
 2185:   my ($left, $middle, $right);
 2186:   for (my $child=$ancestor->firstChild; defined $child; $child=$child->nextSibling) {
 2187:     if ($child == $node || is_ancestor_of($child, $node)) {
 2188:       $middle_node = $child;
 2189:       last;
 2190:     }
 2191:   }
 2192:   if (!defined $middle_node) {
 2193:     die "error in split_ancestor_around_node: middle not found";
 2194:   }
 2195:   if (defined $middle_node->previousSibling) {
 2196:     $left = $ancestor->cloneNode(0);
 2197:     for (my $child=$ancestor->firstChild; $child != $middle_node; $child=$child->nextSibling) {
 2198:       $left->appendChild($child->cloneNode(1));
 2199:     }
 2200:   }
 2201:   $middle = $ancestor->cloneNode(0);
 2202:   if ($middle_node == $node) {
 2203:     $middle->appendChild($middle_node->cloneNode(1));
 2204:   } else {
 2205:     my $subres = clone_ancestor_around_node($middle_node, $node);
 2206:     my $subleft = $subres->{'left'};
 2207:     if (defined $subleft) {
 2208:       if (!defined $left) {
 2209:         $left = $ancestor->cloneNode(0);
 2210:       }
 2211:       $left->appendChild($subleft);
 2212:     }
 2213:     $middle->appendChild($subres->{'middle'});
 2214:     my $subright = $subres->{'right'};
 2215:     if (defined $subright) {
 2216:       $right = $ancestor->cloneNode(0);
 2217:       $right->appendChild($subright);
 2218:     }
 2219:   }
 2220:   if (defined $middle_node->nextSibling) {
 2221:     if (!defined $right) {
 2222:       $right = $ancestor->cloneNode(0);
 2223:     }
 2224:     for (my $child=$middle_node->nextSibling; defined $child; $child=$child->nextSibling) {
 2225:       $right->appendChild($child->cloneNode(1));
 2226:     }
 2227:   }
 2228:   my %result = ();
 2229:   $result{'left'} = $left;
 2230:   $result{'middle'} = $middle;
 2231:   $result{'right'} = $right;
 2232:   return(\%result);
 2233: }
 2234: 
 2235: sub is_ancestor_of {
 2236:   my ($n1, $n2) = @_;
 2237:   my $n = $n2->parentNode;
 2238:   while (defined $n) {
 2239:     if ($n == $n1) {
 2240:       return(1);
 2241:     }
 2242:     $n = $n->parentNode;
 2243:   }
 2244:   return(0);
 2245: }
 2246: 
 2247: # removes empty style elements and replaces the ones with only whitespaces inside by their content
 2248: # also remove hints that have become empty after empty style removal.
 2249: sub remove_empty_style {
 2250:   my ($root) = @_;
 2251:   # actually, preserve some elements like ins when they have whitespace, only remove if they are empty
 2252:   my @remove_if_empty = ('span', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'code', 'kbd', 'samp', 'tt', 'ins', 'del', 'var', 'small', 'big', 'font', 'u', 'hint');
 2253:   my @remove_if_blank = ('span', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'tt', 'var', 'small', 'big', 'font', 'u', 'hint');
 2254:   foreach my $name (@remove_if_empty) {
 2255:     my @nodes = $root->getElementsByTagName($name);
 2256:     while (scalar(@nodes) > 0) {
 2257:       my $node = pop(@nodes);
 2258:       if (!defined $node->firstChild) {
 2259:         my $parent = $node->parentNode;
 2260:         if (defined $node->previousSibling && $node->previousSibling->nodeType == XML_TEXT_NODE &&
 2261:             $node->previousSibling->nodeValue =~ /\$\S*$/) {
 2262:           # case $a<sup></sup>x
 2263:           my $value = $node->previousSibling->nodeValue;
 2264:           $value =~ s/\$(\S*)$/\$\{$1\}/;
 2265:           $node->previousSibling->setData($value);
 2266:         }
 2267:         $parent->removeChild($node);
 2268:         $parent->normalize();
 2269:         # now that we removed the node, check if the parent has become an empty style, and so on
 2270:         while (defined $parent && string_in_array(\@remove_if_empty, $parent->nodeName) && !defined $parent->firstChild) {
 2271:           my $grandparent = $parent->parentNode;
 2272:           $grandparent->removeChild($parent);
 2273:           remove_reference_from_array(\@nodes, $parent);
 2274:           $parent = $grandparent;
 2275:         }
 2276:       }
 2277:     }
 2278:   }
 2279:   foreach my $name (@remove_if_blank) {
 2280:     my @nodes = $root->getElementsByTagName($name);
 2281:     while (scalar(@nodes) > 0) {
 2282:       my $node = pop(@nodes);
 2283:       if (defined $node->firstChild && !defined $node->firstChild->nextSibling && $node->firstChild->nodeType == XML_TEXT_NODE) {
 2284:         # NOTE: careful, with UTF-8, \s matches non-breaking spaces and we want to preserve these
 2285:         if ($node->firstChild->nodeValue =~ /^[\t\n\f\r ]*$/) {
 2286:           my $parent = $node->parentNode;
 2287:           replace_by_children($node);
 2288:           $parent->normalize();
 2289:           # now that we removed the node, check if the parent has become a style with only whitespace, and so on
 2290:           while (defined $parent && string_in_array(\@remove_if_blank, $parent->nodeName) &&
 2291:               (!defined $parent->firstChild ||
 2292:               (!defined $parent->firstChild->nextSibling && $parent->firstChild->nodeType == XML_TEXT_NODE &&
 2293:               $parent->firstChild->nodeValue =~ /^^[\t\n\f\r ]*$/))) {
 2294:             my $grandparent = $parent->parentNode;
 2295:             replace_by_children($parent);
 2296:             remove_reference_from_array(\@nodes, $parent);
 2297:             $parent = $grandparent;
 2298:           }
 2299:         }
 2300:       }
 2301:     }
 2302:   }
 2303: }
 2304: 
 2305: # remove whitespace inside LON-CAPA elements that have an empty content-model (HTML ones are handled by html_to_xml)
 2306: sub fix_empty_lc_elements {
 2307:   my ($node) = @_;
 2308:   my @lcempty = ('arc','axis','backgroundplot','drawoptionlist','drawvectorsum','fill','functionplotrule','functionplotvectorrule','functionplotvectorsumrule','hiddenline','hiddensubmission','key','line','location','organicstructure','parameter','plotobject','plotvector','responseparam','spline','textline');
 2309:   if (string_in_array(\@lcempty, $node->nodeName)) {
 2310:     if (defined $node->firstChild && !defined $node->firstChild->nextSibling &&
 2311:         $node->firstChild->nodeType == XML_TEXT_NODE && $node->firstChild->nodeValue =~ /^\s*$/) {
 2312:       $node->removeChild($node->firstChild);
 2313:     }
 2314:     if (defined $node->firstChild) {
 2315:       if ($warnings) {
 2316:         print "Warning: a ".$node->nodeName." has something inside\n";
 2317:       }
 2318:     }
 2319:     return;
 2320:   }
 2321:   for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
 2322:     if ($child->nodeType == XML_ELEMENT_NODE) {
 2323:       fix_empty_lc_elements($child);
 2324:     }
 2325:   }
 2326: }
 2327: 
 2328: # remove consecutive empty paragraphs (they will not show anyway)
 2329: sub reduce_empty_p {
 2330:   my ($node) = @_;
 2331:   my $next;
 2332:   for (my $child=$node->firstChild; defined $child; $child=$next) {
 2333:     $next = $child->nextSibling;
 2334:     while (defined $next && $next->nodeType == XML_TEXT_NODE && $next->nodeValue =~ /^[ \t\f\n\r]*$/) {
 2335:       $next = $next->nextSibling;
 2336:     }
 2337:     if ($child->nodeType == XML_ELEMENT_NODE && $child->nodeName eq 'p' && defined $next &&
 2338:         $next->nodeType == XML_ELEMENT_NODE && $next->nodeName eq 'p') {
 2339:       my $first = $child->firstChild;
 2340:       if (!defined $first || (!defined $first->nextSibling &&
 2341:           $first->nodeType == XML_TEXT_NODE && $first->nodeValue =~ /^[ \t\f\n\r]*$/)) {
 2342:         $first = $next->firstChild;
 2343:         if (!defined $first || (!defined $first->nextSibling &&
 2344:             $first->nodeType == XML_TEXT_NODE && $first->nodeValue =~ /^[ \t\f\n\r]*$/)) {
 2345:           $node->removeChild($child);
 2346:         }
 2347:       }
 2348:     }
 2349:     if ($child->nodeType == XML_ELEMENT_NODE) {
 2350:       reduce_empty_p($child);
 2351:     }
 2352:   }
 2353: }
 2354: 
 2355: # turn some attribute values into lowercase when they should be
 2356: sub lowercase_attribute_values {
 2357:   my ($root) = @_;
 2358:   my @with_yesno = (['radiobuttonresponse', ['randomize']],
 2359:                     ['optionresponse', ['randomize']],
 2360:                     ['matchresponse', ['randomize']],
 2361:                     ['itemgroup', ['randomize']],
 2362:                     ['rankresponse', ['randomize']],
 2363:                     ['functionplotresponse', ['xaxisvisible', 'yaxisvisible', 'gridvisible']],
 2364:                     ['backgroundplot', ['fixed']],
 2365:                     ['drawvectorsum', ['showvalue']],
 2366:                     ['textline', ['readonly']],
 2367:                     ['hint', ['showoncorrect']],
 2368:                     ['body', ['dir']],
 2369:                     ['img', ['encrypturl']],
 2370:                     ['form', ['method']],
 2371:                     ['input', ['type']]
 2372:                    );
 2373:   foreach my $el_attributes (@with_yesno) {
 2374:     my $el_name = $el_attributes->[0];
 2375:     my @elements = $root->getElementsByTagName($el_name);
 2376:     foreach my $element (@elements) {
 2377:       my $att_list = $el_attributes->[1];
 2378:       foreach my $att_name (@$att_list) {
 2379:         my $att_value = $element->getAttribute($att_name);
 2380:         if (!defined $att_value) {
 2381:           next;
 2382:         }
 2383:         if ($att_value eq 'yes' || $att_value eq 'no') {
 2384:           next;
 2385:         }
 2386:         if ($att_value =~ /\s*yes\s*/i) {
 2387:           $element->setAttribute($att_name, 'yes');
 2388:         } elsif ($att_value =~ /\s*no\s*/i) {
 2389:           $element->setAttribute($att_name, 'no');
 2390:         }
 2391:       }
 2392:     }
 2393:   }
 2394: }
 2395: 
 2396: # fixes spelling mistakes for numericalresponse/@unit
 2397: sub replace_numericalresponse_unit_attribute {
 2398:   my ($root) = @_;
 2399:   my @numericalresponses = $root->getElementsByTagName('numericalresponse');
 2400:   foreach my $numericalresponse (@numericalresponses) {
 2401:     if (defined $numericalresponse->getAttribute('units') && !defined $numericalresponse->getAttribute('unit')) {
 2402:       $numericalresponse->setAttribute('unit', $numericalresponse->getAttribute('units'));
 2403:       $numericalresponse->removeAttribute('units');
 2404:     }
 2405:   }
 2406:   
 2407: }
 2408: 
 2409: # Replaces &format and &prettyprint by <num> whenever possible.
 2410: # Also replaces &chemparse by <chem>.
 2411: # If the function call is enclosed in <display>, the <display> element is removed.
 2412: sub replace_functions_by_elements {
 2413:   my ($root) = @_;
 2414:   my $doc = $root->ownerDocument;
 2415:   my @preserve = ('script','answer','parse','m','tm','dtm','numericalhintscript'); # display is handled later
 2416:   my @all = $root->getElementsByTagName('*');
 2417:   foreach my $element (@all) {
 2418:     if (string_in_array(\@preserve, $element->nodeName)) {
 2419:       next;
 2420:     }
 2421:     my $changed = 0;
 2422:     my $next;
 2423:     for (my $child=$element->firstChild; defined $child; $child=$next) {
 2424:       $next = $child->nextSibling;
 2425:       if ($child->nodeType == XML_TEXT_NODE) {
 2426:         my $value = $child->nodeValue;
 2427:         if ($value =~ /^(.*)&(?:format|prettyprint)\((\$\{?[a-zA-Z0-9]*\}?(?:\[[^\]]*\])?|[0-9.]+)\s?,\s?(["'][,.\$]?[0-9][eEfFgGsS]["']|\$[a-zA-Z0-9]*)\)(.*)$/s) {
 2428:           # NOTE: we don't check for &prettyprint's 3rd argument (target), but it has not been seen outside of script elements.
 2429:           # NOTE: the format options ',' and '$' are not supported by &format in current LON-CAPA since rev 1.81 of default_homework.lcpm,
 2430:           #       but are supported by &prettyprint;
 2431:           #       if we use (like current LON-CAPA) &prettyprint for <num> implementation, it will change a few resulting documents
 2432:           #       (by making them display something they were probably intended to display, but which did not).
 2433:           #       Usage of <num> with &prettyprint instead of &format might also change the display when there is an exponent.
 2434:           my $before = $1;
 2435:           my $number = $2;
 2436:           my $format = $3;
 2437:           my $after = $4;
 2438:           $format =~ s/^['"]|['"]$//g;
 2439:           # do not change this if the parent is <display> and there are other things before or after &format
 2440:           if ($element->nodeName eq 'display' && (defined $child->previousSibling || defined $next ||
 2441:               $before !~ /^\s*$/ || $after !~ /^\s*$/)) {
 2442:             last;
 2443:           }
 2444:           my $replacement = $doc->createDocumentFragment();
 2445:           my $num = $doc->createElement('num');
 2446:           $num->setAttribute('format', $format);
 2447:           $num->appendChild($doc->createTextNode($number));
 2448:           if (length($before) > 0) {
 2449:             $replacement->appendChild($doc->createTextNode($before));
 2450:           }
 2451:           $replacement->appendChild($num);
 2452:           if (length($after) > 0) {
 2453:             $replacement->appendChild($doc->createTextNode($after));
 2454:           }
 2455:           $element->replaceChild($replacement, $child);
 2456:           $changed = 1;
 2457:           $next = $element->firstChild; # start over, there might be another &format in the same text node
 2458:         } elsif ($value =~ /^(.*)&chemparse\(([^'"()]*|'[^']*'|"[^"]*")\)(.*)$/s) {
 2459:           my $before = $1;
 2460:           my $reaction = $2;
 2461:           my $after = $3;
 2462:           $reaction =~ s/^'(.*)'$/$1/;
 2463:           $reaction =~ s/^"(.*)"$/$1/;
 2464:           if ($element->nodeName eq 'display' && (defined $child->previousSibling || defined $next ||
 2465:               $before !~ /^\s*$/ || $after !~ /^\s*$/)) {
 2466:             last;
 2467:           }
 2468:           my $replacement = $doc->createDocumentFragment();
 2469:           my $chem = $doc->createElement('chem');
 2470:           $chem->appendChild($doc->createTextNode($reaction));
 2471:           if (length($before) > 0) {
 2472:             $replacement->appendChild($doc->createTextNode($before));
 2473:           }
 2474:           $replacement->appendChild($chem);
 2475:           if (length($after) > 0) {
 2476:             $replacement->appendChild($doc->createTextNode($after));
 2477:           }
 2478:           $element->replaceChild($replacement, $child);
 2479:           $changed = 1;
 2480:           $next = $element->firstChild;
 2481:         }
 2482:       }
 2483:     }
 2484:     if ($changed && $element->nodeName eq 'display') {
 2485:       my $first = $element->firstChild;
 2486:       if ($first->nodeType == XML_ELEMENT_NODE && string_in_array(['num','chem'], $first->nodeName) &&
 2487:           !defined $first->nextSibling) {
 2488:         # remove useless display element
 2489:         replace_by_children($element);
 2490:       }
 2491:     }
 2492:   }
 2493: }
 2494: 
 2495: # pretty-print using im-memory DOM tree
 2496: sub pretty {
 2497:   my ($node, $all_block, $indent_level) = @_;
 2498:   my $doc = $node->ownerDocument;
 2499:   $indent_level ||= 0;
 2500:   my $type = $node->nodeType;
 2501:   if ($type == XML_ELEMENT_NODE) {
 2502:     my $name = $node->nodeName;
 2503:     if (string_in_array(\@preserve_elements, $name)) {
 2504:       # remove newlines at the beginning and the end of preserve elements
 2505:       if (defined $node->firstChild && ($node->firstChild->nodeType == XML_TEXT_NODE ||
 2506:           $node->firstChild->nodeType == XML_CDATA_SECTION_NODE)) {
 2507:         my $text = $node->firstChild->nodeValue;
 2508:         $text =~ s/^\n+//;
 2509:         if ($text eq '') {
 2510:           $node->removeChild($node->firstChild);
 2511:         } else {
 2512:           $node->firstChild->setData($text);
 2513:         }
 2514:       }
 2515:       if (defined $node->lastChild && ($node->lastChild->nodeType == XML_TEXT_NODE ||
 2516:           $node->lastChild->nodeType == XML_CDATA_SECTION_NODE)) {
 2517:         my $text = $node->lastChild->nodeValue;
 2518:         $text =~ s/\n+$//;
 2519:         if ($text eq '') {
 2520:           $node->removeChild($node->lastChild);
 2521:         } else {
 2522:           $node->lastChild->setData($text);
 2523:         }
 2524:       }
 2525:     } elsif (string_in_array($all_block, $name) || string_in_array(\@inline_like_block, $name)) {
 2526:       # make sure there is a newline at the beginning and at the end if there is anything inside
 2527:       if (defined $node->firstChild && !string_in_array(\@no_newline_inside, $name)) {
 2528:         my $first = $node->firstChild;
 2529:         if ($first->nodeType == XML_TEXT_NODE) {
 2530:           my $text = $first->nodeValue;
 2531:           if ($text !~ /^ *\n/) {
 2532:             $first->setData("\n" . $text);
 2533:           }
 2534:         } else {
 2535:           $node->insertBefore($doc->createTextNode("\n"), $first);
 2536:         }
 2537:         my $last = $node->lastChild;
 2538:         if ($last->nodeType == XML_TEXT_NODE) {
 2539:           my $text = $last->nodeValue;
 2540:           if ($text !~ /\n *$/) {
 2541:             $last->setData($text . "\n");
 2542:           }
 2543:         } else {
 2544:           $node->appendChild($doc->createTextNode("\n"));
 2545:         }
 2546:       }
 2547:       
 2548:       # indent and make sure there is a newline before and after a block element
 2549:       my $newline_indent = "\n".(' ' x (2*($indent_level + 1)));
 2550:       my $newline_indent_last = "\n".(' ' x (2*$indent_level));
 2551:       my $next;
 2552:       for (my $child=$node->firstChild; defined $child; $child=$next) {
 2553:         $next = $child->nextSibling;
 2554:         if ($child->nodeType == XML_ELEMENT_NODE) {
 2555:           if (string_in_array($all_block, $child->nodeName) || string_in_array(\@inline_like_block, $child->nodeName)) {
 2556:             # make sure there is a newline before and after a block element
 2557:             if (defined $child->previousSibling && $child->previousSibling->nodeType == XML_TEXT_NODE) {
 2558:               my $prev = $child->previousSibling;
 2559:               my $text = $prev->nodeValue;
 2560:               if ($text !~ /\n *$/) {
 2561:                 $prev->setData($text . $newline_indent);
 2562:               }
 2563:             } else {
 2564:               $node->insertBefore($doc->createTextNode($newline_indent), $child);
 2565:             }
 2566:             if (defined $next && $next->nodeType == XML_TEXT_NODE) {
 2567:               my $text = $next->nodeValue;
 2568:               if ($text !~ /^ *\n/) {
 2569:                 $next->setData($newline_indent . $text);
 2570:               }
 2571:             } else {
 2572:               $node->insertAfter($doc->createTextNode($newline_indent), $child);
 2573:             }
 2574:           }
 2575:           pretty($child, $all_block, $indent_level+1);
 2576:         } elsif ($child->nodeType == XML_TEXT_NODE) {
 2577:           my $text = $child->nodeValue;
 2578:           # collapse newlines
 2579:           $text =~ s/\n([\t ]*\n)+/\n/g;
 2580:           # indent and remove spaces and tabs before newlines
 2581:           if (defined $next) {
 2582:             $text =~ s/[\t ]*\n[\t ]*/$newline_indent/ge;
 2583:           } else {
 2584:             $text =~ s/[\t ]*\n[\t ]*/$newline_indent/ge;
 2585:             $text =~ s/[\t ]*\n[\t ]*$/$newline_indent_last/e;
 2586:           }
 2587:           $child->setData($text);
 2588:         }
 2589:       }
 2590:       
 2591:       # removes whitespace at the beginning and end of p td, th and li (except for nbsp at the beginning)
 2592:       my @to_trim = ('p','td','th','li');
 2593:       if (string_in_array(\@to_trim, $name) && defined $node->firstChild && $node->firstChild->nodeType == XML_TEXT_NODE) {
 2594:         my $text = $node->firstChild->nodeValue;
 2595:         $text =~ s/^[ \t\f\n\r]*//;
 2596:         if ($text eq '') {
 2597:           $node->removeChild($node->firstChild);
 2598:         } else {
 2599:           $node->firstChild->setData($text);
 2600:         }
 2601:       }
 2602:       if (string_in_array(\@to_trim, $name) && defined $node->lastChild && $node->lastChild->nodeType == XML_TEXT_NODE) {
 2603:         my $text = $node->lastChild->nodeValue;
 2604:         $text =~ s/\s*$//;
 2605:         if ($text eq '') {
 2606:           $node->removeChild($node->lastChild);
 2607:         } else {
 2608:           $node->lastChild->setData($text);
 2609:         }
 2610:       }
 2611:     }
 2612:   }
 2613: }
 2614: 
 2615: sub replace_tm_dtm {
 2616:   my ($root) = @_;
 2617:   my $doc = $root->ownerDocument;
 2618:   my @elements = $root->getElementsByTagName('tm');
 2619:   push(@elements, $root->getElementsByTagName('dtm'));
 2620:   foreach my $element (@elements) {
 2621:     my $first = $element->firstChild;
 2622:     if (defined $first && $first->nodeType == XML_TEXT_NODE) {
 2623:       my $text = $first->nodeValue;
 2624:       if ($element->nodeName eq 'tm') {
 2625:         $first->setData('$'.$text.'$');
 2626:       } else {
 2627:         $first->setData('$$'.$text.'$$');
 2628:       }
 2629:     }
 2630:     $element->setNodeName('m');
 2631:   }
 2632: }
 2633: 
 2634: 
 2635: ######## utilities ########
 2636: 
 2637: ##
 2638: # Trims a string (really, this should be built-in in Perl, this is ridiculous, ugly and slow)
 2639: # @param {string} s - the string to trim
 2640: # @returns the trimmed string
 2641: ##
 2642: sub trim {
 2643:   my ($s) = @_;
 2644:   $s =~ s/^\s+//;
 2645:   $s =~ s/\s+$//;
 2646:   return($s);
 2647: }
 2648: 
 2649: ##
 2650: # Tests if a string is in an array (using eq) (to avoid Smartmatch warnings with $value ~~ @array)
 2651: # @param {Array<string>} array - reference to the array of strings
 2652: # @param {string} value - the string to look for
 2653: # @returns 1 if found, 0 otherwise
 2654: ##
 2655: sub string_in_array {
 2656:   my ($array, $value) = @_;
 2657:   # NOTE: would it be faster to use "any { $value eq $_ } @{$array}" from List::Util ?
 2658:   foreach my $v (@{$array}) {
 2659:     if ($v eq $value) {
 2660:       return 1;
 2661:     }
 2662:   }
 2663:   return 0;
 2664: }
 2665: 
 2666: ##
 2667: # Tests if an object is in an array (using ==)
 2668: # @param {Array<Object>} array - reference to the array of references
 2669: # @param {Object} ref - the reference to look for
 2670: # @returns 1 if found, 0 otherwise
 2671: ##
 2672: sub reference_in_array {
 2673:   my ($array, $ref) = @_;
 2674:   foreach my $v (@{$array}) {
 2675:     if ($v == $ref) {
 2676:       return 1;
 2677:     }
 2678:   }
 2679:   return 0;
 2680: }
 2681: 
 2682: ##
 2683: # returns the index of a string in an array
 2684: # @param {Array<Object>} array - reference to the array of strings
 2685: # @param {string} s - the string to look for (using eq)
 2686: # @returns the index if found, -1 otherwise
 2687: ##
 2688: sub index_of_string {
 2689:   my ($array, $s) = @_;
 2690:   for (my $i=0; $i<scalar(@{$array}); $i++) {
 2691:     if ($array->[$i] eq $s) {
 2692:       return $i;
 2693:     }
 2694:   }
 2695:   return -1;
 2696: }
 2697: 
 2698: ##
 2699: # returns the index of a reference in an array
 2700: # @param {Array<Object>} array - reference to the array of references
 2701: # @param {Object} ref - the reference to look for
 2702: # @returns the index if found, -1 otherwise
 2703: ##
 2704: sub index_of_reference {
 2705:   my ($array, $ref) = @_;
 2706:   for (my $i=0; $i<scalar(@{$array}); $i++) {
 2707:     if ($array->[$i] == $ref) {
 2708:       return $i;
 2709:     }
 2710:   }
 2711:   return -1;
 2712: }
 2713: 
 2714: ##
 2715: # if found, removes a string from an array, otherwise do nothing
 2716: # @param {Array<string>} array - reference to the array of string
 2717: # @param {string} s - the string to look for (using eq)
 2718: ##
 2719: sub remove_string_from_array {
 2720:   my ($array, $s) = @_;
 2721:   my $index = index_of_string($array, $s);
 2722:   if ($index != -1) {
 2723:     splice(@$array, $index, 1);
 2724:   }
 2725: }
 2726: 
 2727: ##
 2728: # if found, removes a reference from an array, otherwise do nothing
 2729: # @param {Array<Object>} array - reference to the array of references
 2730: # @param {Object} ref - the reference to look for
 2731: ##
 2732: sub remove_reference_from_array {
 2733:   my ($array, $ref) = @_;
 2734:   my $index = index_of_reference($array, $ref);
 2735:   if ($index != -1) {
 2736:     splice(@$array, $index, 1);
 2737:   }
 2738: }
 2739: 
 2740: ##
 2741: # replaces a node by its children
 2742: # @param {Node} node - the DOM node
 2743: ##
 2744: sub replace_by_children {
 2745:   my ($node) = @_;
 2746:   my $parent = $node->parentNode;
 2747:   my $next;
 2748:   my $previous;
 2749:   for (my $child=$node->firstChild; defined $child; $child=$next) {
 2750:     $next = $child->nextSibling;
 2751:     if ((!defined $previous || !defined $next) &&
 2752:         $child->nodeType == XML_TEXT_NODE && $child->nodeValue =~ /^\s*$/) {
 2753:       next; # do not keep first and last whitespace nodes
 2754:     } else {
 2755:       if (!defined $previous && $child->nodeType == XML_TEXT_NODE) {
 2756:         # remove whitespace at the beginning
 2757:         my $value = $child->nodeValue;
 2758:         $value =~ s/^\s+//;
 2759:         $child->setData($value);
 2760:       }
 2761:       if (!defined $next && $child->nodeType == XML_TEXT_NODE) {
 2762:         # and at the end
 2763:         my $value = $child->nodeValue;
 2764:         $value =~ s/\s+$//;
 2765:         $child->setData($value);
 2766:       }
 2767:     }
 2768:     $node->removeChild($child);
 2769:     $parent->insertBefore($child, $node);
 2770:     $previous = $child;
 2771:   }
 2772:   $parent->removeChild($node);
 2773: }
 2774: 
 2775: ##
 2776: # returns the trimmed attribute value if the attribute exists and is not blank, undef otherwise
 2777: # @param {Node} node - the DOM node
 2778: # @param {string} attribute_name - the attribute name
 2779: ##
 2780: sub get_non_empty_attribute {
 2781:   my ($node, $attribute_name) = @_;
 2782:   my $value = $node->getAttribute($attribute_name);
 2783:   if (defined $value && $value !~ /^\s*$/) {
 2784:     $value = trim($value);
 2785:     return($value);
 2786:   }
 2787:   return(undef);
 2788: }
 2789: 
 2790: ##
 2791: # Returns a CSS property value from the style attribute of the element, or undef if not defined
 2792: # @param {Element} el - the DOM element
 2793: # @param {string} property_name - the CSS property name
 2794: ##
 2795: sub get_css_property {
 2796:   my ($el, $property_name) = @_;
 2797:   my $style = $el->getAttribute('style');
 2798:   if (defined $style) {
 2799:     $style =~ s/^\s*;\s*//;
 2800:     $style =~ s/\s*;\s*$//;
 2801:   } else {
 2802:     $style = '';
 2803:   }
 2804:   my @pairs = split(';', $style);
 2805:   foreach my $pair (@pairs) {
 2806:     my @name_value = split(':', $pair);
 2807:     if (scalar(@name_value) != 2) {
 2808:       next;
 2809:     }
 2810:     my $name = trim($name_value[0]);
 2811:     my $value = trim($name_value[1]);
 2812:     if (lc($name) eq $property_name) {
 2813:       return($value); # return the first one found
 2814:     }
 2815:   }
 2816:   return(undef);
 2817: }
 2818: 
 2819: ##
 2820: # Returns the reference to a hash CSS property name => value from the style attribute of the element.
 2821: # Returns an empty list if the style attribute is not defined,
 2822: # @param {Element} el - the DOM element
 2823: # @return {Hash<string, string>} reference to the hash  property name => property value
 2824: ##
 2825: sub get_css_properties {
 2826:   my ($el) = @_;
 2827:   my $style = $el->getAttribute('style');
 2828:   if (defined $style) {
 2829:     $style =~ s/^\s*;\s*//;
 2830:     $style =~ s/\s*;\s*$//;
 2831:   } else {
 2832:     $style = '';
 2833:   }
 2834:   my @pairs = split(';', $style);
 2835:   tie (my %hash, 'Tie::IxHash', ());
 2836:   foreach my $pair (@pairs) {
 2837:     my @name_value = split(':', $pair);
 2838:     if (scalar(@name_value) != 2) {
 2839:       next;
 2840:     }
 2841:     my $name = trim($name_value[0]);
 2842:     my $value = trim($name_value[1]);
 2843:     if (defined $hash{$name}) {
 2844:       # duplicate property in the style attribute: keep only the last one
 2845:       delete $hash{$name};
 2846:     }
 2847:     $hash{$name} = $value;
 2848:   }
 2849:   return(\%hash);
 2850: }
 2851: 
 2852: ##
 2853: # Sets a CSS property in the style attribute of an element
 2854: # @param {Element} el - the DOM element
 2855: # @param {string} property_name - the CSS property name
 2856: # @param {string} property_value - the CSS property value
 2857: ##
 2858: sub set_css_property {
 2859:   my ($el, $property_name, $property_value) = @_;
 2860:   my $hash_ref = { $property_name => $property_value };
 2861:   set_css_properties($el, $hash_ref);
 2862: }
 2863: 
 2864: ##
 2865: # Sets several CSS properties in the style attribute of an element
 2866: # @param {Element} el - the DOM element
 2867: # @param {Hash<string, string>} properties - reference to the hash property name => property value
 2868: ##
 2869: sub set_css_properties {
 2870:   my ($el, $properties) = @_;
 2871:   my $hash = get_css_properties($el);
 2872:   foreach my $property_name (keys %$properties) {
 2873:     my $property_value = $properties->{$property_name};
 2874:     if (defined $hash->{$property_name}) {
 2875:       delete $hash->{$property_name}; # to add the new one at the end
 2876:     }
 2877:     $hash->{$property_name} = $property_value;
 2878:   }
 2879:   my $style = '';
 2880:   foreach my $key (keys %$hash) {
 2881:     $style .= $key.':'.$hash->{$key}.'; ';
 2882:   }
 2883:   $style =~ s/; $//;
 2884:   $el->setAttribute('style', $style);
 2885: }
 2886: 
 2887: 1;
 2888: __END__

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>