File:  [LON-CAPA] / loncom / homework / cleanxml / post_xml.pm
Revision 1.7: download - view: text, annotated - select for diffs
Thu Jan 14 16:28:06 2016 UTC (8 years, 3 months ago) by damieng
Branches: MAIN
CVS tags: HEAD
fixed calls to html_to_xml and tth

    1: # The LearningOnline Network
    2: # Third step to clean a file.
    3: #
    4: # $Id: post_xml.pm,v 1.7 2016/01/14 16:28:06 damieng Exp $
    5: #
    6: # Copyright Michigan State University Board of Trustees
    7: #
    8: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
    9: #
   10: # LON-CAPA is free software; you can redistribute it and/or modify
   11: # it under the terms of the GNU General Public License as published by
   12: # the Free Software Foundation; either version 2 of the License, or
   13: # (at your option) any later version.
   14: #
   15: # LON-CAPA is distributed in the hope that it will be useful,
   16: # but WITHOUT ANY WARRANTY; without even the implied warranty of
   17: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   18: # GNU General Public License for more details.
   19: #
   20: # You should have received a copy of the GNU General Public License
   21: # along with LON-CAPA; if not, write to the Free Software
   22: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   23: #
   24: # /home/httpd/html/adm/gpl.txt
   25: #
   26: # http://www.lon-capa.org/
   27: #
   28: ###
   29: 
   30: #!/usr/bin/perl
   31: 
   32: package Apache::post_xml;
   33: 
   34: use strict;
   35: use utf8;
   36: use warnings;
   37: 
   38: use File::Basename;
   39: use File::Temp qw/ tempfile /;
   40: use Cwd 'abs_path';
   41: use XML::LibXML;
   42: use HTML::TokeParser; # used to parse sty files
   43: use Tie::IxHash; # for ordered hashes
   44: use tth;
   45: use Apache::html_to_xml;
   46: 
   47: no warnings 'recursion'; # yes, fix_paragraph is using heavy recursion, I know
   48: 
   49: # these are constants
   50: my @block_elements = ('parameter','location','answer','foil','image','polygon','rectangle','text','conceptgroup','itemgroup','item','label','data','function','array','unit','answergroup','functionplotresponse','functionplotruleset','functionplotelements','functionplotcustomrule','essayresponse','hintpart','formulahint','numericalhint','reactionhint','organichint','optionhint','radiobuttonhint','stringhint','customhint','mathhint','formulahintcondition','numericalhintcondition','reactionhintcondition','organichintcondition','optionhintcondition','radiobuttonhintcondition','stringhintcondition','customhintcondition','mathhintcondition','imageresponse','foilgroup','datasubmission','textfield','hiddensubmission','radiobuttonresponse','rankresponse','matchresponse','import','style','script','window','block','library','notsolved','part','postanswerdate','preduedate','problem','problemtype','randomlabel','bgimg','labelgroup','randomlist','solved','while','tex','print','web','gnuplot','curve','Task','IntroParagraph','ClosingParagraph','Question','QuestionText','Setup','Instance','InstanceText','Criteria','CriteriaText','GraderNote','languageblock','instructorcomment','dataresponse','togglebox','standalone','comment','drawimage','allow','displayduedate','displaytitle','responseparam','organicstructure','scriptlib','parserlib','drawoptionlist','spline','backgroundplot','plotobject','plotvector','drawvectorsum','functionplotrule','functionplotvectorrule','functionplotvectorsumrule','axis','key','xtics','ytics','title','xlabel','ylabel','hiddenline','dtm');
   51: my @inline_like_block = ('stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse', 'hint', 'hintgroup','translated','lang'); # inline elements treated like blocks for pretty print and some other things
   52: my @responses = ('stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse','essayresponse','radiobuttonresponse','matchresponse','rankresponse','imageresponse','functionplotresponse');
   53: my @block_html = ('html','head','body','section','h1','h2','h3','h4','h5','h6','div','p','ul','ol','li','table','tbody','tr','td','th','dl','dt','dd','pre','noscript','hr','address','blockquote','object','applet','embed','map','form','fieldset','iframe','center','frameset');
   54: my @no_newline_inside = ('import','parserlib','scriptlib','data','function','label','xlabel','ylabel','tic','text','rectangle','image','title','h1','h2','h3','h4','h5','h6','li','td','p');
   55: my @preserve_elements = ('script','answer','pre','style');
   56: my @accepting_style = ('section','h1','h2','h3','h4','h5','h6','div','p','li','td','th','dt','dd','pre','blockquote');
   57: my @latex_math = ('\alpha', '\theta', '\omicron', '\tau', '\beta', '\vartheta', '\pi', '\upsilon', '\gamma', '\gamma', '\varpi', '\phi', '\delta', '\kappa', '\rho', '\varphi', '\epsilon', '\lambda', '\varrho', '\chi', '\varepsilon', '\mu', '\sigma', '\psi', '\zeta', '\nu', '\varsigma', '\omega', '\eta', '\xi',
   58:   '\Gamma', '\Lambda', '\Sigma', '\Psi', '\Delta', '\Xi', '\Upsilon', '\Omega', '\Theta', '\Pi', '\Phi',
   59:   '\pm', '\cap', '\diamond', '\oplus', '\mp', '\cup', '\bigtriangleup', '\ominus', '\times', '\uplus', '\bigtriangledown', '\otimes', '\div', '\sqcap', '\triangleleft', '\oslash', '\ast', '\sqcup', '\triangleright', '\odot', '\star', '\vee', '\lhd$', '\bigcirc', '\circ', '\wedge', '\rhd$', '\dagger', '\bullet', '\setminus', '\unlhd$', '\ddagger', '\cdot', '\wr', '\unrhd$', '\amalg', '+', '-',
   60:   '\leq', '\geq', '\equiv', '\models', '\prec', '\succ', '\sim', '\perp', '\preceq', '\succeq', '\simeq', '\mid', '\ll', '\gg', '\asymp', '\parallel', '\subset', '\supset', '\approx', '\bowtie', '\subseteq', '\supseteq', '\cong', '\Join$', '\sqsubset$', '\sqsupset$', '\neq', '\smile', '\sqsubseteq', '\sqsupseteq', '\doteq', '\frown', '\in', '\ni', '\propto', '\vdash', '\dashv',
   61:   '\colon', '\ldotp', '\cdotp',
   62:   '\leftarrow', '\longleftarrow', '\uparrow', '\Leftarrow', '\Longleftarrow', '\Uparrow', '\rightarrow', '\longrightarrow', '\downarrow', '\Rightarrow', '\Longrightarrow', '\Downarrow', '\leftrightarrow', '\longleftrightarrow', '\updownarrow', '\Leftrightarrow', '\Longleftrightarrow', '\Updownarrow', '\mapsto', '\longmapsto', '\nearrow', '\hookleftarrow', '\hookrightarrow', '\searrow', '\leftharpoonup', '\rightharpoonup', '\swarrow', '\leftharpoondown', '\rightharpoondown', '\nwarrow', '\rightleftharpoons', '\leadsto$',
   63:   '\ldots', '\cdots', '\vdots', '\ddots', '\aleph', '\prime', '\forall', '\infty', '\hbar', '\emptyset', '\exists', '\Box$', '\imath', '\nabla', '\neg', '\Diamond$', '\jmath', '\surd', '\flat', '\triangle', '\ell', '\top', '\natural', '\clubsuit', '\wp', '\bot', '\sharp', '\diamondsuit', '\Re', '\|', '\backslash', '\heartsuit', '\Im', '\angle', '\partial', '\spadesuit', '\mho$',
   64:   '\sum', '\bigcap', '\bigodot', '\prod', '\bigcup', '\bigotimes', '\coprod', '\bigsqcup', '\bigoplus', '\int', '\bigvee', '\biguplus', '\oint', '\bigwedge',
   65:   '\arccos', '\cos', '\csc', '\exp', '\ker', '\limsup', '\min', '\sinh', '\arcsin', '\cosh', '\deg', '\gcd', '\lg', '\ln', '\Pr', '\sup', '\arctan', '\cot', '\det', '\hom', '\lim', '\log', '\sec', '\tan', '\arg', '\coth', '\dim', '\inf', '\liminf', '\max', '\sin', '\tanh',
   66:   '\uparrow', '\Uparrow', '\downarrow', '\Downarrow', '\updownarrow', '\Updownarrow', '\lfloor', '\rfloor', '\lceil', '\rceil', '\langle', '\rangle', '\backslash',
   67:   '\rmoustache', '\lmoustache', '\rgroup', '\lgroup', '\arrowvert', '\Arrowvert', '\bracevert',
   68:   '\hat{', '\acute{', '\bar{', '\dot{', '\breve{', '\check{', '\grave{', '\vec{', '\ddot{', '\tilde{',
   69:   '\widetilde{', '\widehat{', '\overleftarrow{', '\overrightarrow{', '\overline{', '\underline{', '\overbrace{', '\underbrace{', '\sqrt{', '\sqrt[', '\frac{'
   70: );
   71: # list of elements that can contain style elements:
   72: my @containing_styles = ('library','problem',@responses,'foil','item','text','hintgroup','hintpart','label','part','preduedate','postanswerdate','solved','notsolved','block','while','web','standalone','problemtype','languageblock','translated','lang','window','windowlink','togglebox','instructorcomment','body','section','div','p','li','dd','td','th','blockquote','object','applet','video','audio','canvas','fieldset','button',
   73: 'span','strong','em','b','i','sup','sub','code','kbd','samp','tt','ins','del','var','small','big','u','font');
   74: my @html_styles = ('span', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'tt', 'var', 'small', 'big', 'u');
   75: 
   76: my $warnings; # 1 = print warnings
   77: 
   78: 
   79: # Parses the XML document and fixes many things to turn it into a document matching the schema.
   80: # @param {reference} textref - reference to the text of the document
   81: # @param {string} file_path - path of the file being processed (we only extract the directory path from it)
   82: # @param {string} res_dir - path of res directory parent (without the / at the end)
   83: # @param {boolean} warn - 1 to print warnings, 0 otherwise
   84: # @returns the text of the document as a byte string.
   85: sub post_xml {
   86:   my ($textref, $file_path, $res_dir, $warn) = @_;
   87:   $warnings = $warn;
   88:   
   89:   my $dom_doc = XML::LibXML->load_xml(string => $textref);
   90: 
   91:   my $root = fix_structure($dom_doc);
   92: 
   93:   remove_elements($root, ['startouttext','startoutext','startottext','startouttex','startouttect','atartouttext','starouttext','starttextout','starttext','starttextarea','endouttext','endoutext','endoutttext','endouttxt','endouutext','ednouttext','endouttex','endoouttext','endouttest','endtextout','endtextarea','startpartmarker','endpartmarker','basefont','x-claris-tagview','x-claris-window','x-sas-window']);
   94:   
   95:   remove_empty_attributes($root);
   96:   
   97:   fix_attribute_case($root);
   98:   
   99:   replace_m($root);
  100:   
  101:   my @all_block = (@block_elements, @block_html);
  102:   add_sty_blocks($file_path, $res_dir, $root, \@all_block); # must come before the subs using @all_block
  103: 
  104:   fix_block_styles($root, \@all_block);
  105:   $root->normalize();
  106:   
  107:   fix_fonts($root, \@all_block);
  108:   
  109:   replace_u($root);
  110: 
  111:   remove_bad_cdata_sections($root);
  112:   
  113:   add_cdata_sections($root);
  114:   
  115:   fix_style_element($root);
  116:   
  117:   fix_tables($root);
  118: 
  119:   fix_lists($root);
  120:   
  121:   fix_wrong_name_for_img($root); # should be before replace_deprecated_attributes_by_css
  122: 
  123:   replace_deprecated_attributes_by_css($root);
  124:   
  125:   replace_center($root, \@all_block); # must come after replace_deprecated_attributes_by_css
  126:   
  127:   replace_nobr($root);
  128:   
  129:   remove_useless_notsolved($root);
  130:   
  131:   fix_paragraphs_inside($root, \@all_block);
  132: 
  133:   remove_empty_style($root);
  134:   
  135:   fix_empty_lc_elements($root);
  136:   
  137:   lowercase_attribute_values($root);
  138:   
  139:   replace_numericalresponse_unit_attribute($root);
  140:   
  141:   replace_functions_by_elements($root);
  142:   
  143:   pretty($root, \@all_block);
  144: 
  145:   replace_tm_dtm($root);
  146:   
  147:   return $dom_doc->toString(); # byte string !
  148: }
  149: 
  150: sub fix_structure {
  151:   my ($doc) = @_;
  152:   # the root element has already been added in pre_xml
  153:   my $root = $doc->documentElement;
  154:   # inside the root, replace html, problem and library elements by their content
  155:   my @toreplace = ('html','problem','library','Task');
  156:   foreach my $name (@toreplace) {
  157:     my @elements = $root->getElementsByTagName($name);
  158:     foreach my $element (@elements) {
  159:       replace_by_children($element);
  160:     }
  161:   }
  162:   # insert all link and style elements inside a new head element
  163:   my $current_node = undef;
  164:   my @heads = $doc->getElementsByTagName('head');
  165:   my @links = $doc->getElementsByTagName('link');
  166:   my @styles = $doc->getElementsByTagName('style');
  167:   my @titles = $doc->getElementsByTagName('title');
  168:   if (scalar(@titles) > 0) {
  169:     # NOTE: there is a title element in gnuplot, not to be confused with the one inside HTML head
  170:     for (my $i=0; $i<scalar(@titles); $i++) {
  171:       my $title = $titles[$i];
  172:       my $found_gnuplot = 0;
  173:       my $ancestor = $title->parentNode;
  174:       while (defined $ancestor) {
  175:         if ($ancestor->nodeName eq 'gnuplot') {
  176:           $found_gnuplot = 1;
  177:           last;
  178:         }
  179:         $ancestor = $ancestor->parentNode;
  180:       }
  181:       if ($found_gnuplot) {
  182:         splice(@titles, $i, 1);
  183:         $i--;
  184:       }
  185:     }
  186:   }
  187:   if (scalar(@heads) > 0 || scalar(@titles) > 0 || scalar(@links) > 0 || scalar(@styles) > 0) {
  188:     my $htmlhead = $doc->createElement('head');
  189:     foreach my $head (@heads) {
  190:       my $next;
  191:       for (my $child=$head->firstChild; defined $child; $child=$next) {
  192:         $next = $child->nextSibling;
  193:         $head->removeChild($child);
  194:         if ($child->nodeType != XML_ELEMENT_NODE ||
  195:             string_in_array(['title','script','style','meta','link','import','base'], $child->nodeName)) {
  196:           $htmlhead->appendChild($child);
  197:         } else {
  198:           # this should not be in head
  199:           insert_after_or_first($root, $child, $current_node);
  200:         }
  201:       }
  202:       $head->parentNode->removeChild($head);
  203:     }
  204:     foreach my $child (@titles, @links, @styles) {
  205:       $child->parentNode->removeChild($child);
  206:       $htmlhead->appendChild($child);
  207:     }
  208:     insert_after_or_first($root, $htmlhead, $current_node);
  209:     $current_node = $htmlhead;
  210:   }
  211:   # body
  212:   my $htmlbody = undef;
  213:   my @bodies = $doc->getElementsByTagName('body');
  214:   if (scalar(@bodies) > 0) {
  215:     # TODO: fix content and position of body elements
  216:     if ($root->nodeName eq 'problem') {
  217:       foreach my $body (@bodies) {
  218:         replace_by_children($body);
  219:       }
  220:     }
  221:   }
  222:   # add all the meta elements afterwards when they are LON-CAPA meta. Remove all HTML meta.
  223:   my @meta_names = ('abstract','author','authorspace','avetries','avetries_list','clear','comefrom','comefrom_list','copyright','correct','count','course','course_list','courserestricted','creationdate','dependencies','depth','difficulty','difficulty_list','disc','disc_list','domain','end','field','firstname','generation','goto','goto_list','groupname','helpful','highestgradelevel','hostname','id','keynum','keywords','language','lastname','lastrevisiondate','lowestgradelevel','middlename','mime','modifyinguser','notes','owner','permanentemail','scope','sequsage','sequsage_list','standards','start','stdno','stdno_list','subject','technical','title','url','username','value','version');
  224:   my @metas = $doc->getElementsByTagName('meta');
  225:   foreach my $meta (@metas) {
  226:     $meta->parentNode->removeChild($meta);
  227:     my $name = $meta->getAttribute('name');
  228:     my $content = $meta->getAttribute('content');
  229:     if (defined $name && defined $content && string_in_array(\@meta_names, lc($name))) {
  230:       my $lcmeta = $doc->createElement('meta');
  231:       $lcmeta->setAttribute('name', lc($name));
  232:       $lcmeta->setAttribute('content', $content);
  233:       insert_after_or_first($root, $lcmeta, $current_node);
  234:       $current_node = $lcmeta;
  235:     }
  236:   }
  237:   return($root);
  238: }
  239: 
  240: # insert the new child under parent after the reference child, or as the first child if the reference child is not defined
  241: sub insert_after_or_first {
  242:   my ($parent, $newchild, $refchild) = @_;
  243:   if (defined $refchild) {
  244:     $parent->insertAfter($newchild, $refchild);
  245:   } elsif (defined $parent->firstChild) {
  246:     $parent->insertBefore($newchild, $parent->firstChild);
  247:   } else {
  248:     $parent->appendChild($newchild);
  249:   }
  250: }
  251: 
  252: # removes all elements with given names inside the node, but keep the content
  253: sub remove_elements {
  254:   my ($node, $to_remove) = @_;
  255:   my $nextChild;
  256:   for (my $child=$node->firstChild; defined $child; $child=$nextChild) {
  257:     $nextChild = $child->nextSibling;
  258:     my $type = $node->nodeType;
  259:     if ($type == XML_ELEMENT_NODE) {
  260:       if (string_in_array($to_remove, $child->nodeName)) {
  261:         my $first_non_white = $child->firstChild;
  262:         if (defined $first_non_white && $first_non_white->nodeType == XML_TEXT_NODE &&
  263:             $first_non_white->nodeValue =~ /^\s*$/) {
  264:           $first_non_white = $first_non_white->nextSibling;
  265:         }
  266:         if (defined $first_non_white) {
  267:           $nextChild = $first_non_white;
  268:           replace_by_children($child);
  269:         } else {
  270:           $node->removeChild($child);
  271:         }
  272:       } else {
  273:         remove_elements($child, $to_remove);
  274:       }
  275:     }
  276:   }
  277: }
  278: 
  279: # removes some attributes that have an invalid empty value
  280: sub remove_empty_attributes {
  281:   my ($root) = @_;
  282:   my $doc = $root->ownerDocument;
  283:   # this list is based on validation errors in the MSU subset (it could be more complete if it was based on the schema)
  284:   my @attributes = (
  285:     ['curve', ['pointsize']],
  286:     ['foil', ['location']],
  287:     ['foilgroup', ['checkboxoptions', 'options', 'texoptions']],
  288:     ['gnuplot', ['pattern', 'texwidth']],
  289:     ['img', ['height', 'texheight', 'texwidth', 'texwrap', 'width']],
  290:     ['import', ['importmode']],
  291:     ['optionresponse', ['max']],
  292:     ['organicstructure', ['options']],
  293:     ['radiobuttonresponse', ['max']],
  294:     ['randomlabel', ['height', 'texwidth', 'width']],
  295:     ['stringresponse', ['type']],
  296:     ['textline', ['size']],
  297:   );
  298:   foreach my $element_attributes (@attributes) {
  299:     my $element_name = $element_attributes->[0];
  300:     my $attribute_names = $element_attributes->[1];
  301:     my @elements = $doc->getElementsByTagName($element_name);
  302:     foreach my $element (@elements) {
  303:       foreach my $attribute_name (@$attribute_names) {
  304:         my $value = $element->getAttribute($attribute_name);
  305:         if (defined $value && $value =~ /^\s*$/) {
  306:           $element->removeAttribute($attribute_name);
  307:         }
  308:       }
  309:     }
  310:   }
  311: }
  312: 
  313: # fixes the case for a few attributes that are not all lowercase
  314: # (the HTML parser used in html_to_xml turns everything lowercase, which is a good thing in general)
  315: sub fix_attribute_case {
  316:   my ($root) = @_;
  317:   my $doc = $root->ownerDocument;
  318:   my @attributes = (
  319:     ['labelgroup', ['TeXsize']],
  320:     ['h1', ['TeXsize']],
  321:     ['h2', ['TeXsize']],
  322:     ['h3', ['TeXsize']],
  323:     ['h4', ['TeXsize']],
  324:     ['h5', ['TeXsize']],
  325:     ['h6', ['TeXsize']],
  326:     # font and basefont have a TeXsize but will be removed
  327:     ['optionresponse', ['TeXlayout']],
  328:     ['itemgroup', ['TeXitemgroupwidth']],
  329:     ['Task', ['OptionalRequired']],
  330:     ['Question', ['OptionalRequired','Mandatory']],
  331:     ['Instance', ['OptionalRequired','Disabled']],
  332:     ['Criteria', ['Mandatory']],
  333:     ['table', ['TeXwidth','TeXtheme']],
  334:     ['td', ['TeXwidth']],
  335:     ['th', ['TeXwidth']],
  336:     ['img', ['TeXwidth','TeXheight','TeXwrap']],
  337:   );
  338:   foreach my $element_attributes (@attributes) {
  339:     my $element_name = $element_attributes->[0];
  340:     my $attribute_names = $element_attributes->[1];
  341:     my @elements = $doc->getElementsByTagName($element_name);
  342:     foreach my $element (@elements) {
  343:       foreach my $attribute_name (@$attribute_names) {
  344:         my $value = $element->getAttribute(lc($attribute_name));
  345:         if (defined $value) {
  346:           $element->removeAttribute(lc($attribute_name));
  347:           $element->setAttribute($attribute_name, $value);
  348:         }
  349:       }
  350:     }
  351:   }
  352: }
  353: 
  354: # Replaces m by HTML, tm and/or dtm (which will be replaced by <m> later, but they are useful
  355: #   to know if the element is a block element or not).
  356: # m might contain non-math LaTeX, while tm and dtm may only contain math.
  357: sub replace_m {
  358:   my ($root) = @_;
  359:   my $doc = $root->ownerDocument;
  360:   # search for variable declarations
  361:   my @variables = ();
  362:   my @scripts = $root->getElementsByTagName('script');
  363:   foreach my $script (@scripts) {
  364:     my $type = $script->getAttribute('type');
  365:     if (defined $type && $type eq 'loncapa/perl') {
  366:       if (defined $script->firstChild && $script->firstChild->nodeType == XML_TEXT_NODE) {
  367:         my $text = $script->firstChild->nodeValue;
  368:         # NOTE: we are not interested in replacing "@value", only "$value"
  369:         # this regexp is for "  $a = ..." and "  $a[...] = ..."
  370:         while ($text =~ /^[ \t]*\$([a-zA-Z_0-9]+)(?:\[[^\]]+\])?[ \t]*=/gm) {
  371:           if (!string_in_array(\@variables, $1)) {
  372:             push(@variables, $1);
  373:           }
  374:         }
  375:         # this regexp is for "...;  $a = ..." and "...;  $a[...] = ..."
  376:         while ($text =~ /^[^'"\/;]+;[ \t]*\$([a-zA-Z_0-9]+)(?:\[[^\]]+\])?[ \t]*=/gm) {
  377:           if (!string_in_array(\@variables, $1)) {
  378:             push(@variables, $1);
  379:           }
  380:         }
  381:         # this regexp is for "  @a = ..."
  382:         while ($text =~ /^[ \t]*\@([a-zA-Z_0-9]+)[ \t]*=/gm) {
  383:           if (!string_in_array(\@variables, $1)) {
  384:             push(@variables, $1);
  385:           }
  386:         }
  387:         # this regexp is for "  ($a, $b, $c) = ..."
  388:         my @matches = ($text =~ /^[ \t]*\([ \t]*\$([a-zA-Z_0-9]+)(?:[ \t]*,[ \t]*\$([a-zA-Z_0-9]+))*[ \t]*\)[ \t]*=/gm);
  389:         foreach my $match (@matches) {
  390:           if (!defined $match) {
  391:             next; # not sure why it happens, but it does
  392:           }
  393:           if (!string_in_array(\@variables, $match)) {
  394:             push(@variables, $match);
  395:           }
  396:         }
  397:         # and this one is for "push @a"
  398:         while ($text =~ /^[ \t]*push @([a-zA-Z_0-9]+)[ \t,]*/gm) {
  399:           if (!string_in_array(\@variables, $1)) {
  400:             push(@variables, $1);
  401:           }
  402:         }
  403:         # use the opportunity to report usage of <m> in Perl scripts
  404:         if ($text =~ /^[^#].*<m[ >]/m) {
  405:           if ($warnings) {
  406:             print "WARNING: <m> is used in a script, it should be converted by hand\n";
  407:           }
  408:         }
  409:       }
  410:     }
  411:   }
  412:   my @ms = $root->getElementsByTagName('m');
  413:   foreach my $m (@ms) {
  414:     if (!defined $m->firstChild) {
  415:       $m->parentNode->removeChild($m);
  416:       next;
  417:     }
  418:     if (defined $m->firstChild->nextSibling || $m->firstChild->nodeType != XML_TEXT_NODE) {
  419:       if ($warnings) {
  420:         print "WARNING: m value is not simple text\n";
  421:       }
  422:       next;
  423:     }
  424:     my $text = $m->firstChild->nodeValue;
  425:     my $text_before_variable_replacement = $text;
  426:     my $var_key1 = 'dfhg3df54hg65hg4';
  427:     my $var_key2 = 'dfhg654d6f5g4h5f';
  428:     my $eval = defined $m->getAttribute('eval') && $m->getAttribute('eval') eq 'on';
  429:     if ($eval) {
  430:       # replace variables
  431:       foreach my $variable (@variables) {
  432:         my $replacement = $var_key1.$variable.$var_key2;
  433:         $text =~ s/\$$variable(?![a-zA-Z])/$replacement/ge;
  434:         $text =~ s/\$\{$variable\}/$replacement/ge;
  435:       }
  436:     }
  437:     # check if the expression is enclosed in math separators: $ $$ \( \) \[ \]
  438:     # if so, replace the whole node by dtm or tm
  439:     my $new_text;
  440:     my $new_node_name;
  441:     if ($text =~ /^\s*\$\$([^\$]*)\$\$\s*$/) {
  442:       $new_node_name = 'dtm';
  443:       $new_text = $1;
  444:     } elsif ($text =~ /^\s*\\\[(.*)\\\]\s*$/s) {
  445:       $new_node_name = 'dtm';
  446:       $new_text = $1;
  447:     } elsif ($text =~ /^\s*\$([^\$]*)\$\s*$/) {
  448:       $new_node_name = 'tm';
  449:       $new_text = $1;
  450:     } elsif ($text =~ /^\s*\\\((.*)\\\)\s*$/s) {
  451:       $new_node_name = 'tm';
  452:       $new_text = $1;
  453:     }
  454:     if (defined $new_node_name) {
  455:       if ($eval) {
  456:         foreach my $variable (@variables) {
  457:           my $replacement = $var_key1.$variable.$var_key2;
  458:           $new_text =~ s/$replacement([a-zA-Z])/\${$variable}$1/g;
  459:           $new_text =~ s/$replacement/\$$variable/g;
  460:         }
  461:       }
  462:       my $new_node = $doc->createElement($new_node_name);
  463:       if ($eval) {
  464:         $new_node->setAttribute('eval', 'on');
  465:       }
  466:       $new_node->appendChild($doc->createTextNode($new_text));
  467:       $m->parentNode->replaceChild($new_node, $m);
  468:       next;
  469:     }
  470:     if ($text !~ /\$|\\\(|\\\)|\\\[|\\\]/) {
  471:       # there are no math separators inside
  472:       # try to guess if this is meant as math
  473:       my $found_math = 0;
  474:       foreach my $symbol (@latex_math) {
  475:         if (index($text, $symbol) != -1) {
  476:           $found_math = 1;
  477:           last;
  478:         }
  479:       }
  480:       if ($found_math) {
  481:         # interpret the whole text as LaTeX inline math
  482:         my $new_node = $doc->createElement('tm');
  483:         if ($eval) {
  484:           $new_node->setAttribute('eval', 'on');
  485:         }
  486:         $new_node->appendChild($doc->createTextNode($text_before_variable_replacement));
  487:         $m->parentNode->replaceChild($new_node, $m);
  488:         next;
  489:       }
  490:       # no math symbol found, we will convert the text with tth
  491:     }
  492:     
  493:     # there are math separators inside, even after hiding variables, or there was no math symbol
  494:     
  495:     # hide math parts inside before running tth
  496:     my $math_key1 = '#ghjgdh5hg45gf';
  497:     my $math_key2 = '#';
  498:     my @maths = ();
  499:     my @separators = (['$$','$$'], ['\\(','\\)'], ['\\[','\\]'], ['$','$']);
  500:     foreach my $seps (@separators) {
  501:       my $sep1 = $seps->[0];
  502:       my $sep2 = $seps->[1];
  503:       my $pos1 = index($text, $sep1);
  504:       if ($pos1 == -1) {
  505:         next;
  506:       }
  507:       my $pos2 = index($text, $sep2, $pos1+length($sep1));
  508:       while ($pos1 != -1 && $pos2 != -1) {
  509:         my $replace = substr($text, $pos1, $pos2+length($sep2)-$pos1);
  510:         push(@maths, $replace);
  511:         my $by = $math_key1.scalar(@maths).$math_key2;
  512:         $text = substr($text, 0, $pos1).$by.substr($text, $pos2+length($sep2));
  513:         $pos1 = index($text, $sep1);
  514:         if ($pos1 != -1) {
  515:           $pos2 = index($text, $sep2, $pos1+length($sep1));
  516:         }
  517:       }
  518:     }
  519:     # get HTML as text from tth
  520:     my $html_text = tth($text);
  521:     # replace math by replacements
  522:     for (my $i=0; $i < scalar(@maths); $i++) {
  523:       my $math = $maths[$i];
  524:       $math =~ s/&/&amp;/g;
  525:       $math =~ s/</&lt;/g;
  526:       $math =~ s/>/&gt;/g;
  527:       if ($math =~ /^\$\$(.*)\$\$$/s) {
  528:         $math = '<dtm>'.$1.'</dtm>';
  529:       } elsif ($math =~ /^\\\[(.*)\\\]$/s) {
  530:         $math = '<dtm>'.$1.'</dtm>';
  531:       } elsif ($math =~ /^\\\((.*)\\\)$/s) {
  532:         $math = '<tm>'.$1.'</tm>';
  533:       } elsif ($math =~ /^\$(.*)\$$/s) {
  534:         $math = '<tm>'.$1.'</tm>';
  535:       }
  536:       my $replace = $math_key1.($i+1).$math_key2;
  537:       $html_text =~ s/$replace/$math/;
  538:     }
  539:     # replace variables if necessary
  540:     if ($eval) {
  541:       foreach my $variable (@variables) {
  542:         my $replacement = $var_key1.$variable.$var_key2;
  543:         $html_text =~ s/$replacement([a-zA-Z])/\${$variable}$1/g;
  544:         $html_text =~ s/$replacement/\$$variable/g;
  545:       }
  546:     }
  547:     my $fragment = html_to_dom($html_text);
  548:     $doc->adoptNode($fragment);
  549:     $m->parentNode->replaceChild($fragment, $m);
  550:     
  551:   }
  552: }
  553: 
  554: # Returns the HTML equivalent of LaTeX input, using tth
  555: sub tth {
  556:   my ($text) = @_;
  557:   my $output = &tth::tth($text);
  558:   my $errorstring = &tth::ttherror();
  559:   if ($errorstring) {
  560:     die $errorstring;
  561:   }
  562:   # hopefully the temp file will not be removed before this point (otherwise we should use unlink_on_destroy 0)
  563:   $output =~ s/^\s*|\s*$//;
  564:   $output =~ s/<div class="p"><!----><\/div>/<br\/>/; # why is tth using such ugly markup for \newline ?
  565:   return $output;
  566: }
  567: 
  568: # transform simple HTML into a DOM fragment (which will need to be adopted by the document)
  569: sub html_to_dom {
  570:   my ($text) = @_;
  571:   $text = '<root>'.$text.'</root>';
  572:   my $textref = Apache::html_to_xml::html_to_xml(\$text);
  573:   utf8::upgrade($$textref); # otherwise the XML parser fails when the HTML parser turns &nbsp; into a character
  574:   my $dom_doc = XML::LibXML->load_xml(string => $textref);
  575:   my $root = $dom_doc->documentElement;
  576:   remove_empty_style($root);
  577:   my $fragment = $dom_doc->createDocumentFragment();
  578:   my $next;
  579:   for (my $n=$root->firstChild; defined $n; $n=$next) {
  580:     $next = $n->nextSibling;
  581:     $root->removeChild($n);
  582:     $fragment->appendChild($n);
  583:   }
  584:   return($fragment);
  585: }
  586: 
  587: # Use the linked sty files to guess which newly defined elements should be considered blocks.
  588: # Also adds to @containing_styles the sty elements that contain styles.
  589: # @param {string} fn - the file path (we only extract the directory path from it)
  590: # @param {string} res_dir - path of res directory parent (without the / at the end)
  591: # @param {Element} root - the root element
  592: sub add_sty_blocks {
  593:   my ($fn, $res_dir, $root, $all_block) = @_;
  594:   my $doc = $root->ownerDocument;
  595:   my @parserlibs = $doc->getElementsByTagName('parserlib');
  596:   my @libs = ();
  597:   foreach my $parserlib (@parserlibs) {
  598:     if (defined $parserlib->firstChild && $parserlib->firstChild->nodeType == XML_TEXT_NODE) {
  599:       my $value = $parserlib->firstChild->nodeValue;
  600:       $value =~ s/^\s+|\s+$//g;
  601:       if ($value ne '') {
  602:         push(@libs, $value);
  603:       }
  604:     }
  605:   }
  606:   my ($name, $path, $suffix) = fileparse($fn);
  607:   foreach my $sty (@libs) {
  608:     if (substr($sty, 0, 1) eq '/') {
  609:       $sty = $res_dir.$sty;
  610:     } else {
  611:       $sty = $path.$sty;
  612:     }
  613:     my $new_elements = parse_sty($sty, $all_block);
  614:     better_guess($root, $new_elements, $all_block);
  615:     my $new_blocks = $new_elements->{'block'};
  616:     my $new_inlines = $new_elements->{'inline'};
  617:     push(@$all_block, @{$new_blocks});
  618:     #push(@inlines, @{$new_inlines}); # we are not using a list of inline elements at this point
  619:   }
  620: }
  621: 
  622: ##
  623: # Parses a sty file and returns lists of block and inline elements.
  624: # @param {string} fn - the file path
  625: ##
  626: sub parse_sty {
  627:   my ($fn, $all_block) = @_;
  628:   my @blocks = ();
  629:   my @inlines = ();
  630:   my $p = HTML::TokeParser->new($fn);
  631:   if (! $p) {
  632:     die "post_xml.pl: parse_sty: Error reading $fn\n";
  633:   }
  634:   $p->empty_element_tags(1);
  635:   my $in_definetag = 0;
  636:   my $in_render = 0;
  637:   my %newtags = ();
  638:   my $newtag = '';
  639:   my $is_block = 0;
  640:   while (my $token = $p->get_token) {
  641:     if ($token->[0] eq 'S') {
  642:       my $tag = lc($token->[1]);
  643:       if ($tag eq 'definetag') {
  644:         $in_definetag = 1;
  645:         $is_block = 0;
  646:         my $attributes = $token->[2];
  647:         $newtag = $attributes->{'name'};
  648:         if (substr($newtag, 0, 1) eq '/') {
  649:           $newtag = substr($newtag, 1);
  650:         }
  651:       } elsif ($in_definetag && $tag eq 'render') {
  652:         $in_render = 1;
  653:         $is_block = 0;
  654:       } elsif ($in_render) {
  655:         if (string_in_array($all_block, $tag)) {
  656:           $is_block = 1;
  657:         }
  658:       }
  659:     } elsif ($token->[0] eq 'E') {
  660:       my $tag = lc($token->[1]);
  661:       if ($tag eq 'definetag') {
  662:         $in_definetag = 0;
  663:         if (defined $newtags{$newtag}) {
  664:           $newtags{$newtag} = $newtags{$newtag} || $is_block;
  665:         } else {
  666:           $newtags{$newtag} = $is_block;
  667:         }
  668:       } elsif ($in_definetag && $tag eq 'render') {
  669:         $in_render = 0;
  670:       }
  671:     }
  672:   }
  673:   foreach $newtag (keys(%newtags)) {
  674:     if ($newtags{$newtag} == 1) {
  675:       push(@blocks, $newtag);
  676:     } else {
  677:       push(@inlines, $newtag);
  678:     }
  679:   }
  680:   return {'block'=>\@blocks, 'inline'=>\@inlines};
  681: }
  682: 
  683: ##
  684: # Marks as block the elements that contain block elements in the input file.
  685: # Also adds to @containing_styles the sty elements that contain styles.
  686: # @param {string} fn - the file path
  687: # @param {Hash<string,Array>} new_elements - contains arrays in 'block' and 'inline'
  688: ##
  689: sub better_guess {
  690:   my ($root, $new_elements, $all_block) = @_;
  691:   my $new_blocks = $new_elements->{'block'};
  692:   my $new_inlines = $new_elements->{'inline'};
  693:   
  694:   my @change = (); # change these elements from inline to block
  695:   foreach my $tag (@{$new_inlines}) {
  696:     my @nodes = $root->getElementsByTagName($tag);
  697:     NODE_LOOP: foreach my $node (@nodes) {
  698:       for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
  699:         if ($child->nodeType == XML_ELEMENT_NODE) {
  700:           if (string_in_array($all_block, $child->nodeName) || string_in_array($new_blocks, $child->nodeName)) {
  701:             push(@change, $tag);
  702:             last NODE_LOOP;
  703:           }
  704:         }
  705:       }
  706:     }
  707:   }
  708:   foreach my $inline (@change) {
  709:     my $index = 0;
  710:     $index++ until $new_inlines->[$index] eq $inline;
  711:     splice(@{$new_inlines}, $index, 1);
  712:     push(@{$new_blocks}, $inline);
  713:   }
  714:   # add to @containing_styles when a style is used inside
  715:   # NOTE: some sty elements will be added even though they should not, but if we don't do that
  716:   # all style will be removed in the sty elements.
  717:   foreach my $tag ((@{$new_blocks}, @{$new_inlines})) {
  718:     my @nodes = $root->getElementsByTagName($tag);
  719:     NODE_LOOP: foreach my $node (@nodes) {
  720:       for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
  721:         if ($child->nodeType == XML_ELEMENT_NODE) {
  722:           if (string_in_array(\@html_styles, $child->nodeName)) {
  723:             push(@containing_styles, $tag);
  724:             last NODE_LOOP;
  725:           }
  726:         }
  727:       }
  728:     }
  729:   }
  730: }
  731: 
  732: # When a style element contains a block, move the style inside the block where it is allowed.
  733: # style/block/other -> block/style/other
  734: # When a style is used where it is not allowed, move it inside its children or remove it (unless it contains only text)
  735: # element_not_containing_styles/style/other -> element_not_containing_styles/other/style (except if other is a style)
  736: # The fix is not perfect in the case of element_not_containing_styles/style1/style2/block/text (style1 will be lost):
  737: # element_not_containing_styles/style1/style2/block/text -> element_not_containing_styles/block/style2/text
  738: # (a solution to this problem would be to merge the styles in a span)
  739: # NOTE: .sty defined elements might have been added to @containing_styles by better_guess().
  740: sub fix_block_styles {
  741:   my ($element, $all_block) = @_;
  742:   my $doc = $element->ownerDocument;
  743:   if (string_in_array(\@html_styles, $element->nodeName)) {
  744:     # move spaces out of the style element
  745:     if (defined $element->firstChild && $element->firstChild->nodeType == XML_TEXT_NODE) {
  746:       my $child = $element->firstChild;
  747:       if ($child->nodeValue =~ /^(\s+)(\S.*)$/s) {
  748:         $element->parentNode->insertBefore($doc->createTextNode($1), $element);
  749:         $child->setData($2);
  750:       }
  751:     }
  752:     if (defined $element->lastChild && $element->lastChild->nodeType == XML_TEXT_NODE) {
  753:       my $child = $element->lastChild;
  754:       if ($child->nodeValue =~ /^(.*\S)(\s+)$/s) {
  755:         $element->parentNode->insertAfter($doc->createTextNode($2), $element);
  756:         $child->setData($1);
  757:       }
  758:     }
  759:     
  760:     my $found_block = 0;
  761:     for (my $child=$element->firstChild; defined $child; $child=$child->nextSibling) {
  762:       if ($child->nodeType == XML_ELEMENT_NODE && string_in_array($all_block, $child->nodeName)) {
  763:         $found_block = 1;
  764:         last;
  765:       }
  766:     }
  767:     my $no_style_here = !string_in_array(\@containing_styles, $element->parentNode->nodeName);
  768:     if ($found_block || $no_style_here) {
  769:       # there is a block or the style is not allowed here,
  770:       # the style element has to be replaced by its modified children
  771:       my $s; # a clone of the style
  772:       my $next;
  773:       for (my $child=$element->firstChild; defined $child; $child=$next) {
  774:         $next = $child->nextSibling;
  775:         if ($child->nodeType == XML_ELEMENT_NODE && (string_in_array($all_block, $child->nodeName) ||
  776:             $child->nodeName eq 'br' || $no_style_here)) {
  777:           # avoid inverting a style with a style with $no_style_here (that would cause endless recursion)
  778:           if (!$no_style_here || (!string_in_array(\@html_styles, $child->nodeName) &&
  779:               string_in_array(\@containing_styles, $child->nodeName))) {
  780:             # block node or inline node when the style is not allowed:
  781:             # move all children inside the style, and make the style the only child
  782:             $s = $element->cloneNode();
  783:             my $next2;
  784:             for (my $child2=$child->firstChild; defined $child2; $child2=$next2) {
  785:               $next2 = $child2->nextSibling;
  786:               $child->removeChild($child2);
  787:               $s->appendChild($child2);
  788:             }
  789:             $child->appendChild($s);
  790:           }
  791:           $s = undef;
  792:         } elsif (($child->nodeType == XML_TEXT_NODE && $child->nodeValue !~ /^\s*$/) ||
  793:             $child->nodeType == XML_ELEMENT_NODE) {
  794:           # if the style is allowed, move text and inline nodes inside the style
  795:           if (!$no_style_here) {
  796:             if (!defined $s) {
  797:               $s = $element->cloneNode();
  798:               $element->insertBefore($s, $child);
  799:             }
  800:             $element->removeChild($child);
  801:             $s->appendChild($child);
  802:           }
  803:         } else {
  804:           # do not put other nodes inside the style
  805:           $s = undef;
  806:         }
  807:       }
  808:       # now replace by children and fix them
  809:       my $parent = $element->parentNode;
  810:       for (my $child=$element->firstChild; defined $child; $child=$next) {
  811:         $next = $child->nextSibling;
  812:         $element->removeChild($child);
  813:         $parent->insertBefore($child, $element);
  814:         if ($child->nodeType == XML_ELEMENT_NODE) {
  815:           fix_block_styles($child, $all_block);
  816:         }
  817:       }
  818:       $parent->removeChild($element);
  819:       return;
  820:     }
  821:   }
  822:   # otherwise fix all children
  823:   my $next;
  824:   for (my $child=$element->firstChild; defined $child; $child=$next) {
  825:     $next = $child->nextSibling;
  826:     if ($child->nodeType == XML_ELEMENT_NODE) {
  827:       fix_block_styles($child, $all_block);
  828:     }
  829:   }
  830: }
  831: 
  832: # removes empty font elements and font elements that contain at least one block element
  833: # replaces other font elements by equivalent span
  834: sub fix_fonts {
  835:   my ($root, $all_block) = @_;
  836:   my $doc = $root->ownerDocument;
  837:   my @fonts = $root->getElementsByTagName('font');
  838:   @fonts = reverse(@fonts); # to deal with the ancestor last in the case of font/font
  839:   foreach my $font (@fonts) {
  840:     my $block = 0;
  841:     for (my $child=$font->firstChild; defined $child; $child=$child->nextSibling) {
  842:       if (string_in_array($all_block, $child->nodeName) || string_in_array(\@inline_like_block, $child->nodeName)) {
  843:         $block = 1;
  844:         last;
  845:       }
  846:     }
  847:     if (!defined $font->firstChild || $block) {
  848:       # empty font or font containing block elements
  849:       # replace this node by its content
  850:       replace_by_children($font);
  851:     } else {
  852:       # replace by equivalent span
  853:       my $color = get_non_empty_attribute($font, 'color');
  854:       my $size = get_non_empty_attribute($font, 'size');
  855:       my $face = get_non_empty_attribute($font, 'face');
  856:       if (defined $face) {
  857:         $face =~ s/^,|,$//;
  858:       }
  859:       if (!defined $color && !defined $size && !defined $face) {
  860:         # useless font element: replace this node by its content
  861:         replace_by_children($font);
  862:         next;
  863:       }
  864:       my $replacement;
  865:       tie (my %properties, 'Tie::IxHash', ());
  866:       if (!defined $color && !defined $size && defined $face && lc($face) eq 'symbol') {
  867:         $replacement = $doc->createDocumentFragment();
  868:       } else {
  869:         $replacement = $doc->createElement('span');
  870:         my $css = '';
  871:         if (defined $color) {
  872:           $color =~ s/^x/#/;
  873:           $properties{'color'} = $color;
  874:         }
  875:         if (defined $size) {
  876:           my %hash = (
  877:             '1' => 'x-small',
  878:             '2' => 'small',
  879:             '3' => 'medium',
  880:             '4' => 'large',
  881:             '5' => 'x-large',
  882:             '6' => 'xx-large',
  883:             '7' => '300%',
  884:             '-1' => 'small',
  885:             '-2' => 'x-small',
  886:             '+1' => 'large',
  887:             '+2' => 'x-large',
  888:             '+3' => 'xx-large',
  889:             '+4' => '300%',
  890:           );
  891:           my $value = $hash{$size};
  892:           if (!defined $value) {
  893:             $value = 'medium';
  894:           }
  895:           $properties{'font-size'} = $value;
  896:         }
  897:         if (defined $face) {
  898:           if (lc($face) ne 'symbol' && lc($face) ne 'bold') {
  899:             $properties{'font-family'} = $face;
  900:           }
  901:         }
  902:         set_css_properties($replacement, \%properties);
  903:       }
  904:       if (defined $face && lc($face) eq 'symbol') {
  905:         # convert all content to unicode
  906:         my $next;
  907:         for (my $child=$font->firstChild; defined $child; $child=$next) {
  908:           $next = $child->nextSibling;
  909:           if ($child->nodeType == XML_TEXT_NODE) {
  910:             my $value = $child->nodeValue;
  911:             $value =~ tr/ABGDEZHQIKLMNXOPRSTUFCYWabgdezhqiklmnxoprVstufcywJjv¡«¬®/ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩαβγδεζηθικλμνξοπρςστυφχψωϑϕϖϒ↔←→/;
  912:             $child->setData($value);
  913:           }
  914:         }
  915:       }
  916:       # replace the font node
  917:       if ($replacement->nodeType == XML_ELEMENT_NODE && !defined $font->previousSibling &&
  918:           !defined $font->nextSibling && string_in_array(\@accepting_style, $font->parentNode->nodeName)) {
  919:         # use CSS on the parent block and replace font by its children instead of using a new element
  920:         set_css_properties($font->parentNode, \%properties);
  921:         replace_by_children($font);
  922:       } else {
  923:         # move all font children inside the replacement (span or fragment)
  924:         my $next;
  925:         for (my $child=$font->firstChild; defined $child; $child=$next) {
  926:           $next = $child->nextSibling;
  927:           $font->removeChild($child);
  928:           $replacement->appendChild($child);
  929:         }
  930:         # replace font
  931:         $font->parentNode->replaceChild($replacement, $font);
  932:       }
  933:     }
  934:   }
  935:   $root->normalize();
  936: }
  937: 
  938: # replaces u by <span style="text-decoration: underline">
  939: sub replace_u {
  940:   my ($root) = @_;
  941:   my $doc = $root->ownerDocument;
  942:   my @us = $root->getElementsByTagName('u');
  943:   foreach my $u (@us) {
  944:     my $span = $doc->createElement('span');
  945:     $span->setAttribute('style', 'text-decoration: underline');
  946:     my $next;
  947:     for (my $child=$u->firstChild; defined $child; $child=$next) {
  948:       $next = $child->nextSibling;
  949:       $u->removeChild($child);
  950:       $span->appendChild($child);
  951:     }
  952:     $u->parentNode->replaceChild($span, $u);
  953:   }
  954: }
  955: 
  956: # removes CDATA sections tags that have not been parsed correcty by the HTML parser
  957: # also removes bad comments in script elements
  958: sub remove_bad_cdata_sections {
  959:   my ($root) = @_;
  960:   my $doc = $root->ownerDocument;
  961:   foreach my $name (@preserve_elements) {
  962:     my @nodes = $root->getElementsByTagName($name);
  963:     foreach my $node (@nodes) {
  964:       if (defined $node->firstChild && $node->firstChild->nodeType == XML_TEXT_NODE) {
  965:         my $value = $node->firstChild->nodeValue;
  966:         if ($name eq 'script' && (!defined $node->getAttribute('type') || $node->getAttribute('type') ne 'loncapa/perl') &&
  967:             !defined $node->firstChild->nextSibling && $value =~ /^(\s*)<!--(.*)-->(\s*)$/) {
  968:           # web browsers interpret that as a real comment when it is on 1 line, but the Perl HTML parser thinks it is the script
  969:           # -> turning it back into a comment
  970:           # (this is only true for Javascript script elements, since LON-CAPA does not parse loncapa/perl scripts in the same way)
  971:           $node->removeChild($node->firstChild);
  972:           $node->appendChild($doc->createComment($2));
  973:           next;
  974:         }
  975:         # at the beginning:
  976:         $value =~ s/^(\s*)<!\[CDATA\[/$1/; # <![CDATA[
  977:         $value =~ s/^(\s*)\/\*\s*<!\[CDATA\[\s*\*\//$1/; # /* <![CDATA[ */
  978:         $value =~ s/^(\s*)\/\/\s*<!\[CDATA\[/$1/; # // <![CDATA[
  979:         $value =~ s/^(\s*)(\/\/)?\s*<!--/$1/; # // <!--
  980:         # at the end:
  981:         $value =~ s/\/\/\s*\]\]>(\s*)$/$1/; # // ]]>
  982:         $value =~ s/\]\]>(\s*)$/$1/; # ]]>
  983:         $value =~ s/(\/\/)?\s*-->(\s*)$/$2/; # // -->
  984:         $value =~ s/\/\*\s*\]\]>\s*\*\/(\s*)$/$1/; # /* ]]> */
  985:         
  986:         $value = "\n".$value."\n";
  987:         $value =~ s/\s*(\n[ \t]*)/$1/;
  988:         $value =~ s/\s+$/\n/;
  989:         $node->firstChild->setData($value);
  990:       }
  991:     }
  992:   }
  993: }
  994: 
  995: # adds CDATA sections to scripts
  996: sub add_cdata_sections {
  997:   my ($root) = @_;
  998:   my $doc = $root->ownerDocument;
  999:   my @scripts = $root->getElementsByTagName('script');
 1000:   my @answers = $root->getElementsByTagName('answer');
 1001:   foreach my $answer (@answers) {
 1002:     my $ancestor = $answer->parentNode;
 1003:     my $found_capa_response = 0;
 1004:     while (defined $ancestor) {
 1005:       if ($ancestor->nodeName eq 'numericalresponse' || $ancestor->nodeName eq 'formularesponse') {
 1006:         $found_capa_response = 1;
 1007:         last;
 1008:       }
 1009:       $ancestor = $ancestor->parentNode;
 1010:     }
 1011:     if (!$found_capa_response) {
 1012:       push(@scripts, $answer);
 1013:     }
 1014:   }
 1015:   foreach my $script (@scripts) {
 1016:     # use a CDATA section in the normal situation, for any script
 1017:     my $first = $script->firstChild;
 1018:     if (defined $first && $first->nodeType == XML_TEXT_NODE && !defined $first->nextSibling) {
 1019:       my $cdata = $doc->createCDATASection($first->nodeValue);
 1020:       $script->replaceChild($cdata, $first);
 1021:     }
 1022:   }
 1023: }
 1024: 
 1025: # removes "<!--" and "-->" at the beginning and end of style elements
 1026: sub fix_style_element {
 1027:   my ($root) = @_;
 1028:   my @styles = $root->getElementsByTagName('style');
 1029:   foreach my $style (@styles) {
 1030:     if (defined $style->firstChild && $style->firstChild->nodeType == XML_TEXT_NODE &&
 1031:         !defined $style->firstChild->nextSibling) {
 1032:       my $text = $style->firstChild->nodeValue;
 1033:       if ($text =~ /^\s*<!--(.*)-->\s*$/s) {
 1034:         $style->firstChild->setData($1);
 1035:       }
 1036:     }
 1037:   }
 1038: }
 1039: 
 1040: # create missing cells at the end of table rows
 1041: sub fix_tables {
 1042:   my ($root) = @_;
 1043:   my @tables = $root->getElementsByTagName('table');
 1044:   foreach my $table (@tables) {
 1045:     fix_cells($table);
 1046:     foreach my $tbody ($table->getChildrenByTagName('tbody')) {
 1047:       fix_cells($tbody);
 1048:     }
 1049:     foreach my $thead ($table->getChildrenByTagName('thead')) {
 1050:       fix_cells($thead);
 1051:     }
 1052:     foreach my $tfoot ($table->getChildrenByTagName('tfoot')) {
 1053:       fix_cells($tfoot);
 1054:     }
 1055:   }
 1056: }
 1057: 
 1058: # create missing cells at the end of table rows
 1059: sub fix_cells {
 1060:   my ($table) = @_; # could actually be table, tbody, thead or tfoot
 1061:   my $doc = $table->ownerDocument;
 1062:   my @nb_cells = ();
 1063:   my $max_nb_cells = 0;
 1064:   my @rowspans = ();
 1065:   my @trs = $table->getChildrenByTagName('tr');
 1066:   foreach my $tr (@trs) {
 1067:     my $nb_cells;
 1068:     if (defined $rowspans[0]) {
 1069:       $nb_cells = shift(@rowspans);
 1070:     } else {
 1071:       $nb_cells = 0;
 1072:     }
 1073:     for (my $cell=$tr->firstChild; defined $cell; $cell=$cell->nextSibling) {
 1074:       if ($cell->nodeName eq 'td' || $cell->nodeName eq 'th') {
 1075:         my $colspan = $cell->getAttribute('colspan');
 1076:         if (defined $colspan && $colspan =~ /^\s*[0-9]+\s*$/) {
 1077:           $nb_cells += $colspan;
 1078:         } else {
 1079:           $nb_cells++;
 1080:         }
 1081:         my $rowspan = $cell->getAttribute('rowspan');
 1082:         if (defined $rowspan && $rowspan =~ /^\s*[0-9]+\s*$/) {
 1083:           for (my $i=0; $i < $rowspan-1; $i++) {
 1084:             if (!defined $rowspans[$i]) {
 1085:               $rowspans[$i] = 1;
 1086:             } else {
 1087:               $rowspans[$i]++;
 1088:             }
 1089:           }
 1090:         }
 1091:       }
 1092:     }
 1093:     push(@nb_cells, $nb_cells);
 1094:     if ($nb_cells > $max_nb_cells) {
 1095:       $max_nb_cells = $nb_cells;
 1096:     }
 1097:   }
 1098:   foreach my $tr (@trs) {
 1099:     my $nb_cells = shift(@nb_cells);
 1100:     if ($nb_cells < $max_nb_cells) {
 1101:       for (1..($max_nb_cells - $nb_cells)) {
 1102:         $tr->appendChild($doc->createElement('td'));
 1103:       }
 1104:     }
 1105:   }
 1106: }
 1107: 
 1108: # replaces ul/ul by ul/li/ul and the same for ol (using the previous li if possible)
 1109: # also adds a ul element when a li has no ul/ol ancestor
 1110: sub fix_lists {
 1111:   my ($root) = @_;
 1112:   my $doc = $root->ownerDocument;
 1113:   my @uls = $root->getElementsByTagName('ul');
 1114:   my @ols = $root->getElementsByTagName('ol');
 1115:   my @lists = (@uls, @ols);
 1116:   foreach my $list (@lists) {
 1117:     my $next;
 1118:     for (my $child=$list->firstChild; defined $child; $child=$next) {
 1119:       $next = $child->nextSibling;
 1120:       if ($child->nodeType == XML_ELEMENT_NODE && string_in_array(['ul','ol'], $child->nodeName)) {
 1121:         my $previous = $child->previousNonBlankSibling(); # note: non-DOM method
 1122:         $list->removeChild($child);
 1123:         if (defined $previous && $previous->nodeType == XML_ELEMENT_NODE && $previous->nodeName eq 'li') {
 1124:           $previous->appendChild($child);
 1125:         } else {
 1126:           my $li = $doc->createElement('li');
 1127:           $li->appendChild($child);
 1128:           if (!defined $next) {
 1129:             $list->appendChild($li);
 1130:           } else {
 1131:             $list->insertBefore($li, $next);
 1132:           }
 1133:         }
 1134:       }
 1135:     }
 1136:   }
 1137:   my @lis = $root->getElementsByTagName('li');
 1138:   foreach my $li (@lis) {
 1139:     my $found_list_ancestor = 0;
 1140:     my $ancestor = $li->parentNode;
 1141:     while (defined $ancestor) {
 1142:       if ($ancestor->nodeName eq 'ul' || $ancestor->nodeName eq 'ol') {
 1143:         $found_list_ancestor = 1;
 1144:         last;
 1145:       }
 1146:       $ancestor = $ancestor->parentNode;
 1147:     }
 1148:     if (!$found_list_ancestor) {
 1149:       # replace li by ul and add li under ul
 1150:       my $ul = $doc->createElement('ul');
 1151:       $li->parentNode->insertBefore($ul, $li);
 1152:       $li->parentNode->removeChild($li);
 1153:       $ul->appendChild($li);
 1154:       # add all other li afterwards inside ul (there might be text nodes in-between)
 1155:       my $next = $ul->nextSibling;
 1156:       while (defined $next) {
 1157:         my $next_next = $next->nextSibling;
 1158:         if ($next->nodeType == XML_TEXT_NODE && $next->nodeValue =~ /^\s*$/ &&
 1159:             defined $next_next && $next_next->nodeType == XML_ELEMENT_NODE && $next_next->nodeName eq 'li') {
 1160:           $next->parentNode->removeChild($next);
 1161:           $ul->appendChild($next);
 1162:           $next = $next_next;
 1163:           $next_next = $next_next->nextSibling;
 1164:         }
 1165:         if ($next->nodeType == XML_ELEMENT_NODE && $next->nodeName eq 'li') {
 1166:           $next->parentNode->removeChild($next);
 1167:           $ul->appendChild($next);
 1168:         } else {
 1169:           last;
 1170:         }
 1171:         $next = $next_next;
 1172:       }
 1173:     }
 1174:   }
 1175: }
 1176: 
 1177: # Some "image" elements are actually img element with a wrong name. This renames them.
 1178: # Amazingly enough, "<image src=..." displays an image in some browsers
 1179: # ("image" has existed at some point as an experimental HTML element).
 1180: sub fix_wrong_name_for_img {
 1181:   my ($root) = @_;
 1182:   my @images = $root->getElementsByTagName('image');
 1183:   foreach my $image (@images) {
 1184:     if (!defined $image->getAttribute('src')) {
 1185:       next;
 1186:     }
 1187:     my $found_correct_ancestor = 0;
 1188:     my $ancestor = $image->parentNode;
 1189:     while (defined $ancestor) {
 1190:       if ($ancestor->nodeName eq 'drawimage' || $ancestor->nodeName eq 'imageresponse') {
 1191:         $found_correct_ancestor = 1;
 1192:         last;
 1193:       }
 1194:       $ancestor = $ancestor->parentNode;
 1195:     }
 1196:     if ($found_correct_ancestor) {
 1197:       next;
 1198:     }
 1199:     # this really has to be renamed "img"
 1200:     $image->setNodeName('img');
 1201:   }
 1202: }
 1203: 
 1204: # Replaces many deprecated attributes and replaces them by equivalent CSS when possible
 1205: sub replace_deprecated_attributes_by_css {
 1206:   my ($root) = @_;
 1207:   
 1208:   fix_deprecated_in_tables($root);
 1209:   
 1210:   fix_deprecated_in_table_rows($root);
 1211:   
 1212:   fix_deprecated_in_table_cells($root);
 1213:   
 1214:   fix_deprecated_in_lists($root);
 1215:   
 1216:   fix_deprecated_in_list_items($root);
 1217:   
 1218:   fix_deprecated_in_hr($root);
 1219:   
 1220:   fix_deprecated_in_img($root);
 1221:   
 1222:   fix_deprecated_in_body($root);
 1223:   
 1224:   fix_align_attribute($root);
 1225: }
 1226: 
 1227: # Replaces deprecated attributes in tables
 1228: sub fix_deprecated_in_tables {
 1229:   my ($root) = @_;
 1230:   my @tables = $root->getElementsByTagName('table');
 1231:   foreach my $table (@tables) {
 1232:     tie (my %new_properties, 'Tie::IxHash', ());
 1233:     my $align = $table->getAttribute('align');
 1234:     if (defined $align) {
 1235:       $table->removeAttribute('align');
 1236:       $align = lc(trim($align));
 1237:     }
 1238:     if ($table->parentNode->nodeName eq 'center' || (defined $align && $align eq 'center') ||
 1239:         (defined $table->parentNode->getAttribute('align') && $table->parentNode->getAttribute('align') eq 'center')) {
 1240:       $new_properties{'margin-left'} = 'auto';
 1241:       $new_properties{'margin-right'} = 'auto';
 1242:     }
 1243:     if (defined $align && ($align eq 'left' || $align eq 'right')) {
 1244:       $new_properties{'float'} = $align;
 1245:     }
 1246:     my $width = $table->getAttribute('width');
 1247:     if (defined $width) {
 1248:       $table->removeAttribute('width');
 1249:       $width = trim($width);
 1250:       if ($width =~ /^[0-9]+$/) {
 1251:         $width .= 'px';
 1252:       }
 1253:       if ($width ne '') {
 1254:         $new_properties{'width'} = $width;
 1255:       }
 1256:     }
 1257:     my $height = $table->getAttribute('height');
 1258:     if (defined $height) {
 1259:       $table->removeAttribute('height');
 1260:       # no replacement for table height
 1261:     }
 1262:     my $bgcolor = $table->getAttribute('bgcolor');
 1263:     if (defined $bgcolor) {
 1264:       $table->removeAttribute('bgcolor');
 1265:       $bgcolor = trim($bgcolor);
 1266:       $bgcolor =~ s/^x\s*//;
 1267:       if ($bgcolor ne '') {
 1268:         $new_properties{'background-color'} = $bgcolor;
 1269:       }
 1270:     }
 1271:     my $frame = $table->getAttribute('frame');
 1272:     if (defined $frame) {
 1273:       $table->removeAttribute('frame');
 1274:       $frame = lc(trim($frame));
 1275:       if ($frame eq 'void') {
 1276:         $new_properties{'border'} = 'none';
 1277:       } elsif ($frame eq 'above') {
 1278:         $new_properties{'border-top'} = '1px solid black';
 1279:       } elsif ($frame eq 'below') {
 1280:         $new_properties{'border-bottom'} = '1px solid black';
 1281:       } elsif ($frame eq 'hsides') {
 1282:         $new_properties{'border-top'} = '1px solid black';
 1283:         $new_properties{'border-bottom'} = '1px solid black';
 1284:       } elsif ($frame eq 'vsides') {
 1285:         $new_properties{'border-left'} = '1px solid black';
 1286:         $new_properties{'border-right'} = '1px solid black';
 1287:       } elsif ($frame eq 'lhs') {
 1288:         $new_properties{'border-left'} = '1px solid black';
 1289:       } elsif ($frame eq 'rhs') {
 1290:         $new_properties{'border-right'} = '1px solid black';
 1291:       } elsif ($frame eq 'box') {
 1292:         $new_properties{'border'} = '1px solid black';
 1293:       } elsif ($frame eq 'border') {
 1294:         $new_properties{'border'} = '1px solid black';
 1295:       }
 1296:     }
 1297:     if (scalar(keys %new_properties) > 0) {
 1298:       set_css_properties($table, \%new_properties);
 1299:     }
 1300:     # we can't replace the border attribute without creating a style block, but we can improve things like border="BORDER"
 1301:     my $border = $table->getAttribute('border');
 1302:     if (defined $border) {
 1303:       $border = trim($border);
 1304:       if ($border !~ /^\s*[0-9]+\s*(px)?\s*$/) {
 1305:         $table->setAttribute('border', '1');
 1306:       }
 1307:     }
 1308:   }
 1309:   
 1310: }
 1311: 
 1312: # Replaces deprecated attributes in tr elements
 1313: sub fix_deprecated_in_table_rows {
 1314:   my ($root) = @_;
 1315:   my @trs = $root->getElementsByTagName('tr');
 1316:   foreach my $tr (@trs) {
 1317:     my $old_properties = get_css_properties($tr);
 1318:     tie (my %new_properties, 'Tie::IxHash', ());
 1319:     my $bgcolor = $tr->getAttribute('bgcolor');
 1320:     if (defined $bgcolor) {
 1321:       $tr->removeAttribute('bgcolor');
 1322:       if (!defined $old_properties->{'background-color'}) {
 1323:         $bgcolor = trim($bgcolor);
 1324:         $bgcolor =~ s/^x\s*//;
 1325:         if ($bgcolor ne '') {
 1326:           $new_properties{'background-color'} = $bgcolor;
 1327:         }
 1328:       }
 1329:     }
 1330:     my $align = $tr->getAttribute('align');
 1331:     if (defined $align && $align !~ /\s*char\s*/i) {
 1332:       $tr->removeAttribute('align');
 1333:       if (!defined $old_properties->{'text-align'}) {
 1334:         $align = lc(trim($align));
 1335:         if ($align ne '') {
 1336:           $new_properties{'text-align'} = $align;
 1337:         }
 1338:       }
 1339:     }
 1340:     my $valign = $tr->getAttribute('valign');
 1341:     if (defined $valign) {
 1342:       $tr->removeAttribute('valign');
 1343:       if (!defined $old_properties->{'vertical-align'}) {
 1344:         $valign = lc(trim($valign));
 1345:         if ($valign ne '') {
 1346:           $new_properties{'vertical-align'} = $valign;
 1347:         }
 1348:       }
 1349:     }
 1350:     if (scalar(keys %new_properties) > 0) {
 1351:       set_css_properties($tr, \%new_properties);
 1352:     }
 1353:   }
 1354: }
 1355: 
 1356: # Replaces deprecated attributes in table cells (td and th)
 1357: sub fix_deprecated_in_table_cells {
 1358:   my ($root) = @_;
 1359:   my @tds = $root->getElementsByTagName('td');
 1360:   my @ths = $root->getElementsByTagName('th');
 1361:   my @cells = (@tds, @ths);
 1362:   foreach my $cell (@cells) {
 1363:     my $old_properties = get_css_properties($cell);
 1364:     tie (my %new_properties, 'Tie::IxHash', ());
 1365:     my $width = $cell->getAttribute('width');
 1366:     if (defined $width) {
 1367:       $cell->removeAttribute('width');
 1368:       if (!defined $old_properties->{'width'}) {
 1369:         $width = trim($width);
 1370:         if ($width =~ /^[0-9]+$/) {
 1371:           $width .= 'px';
 1372:         }
 1373:         if ($width ne '') {
 1374:           $new_properties{'width'} = $width;
 1375:         }
 1376:       }
 1377:     }
 1378:     my $height = $cell->getAttribute('height');
 1379:     if (defined $height) {
 1380:       $cell->removeAttribute('height');
 1381:       if (!defined $old_properties->{'height'}) {
 1382:         $height = trim($height);
 1383:         if ($height =~ /^[0-9]+$/) {
 1384:           $height .= 'px';
 1385:         }
 1386:         if ($height ne '') {
 1387:           $new_properties{'height'} = $height;
 1388:         }
 1389:       }
 1390:     }
 1391:     my $bgcolor = $cell->getAttribute('bgcolor');
 1392:     if (defined $bgcolor) {
 1393:       $cell->removeAttribute('bgcolor');
 1394:       if (!defined $old_properties->{'background-color'}) {
 1395:         $bgcolor = trim($bgcolor);
 1396:         $bgcolor =~ s/^x\s*//;
 1397:         if ($bgcolor ne '') {
 1398:           $new_properties{'background-color'} = $bgcolor;
 1399:         }
 1400:       }
 1401:     }
 1402:     my $align = $cell->getAttribute('align');
 1403:     if (defined $align && $align !~ /\s*char\s*/i) {
 1404:       $cell->removeAttribute('align');
 1405:       if (!defined $old_properties->{'text-align'}) {
 1406:         $align = lc(trim($align));
 1407:         if ($align ne '') {
 1408:           $new_properties{'text-align'} = $align;
 1409:         }
 1410:       }
 1411:     }
 1412:     my $valign = $cell->getAttribute('valign');
 1413:     if (defined $valign) {
 1414:       $cell->removeAttribute('valign');
 1415:       if (!defined $old_properties->{'vertical-align'}) {
 1416:         $valign = lc(trim($valign));
 1417:         if ($valign ne '') {
 1418:           $new_properties{'vertical-align'} = $valign;
 1419:         }
 1420:       }
 1421:     }
 1422:     if (scalar(keys %new_properties) > 0) {
 1423:       set_css_properties($cell, \%new_properties);
 1424:     }
 1425:   }
 1426: }
 1427: 
 1428: # Replaces deprecated attributes in lists (ul and ol)
 1429: sub fix_deprecated_in_lists {
 1430:   my ($root) = @_;
 1431:   my @uls = $root->getElementsByTagName('ul');
 1432:   my @ols = $root->getElementsByTagName('ol');
 1433:   my @lists = (@uls, @ols);
 1434:   foreach my $list (@lists) {
 1435:     my $type = $list->getAttribute('type');
 1436:     if (defined $type) {
 1437:       my $lst = list_style_type($type);
 1438:       if (defined $lst) {
 1439:         $list->removeAttribute('type');
 1440:         if (!defined get_css_property($list, 'list-style-type')) {
 1441:           set_css_property($list, 'list-style-type', $lst);
 1442:         }
 1443:       }
 1444:     }
 1445:   }
 1446: }
 1447: 
 1448: # Replaces deprecated attributes in list items (li)
 1449: sub fix_deprecated_in_list_items {
 1450:   my ($root) = @_;
 1451:   my @lis = $root->getElementsByTagName('li');
 1452:   foreach my $li (@lis) {
 1453:     my $type = $li->getAttribute('type');
 1454:     if (defined $type) {
 1455:       my $lst = list_style_type($type);
 1456:       if (defined $lst) {
 1457:         $li->removeAttribute('type');
 1458:         if (!defined get_css_property($li, 'list-style-type')) {
 1459:           set_css_property($li, 'list-style-type', $lst);
 1460:         }
 1461:       }
 1462:     }
 1463:   }
 1464: }
 1465: 
 1466: # returns the CSS list-style-type value equivalent to the given type attribute for a list or list item
 1467: sub list_style_type {
 1468:   my ($type) = @_;
 1469:   my $value;
 1470:   $type = trim($type);
 1471:   if (lc($type) eq 'circle') {
 1472:     $value = 'circle';
 1473:   } elsif (lc($type) eq 'disc') {
 1474:     $value = 'disc';
 1475:   } elsif (lc($type) eq 'square') {
 1476:     $value = 'square';
 1477:   } elsif ($type eq 'a') {
 1478:     $value = 'lower-latin';
 1479:   } elsif ($type eq 'A') {
 1480:     $value = 'upper-latin';
 1481:   } elsif ($type eq 'i') {
 1482:     $value = 'lower-roman';
 1483:   } elsif ($type eq 'I') {
 1484:     $value = 'upper-roman';
 1485:   } elsif ($type eq '1') {
 1486:     $value = 'decimal';
 1487:   }
 1488:   return $value;
 1489: }
 1490: 
 1491: # Replaces deprecated attributes in hr
 1492: sub fix_deprecated_in_hr {
 1493:   my ($root) = @_;
 1494:   my @hrs = $root->getElementsByTagName('hr');
 1495:   foreach my $hr (@hrs) {
 1496:     tie (my %new_properties, 'Tie::IxHash', ());
 1497:     my $align = $hr->getAttribute('align');
 1498:     if (defined $align) {
 1499:       $align = lc(trim($align));
 1500:       if ($align eq 'left') {
 1501:         $new_properties{'text-align'} = 'left';
 1502:         $new_properties{'margin-left'} = '0';
 1503:       } elsif ($align eq 'right') {
 1504:         $new_properties{'text-align'} = 'right';
 1505:         $new_properties{'margin-right'} = '0';
 1506:       }
 1507:       $hr->removeAttribute('align');
 1508:     }
 1509:     my $color = $hr->getAttribute('color');
 1510:     if (defined $color) {
 1511:       $color = trim($color);
 1512:       $color =~ s/^x\s*//;
 1513:       if ($color ne '') {
 1514:         $new_properties{'color'} = $color;
 1515:         $new_properties{'background-color'} = $color;
 1516:       }
 1517:       $hr->removeAttribute('color');
 1518:     }
 1519:     my $noshade = $hr->getAttribute('noshade');
 1520:     my $size = $hr->getAttribute('size');
 1521:     if (defined $noshade) {
 1522:       $new_properties{'border-width'} = '0';
 1523:       if (!defined $color) {
 1524:         $new_properties{'color'} = 'gray';
 1525:         $new_properties{'background-color'} = 'gray';
 1526:       }
 1527:       if (!defined $size) {
 1528:         $size = '2';
 1529:       }
 1530:       $hr->removeAttribute('noshade');
 1531:     }
 1532:     if (defined $size) {
 1533:       $size = trim($size);
 1534:       if ($size ne '') {
 1535:         $new_properties{'height'} = $size.'px';
 1536:       }
 1537:       if (defined $hr->getAttribute('size')) {
 1538:         $hr->removeAttribute('size');
 1539:       }
 1540:     }
 1541:     my $width = $hr->getAttribute('width');
 1542:     if (defined $width) {
 1543:       $width = trim($width);
 1544:       if ($width ne '') {
 1545:         if ($width !~ /\%$/) {
 1546:           $width .= 'px';
 1547:         }
 1548:         $new_properties{'width'} = $width;
 1549:       }
 1550:       $hr->removeAttribute('width');
 1551:     }
 1552:     if (scalar(keys %new_properties) > 0) {
 1553:       set_css_properties($hr, \%new_properties);
 1554:     }
 1555:   }
 1556: }
 1557: 
 1558: # Replaces deprecated attributes in img
 1559: sub fix_deprecated_in_img {
 1560:   my ($root) = @_;
 1561:   my @imgs = $root->getElementsByTagName('img');
 1562:   foreach my $img (@imgs) {
 1563:     my $old_properties = get_css_properties($img);
 1564:     tie (my %new_properties, 'Tie::IxHash', ());
 1565:     my $align = $img->getAttribute('align');
 1566:     if (defined $align) {
 1567:       $align = lc(trim($align));
 1568:       if ($align eq 'middle' || $align eq 'top' || $align eq 'bottom') {
 1569:         $img->removeAttribute('align');
 1570:         if (!defined $old_properties->{'vertical-align'}) {
 1571:           $new_properties{'vertical-align'} = $align;
 1572:         }
 1573:       } elsif ($align eq 'left' || $align eq 'right') {
 1574:         $img->removeAttribute('align');
 1575:         if (!defined $old_properties->{'float'}) {
 1576:           $new_properties{'float'} = $align;
 1577:         }
 1578:       } elsif ($align eq 'center' || $align eq '') {
 1579:         $img->removeAttribute('align');
 1580:       }
 1581:     }
 1582:     my $border = $img->getAttribute('border');
 1583:     if (defined $border) {
 1584:       $border = lc(trim($border));
 1585:       if ($border =~ /^[0-9]+\s*(px)?$/) {
 1586:         $img->removeAttribute('border');
 1587:         if (!defined $old_properties->{'border'}) {
 1588:           if ($border !~ /px$/) {
 1589:             $border .= 'px';
 1590:           }
 1591:           $new_properties{'border'} = $border.' solid black';
 1592:         }
 1593:       }
 1594:     }
 1595:     my $hspace = $img->getAttribute('hspace');
 1596:     if (defined $hspace) {
 1597:       $hspace = lc(trim($hspace));
 1598:       if ($hspace =~ /^[0-9]+\s*(px)?$/) {
 1599:         $img->removeAttribute('hspace');
 1600:         if (!defined $old_properties->{'margin-left'} || !defined $old_properties->{'margin-right'}) {
 1601:           if ($hspace !~ /px$/) {
 1602:             $hspace .= 'px';
 1603:           }
 1604:           $new_properties{'margin-left'} = $hspace;
 1605:           $new_properties{'margin-right'} = $hspace;
 1606:         }
 1607:       }
 1608:     }
 1609:     if (scalar(keys %new_properties) > 0) {
 1610:       set_css_properties($img, \%new_properties);
 1611:     }
 1612:   }
 1613: }
 1614: 
 1615: # Replaces deprecated attributes in htmlbody (the style attribute could be used in a div for output)
 1616: sub fix_deprecated_in_body {
 1617:   my ($root) = @_;
 1618:   my $doc = $root->ownerDocument;
 1619:   my @bodies = $root->getElementsByTagName('htmlbody');
 1620:   foreach my $body (@bodies) {
 1621:     my $old_properties = get_css_properties($body);
 1622:     tie (my %new_properties, 'Tie::IxHash', ());
 1623:     my $bgcolor = $body->getAttribute('bgcolor');
 1624:     if (defined $bgcolor) {
 1625:       $body->removeAttribute('bgcolor');
 1626:       if (!defined $old_properties->{'background-color'}) {
 1627:         $bgcolor = trim($bgcolor);
 1628:         $bgcolor =~ s/^x\s*//;
 1629:         if ($bgcolor ne '') {
 1630:           $new_properties{'background-color'} = $bgcolor;
 1631:         }
 1632:       }
 1633:     }
 1634:     my $color = $body->getAttribute('text');
 1635:     if (defined $color) {
 1636:       $body->removeAttribute('text');
 1637:       if (!defined $old_properties->{'color'}) {
 1638:         $color = trim($color);
 1639:         $color =~ s/^x\s*//;
 1640:         if ($color ne '') {
 1641:           $new_properties{'color'} = $color;
 1642:         }
 1643:       }
 1644:     }
 1645:     my $background = $body->getAttribute('background');
 1646:     if (defined $background && ($background =~ /\.jpe?g$|\.gif|\.png/i)) {
 1647:       $body->removeAttribute('background');
 1648:       if (!defined $old_properties->{'background-image'}) {
 1649:         $background = trim($background);
 1650:         if ($background ne '') {
 1651:           $new_properties{'background-image'} = 'url('.$background.')';
 1652:         }
 1653:       }
 1654:     }
 1655:     # NOTE: these attributes have never been standard and are better removed with no replacement
 1656:     foreach my $bad ('bottommargin', 'leftmargin', 'rightmargin', 'topmargin', 'marginheight', 'marginwidth') {
 1657:       if ($body->hasAttribute($bad)) {
 1658:         $body->removeAttribute($bad);
 1659:       }
 1660:     }
 1661:     # NOTE: link alink and vlink require a <style> block to be converted
 1662:     my $link = $body->getAttribute('link');
 1663:     my $alink = $body->getAttribute('alink');
 1664:     my $vlink = $body->getAttribute('vlink');
 1665:     if (defined $link || defined $alink || defined $vlink) {
 1666:       my $head;
 1667:       my @heads = $root->getElementsByTagName('htmlhead');
 1668:       if (scalar(@heads) > 0) {
 1669:         $head = $heads[0];
 1670:       } else {
 1671:         $head = $doc->createElement('htmlhead');
 1672:         $root->insertBefore($head, $root->firstChild);
 1673:       }
 1674:       my $style = $doc->createElement('style');
 1675:       $head->appendChild($style);
 1676:       my $css = "\n";
 1677:       if (defined $link) {
 1678:         $body->removeAttribute('link');
 1679:         $link = trim($link);
 1680:         $link =~ s/^x\s*//;
 1681:         $css .= '      a:link { color:'.$link.' }';
 1682:         $css .= "\n";
 1683:       }
 1684:       if (defined $alink) {
 1685:         $body->removeAttribute('alink');
 1686:         $alink = trim($alink);
 1687:         $alink =~ s/^x\s*//;
 1688:         $css .= '      a:active { color:'.$alink.' }';
 1689:         $css .= "\n";
 1690:       }
 1691:       if (defined $vlink) {
 1692:         $body->removeAttribute('vlink');
 1693:         $vlink = trim($vlink);
 1694:         $vlink =~ s/^x\s*//;
 1695:         $css .= '      a:visited { color:'.$vlink.' }';
 1696:         $css .= "\n";
 1697:       }
 1698:       $css .= '    ';
 1699:       $style->appendChild($doc->createTextNode($css));
 1700:     }
 1701:     if (scalar(keys %new_properties) > 0) {
 1702:       set_css_properties($body, \%new_properties);
 1703:     } elsif (!$body->hasAttributes) {
 1704:       $body->parentNode->removeChild($body);
 1705:     }
 1706:   }
 1707: }
 1708: 
 1709: # replaces <div align="center"> by <div style="text-align:center;">
 1710: # also for p and h1..h6
 1711: sub fix_align_attribute {
 1712:   my ($root) = @_;
 1713:   my @nodes = $root->getElementsByTagName('div');
 1714:   push(@nodes, $root->getElementsByTagName('p'));
 1715:   for (my $i=1; $i<=6; $i++) {
 1716:     push(@nodes, $root->getElementsByTagName('h'.$i));
 1717:   }
 1718:   foreach my $node (@nodes) {
 1719:     my $align = $node->getAttribute('align');
 1720:     if (defined $align) {
 1721:       $node->removeAttribute('align');
 1722:       $align = trim($align);
 1723:       if ($align ne '' && !defined get_css_property($node, 'text-align')) {
 1724:         set_css_property($node, 'text-align', lc($align));
 1725:       }
 1726:     }
 1727:   }
 1728: }
 1729: 
 1730: # replace center by a div or remove it if there is a table inside
 1731: sub replace_center {
 1732:   my ($root, $all_block) = @_;
 1733:   my $doc = $root->ownerDocument;
 1734:   my @centers = $root->getElementsByTagName('center');
 1735:   foreach my $center (@centers) {
 1736:     if ($center->getChildrenByTagName('table')->size() > 0) { # note: getChildrenByTagName is not DOM (LibXML specific)
 1737:       replace_by_children($center);
 1738:     } else {
 1739:       if ((!defined $center->previousSibling ||
 1740:           ($center->previousSibling->nodeType == XML_TEXT_NODE && $center->previousSibling->nodeValue =~ /^\s*$/ && !defined $center->previousSibling->previousSibling)) &&
 1741:           (!defined $center->nextSibling ||
 1742:           ($center->nextSibling->nodeType == XML_TEXT_NODE && $center->nextSibling->nodeValue =~ /^\s*$/ && !defined $center->nextSibling->nextSibling)) &&
 1743:           string_in_array(\@accepting_style, $center->parentNode->nodeName)) {
 1744:         # use CSS on the parent block and replace center by its children
 1745:         set_css_property($center->parentNode, 'text-align', 'center');
 1746:         replace_by_children($center);
 1747:       } else {
 1748:         # use p or div ? check if there is a block inside
 1749:         my $found_block = 0;
 1750:         for (my $child=$center->firstChild; defined $child; $child=$child->nextSibling) {
 1751:           if ($child->nodeType == XML_ELEMENT_NODE && string_in_array($all_block, $child->nodeName)) {
 1752:             $found_block = 1;
 1753:             last;
 1754:           }
 1755:         }
 1756:         my $new_node;
 1757:         if ($found_block) {
 1758:           $new_node = $doc->createElement('div');
 1759:           $new_node->setAttribute('style', 'text-align: center; margin: 0 auto');
 1760:         } else {
 1761:           $new_node = $doc->createElement('p');
 1762:           $new_node->setAttribute('style', 'text-align: center');
 1763:         }
 1764:         my $next;
 1765:         for (my $child=$center->firstChild; defined $child; $child=$next) {
 1766:           $next = $child->nextSibling;
 1767:           $center->removeChild($child);
 1768:           $new_node->appendChild($child);
 1769:         }
 1770:         $center->parentNode->replaceChild($new_node, $center);
 1771:       }
 1772:     }
 1773:   }
 1774: }
 1775: 
 1776: # replaces <nobr> by <span style="white-space:nowrap">
 1777: sub replace_nobr {
 1778:   my ($root) = @_;
 1779:   my @nobrs = $root->getElementsByTagName('nobr');
 1780:   foreach my $nobr (@nobrs) {
 1781:     if (!defined $nobr->previousSibling && !defined $nobr->nextSibling &&
 1782:         string_in_array(\@accepting_style, $nobr->parentNode->nodeName)) {
 1783:       # use CSS on the parent block
 1784:       set_css_property($nobr->parentNode, 'white-space', 'nowrap');
 1785:       replace_by_children($nobr);
 1786:     } else {
 1787:       $nobr->setNodeName('span');
 1788:       $nobr->setAttribute('style', 'white-space:nowrap');
 1789:     }
 1790:   }
 1791: }
 1792: 
 1793: # removes notsolved tags in the case <hintgroup showoncorrect="no"><notsolved>...</notsolved></hintgroup>
 1794: # and in the case <notsolved><hintgroup showoncorrect="no">...</hintgroup></notsolved>
 1795: sub remove_useless_notsolved {
 1796:   my ($root) = @_;
 1797:   my @hintgroups = $root->getElementsByTagName('hintgroup');
 1798:   foreach my $hintgroup (@hintgroups) {
 1799:     my $showoncorrect = get_non_empty_attribute($hintgroup, 'showoncorrect');
 1800:     if (!defined $showoncorrect || $showoncorrect eq 'no') {
 1801:       my @notsolveds = $hintgroup->getElementsByTagName('notsolved');
 1802:       foreach my $notsolved (@notsolveds) {
 1803:         replace_by_children($notsolved);
 1804:       }
 1805:     }
 1806:     my $parent = $hintgroup->parentNode;
 1807:     if ($parent->nodeName eq 'notsolved' && scalar(@{$parent->nonBlankChildNodes()}) == 1) {
 1808:       replace_by_children($parent);
 1809:     }
 1810:   }
 1811: }
 1812: 
 1813: # adds a paragraph inside if needed and calls fix_paragraph for all paragraphs (including new ones)
 1814: sub fix_paragraphs_inside {
 1815:   my ($node, $all_block) = @_;
 1816:   # blocks in which paragrahs will be added:
 1817:   my @blocks_with_p = ('loncapa','library','problem','part','problemtype','window','block','while','postanswerdate','preduedate','solved','notsolved','languageblock','instructorcomment','togglebox','standalone','body','form');
 1818:   my @fix_p_if_br_or_p = (@responses,'foil','item','text','label','hintgroup','hintpart','hint','web','windowlink','div','li','dd','td','th','blockquote');
 1819:   if ((string_in_array(\@blocks_with_p, $node->nodeName) && paragraph_needed($node)) ||
 1820:       (string_in_array(\@fix_p_if_br_or_p, $node->nodeName) && paragraph_inside($node))) {
 1821:     # if non-empty, add paragraphs where needed between all br and remove br
 1822:     # (it would be easier to just put everything in a p and fix it afterwards, but there are performance issues
 1823:     #  when a paragraph has many blocks directly inside)
 1824:     my $doc = $node->ownerDocument;
 1825:     my $p = undef;
 1826:     my @new_children = ();
 1827:     my $next;
 1828:     for (my $child=$node->firstChild; defined $child; $child=$next) {
 1829:       $next = $child->nextSibling;
 1830:       $node->removeChild($child);
 1831:       if ($child->nodeType == XML_ELEMENT_NODE && $child->nodeName eq 'br') {
 1832:         if (defined $p) {
 1833:           push(@new_children, $p);
 1834:         } else {
 1835:           push(@new_children, $doc->createElement('p'));
 1836:         }
 1837:         $p = undef;
 1838:       } elsif ($child->nodeType == XML_ELEMENT_NODE && string_in_array(\@inline_like_block, $child->nodeName)) {
 1839:         # inline_like_block: use the paragraph if there is one, otherwise do not create one
 1840:         if (defined $p) {
 1841:           $p->appendChild($child);
 1842:         } else {
 1843:           push(@new_children, $child);
 1844:         }
 1845:       } elsif ($child->nodeType == XML_ELEMENT_NODE && string_in_array($all_block, $child->nodeName)) {
 1846:         # these children are blocks and should not be in a paragraph
 1847:         if (defined $p) {
 1848:           push(@new_children, $p);
 1849:           $p = undef;
 1850:         }
 1851:         push(@new_children, $child);
 1852:       } elsif ($child->nodeType == XML_TEXT_NODE && $child->nodeValue =~ /^[ \t\f\n\r]*$/) {
 1853:         # blank text: add to paragraph if there is one and there is a next node, otherwise keep out of the paragraph
 1854:         if (defined $p) {
 1855:           if (defined $next) {
 1856:             $p->appendChild($child);
 1857:           } else {
 1858:             push(@new_children, $p);
 1859:             $p = undef;
 1860:             push(@new_children, $child);
 1861:           }
 1862:         } else {
 1863:           push(@new_children, $child);
 1864:         }
 1865:       } elsif ($child->nodeType == XML_TEXT_NODE ||
 1866:             $child->nodeType == XML_ELEMENT_NODE || $child->nodeType == XML_CDATA_SECTION_NODE ||
 1867:             $child->nodeType == XML_ENTITY_NODE || $child->nodeType == XML_ENTITY_REF_NODE) {
 1868:         # these children require a paragraph
 1869:         if (!defined $p) {
 1870:           $p = $doc->createElement('p');
 1871:         }
 1872:         $p->appendChild($child);
 1873:       } else {
 1874:         # these children do not require a paragraph (XML comments, PI)
 1875:         # -> do not move them in a new paragraph
 1876:         if (defined $p) {
 1877:           push(@new_children, $p);
 1878:           $p = undef;
 1879:         }
 1880:         push(@new_children, $child);
 1881:       }
 1882:     }
 1883:     if (defined $p) {
 1884:       push(@new_children, $p);
 1885:     }
 1886:     foreach my $child (@new_children) {
 1887:       $node->appendChild($child);
 1888:     }
 1889:   }
 1890:   # now fix the paragraphs everywhere, so that all inline nodes are inside a paragraph, and block nodes are outside
 1891:   my $next;
 1892:   for (my $child=$node->firstChild; defined $child; $child=$next) {
 1893:     $next = $child->nextSibling;
 1894:     if ($child->nodeType == XML_ELEMENT_NODE && defined $child->firstChild) {
 1895:       if ($child->nodeName eq 'p') {
 1896:         fix_paragraph($child, $all_block);
 1897:       } else {
 1898:         fix_paragraphs_inside($child, $all_block);
 1899:       }
 1900:     }
 1901:   }
 1902: }
 1903: 
 1904: # returns 1 if a paragraph is needed inside this node (assuming the parent can have paragraphs)
 1905: sub paragraph_needed {
 1906:   my ($node) = @_;
 1907:   for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
 1908:     if (($child->nodeType == XML_TEXT_NODE && $child->nodeValue !~ /^\s*$/) ||
 1909:         ($child->nodeType == XML_ELEMENT_NODE && !string_in_array(\@inline_like_block, $child->nodeName)) ||
 1910:         $child->nodeType == XML_CDATA_SECTION_NODE ||
 1911:         $child->nodeType == XML_ENTITY_NODE || $child->nodeType == XML_ENTITY_REF_NODE) {
 1912:       return(1);
 1913:     }
 1914:   }
 1915:   return(0);
 1916: }
 1917: 
 1918: # returns 1 if there is a paragraph or br in a child of this node, or inside an inline child
 1919: sub paragraph_inside {
 1920:   my ($node) = @_;
 1921:   # inline elements that can be split in half if there is a paragraph inside (currently all HTML):
 1922:   # (also used in first_block below)
 1923:   my @splitable_inline = ('span', 'a', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'code', 'kbd', 'samp', 'tt', 'ins', 'del', 'var', 'small', 'big', 'font', 'u');
 1924:   for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
 1925:     if ($child->nodeType == XML_ELEMENT_NODE) {
 1926:       my $name = $child->nodeName;
 1927:       if ($name eq 'p' || $name eq 'br') {
 1928:         return(1);
 1929:       } elsif (string_in_array(\@splitable_inline, $name)) {
 1930:         if (paragraph_inside($child)) {
 1931:           return(1);
 1932:         }
 1933:       }
 1934:     }
 1935:   }
 1936:   return(0);
 1937: }
 1938: 
 1939: # fixes paragraphs inside paragraphs (without a block in-between)
 1940: sub fix_paragraph {
 1941:   my ($p, $all_block) = @_;
 1942:   my $loop_right = 1; # this loops is to avoid out of memory errors with recurse, see below
 1943:   while ($loop_right) {
 1944:     $loop_right = 0;
 1945:     my $block = find_first_block($p, $all_block);
 1946:     if (defined $block) {
 1947:       my $trees = clone_ancestor_around_node($p, $block);
 1948:       my $doc = $p->ownerDocument;
 1949:       my $replacement = $doc->createDocumentFragment();
 1950:       my $left = $trees->{'left'};
 1951:       my $middle = $trees->{'middle'};
 1952:       my $right = $trees->{'right'};
 1953:       my $left_needs_p = 0; # 1 if it needs a paragraph (used to replace br later)
 1954:       
 1955:       if (defined $left) {
 1956:         # fix paragraphs inside, in case one of the descendants can have paragraphs inside (like numericalresponse/hintgroup):
 1957:         for (my $child=$left->firstChild; defined $child; $child=$child->nextSibling) {
 1958:           if ($child->nodeType == XML_ELEMENT_NODE) {
 1959:             fix_paragraphs_inside($child, $all_block);
 1960:           }
 1961:         }
 1962:         if (!paragraph_needed($left)) {
 1963:           # this was just blank text, comments or inline responses, it should not create a new paragraph
 1964:           my $next;
 1965:           for (my $child=$left->firstChild; defined $child; $child=$next) {
 1966:             $next = $child->nextSibling;
 1967:             $left->removeChild($child);
 1968:             $replacement->appendChild($child);
 1969:           }
 1970:         } else {
 1971:           $left_needs_p = 1;
 1972:           $replacement->appendChild($left);
 1973:         }
 1974:       }
 1975:       
 1976:       my $n = $middle->firstChild;
 1977:       while (defined $n) {
 1978:         if ($n->nodeType == XML_ELEMENT_NODE && (string_in_array($all_block, $n->nodeName) || $n->nodeName eq 'br')) {
 1979:           if ($n->nodeName eq 'p') {
 1980:             my $parent = $n->parentNode;
 1981:             # first apply recursion
 1982:             fix_paragraph($n, $all_block);
 1983:             # now the p might have been replaced by several nodes, which should replace the initial p
 1984:             my $next_block;
 1985:             for (my $block=$parent->firstChild; defined $block; $block=$next_block) {
 1986:               $next_block = $block->nextSibling;
 1987:               if ($block->nodeName eq 'p') {
 1988:                 $parent->removeChild($block);
 1989:                 # for each parent before $middle, clone in-between the p and its children (to preserve the styles)
 1990:                 if (defined $block->firstChild) {
 1991:                   for (my $p=$parent; $p!=$middle; $p=$p->parentNode) {
 1992:                     my $newp = $p->cloneNode(0);
 1993:                     my $next;
 1994:                     for (my $child=$block->firstChild; defined $child; $child=$next) {
 1995:                       $next = $child->nextSibling;
 1996:                       $block->removeChild($child);
 1997:                       $newp->appendChild($child);
 1998:                     }
 1999:                     $block->appendChild($newp);
 2000:                   }
 2001:                 }
 2002:               }
 2003:               $replacement->appendChild($block);
 2004:             }
 2005:           } else {
 2006:             # replace the whole p by this block, forgetting about intermediate inline elements
 2007:             $n->parentNode->removeChild($n);
 2008:             if ($n->nodeName eq 'br') {
 2009:               # replace a br by a paragraph if there was nothing before in the paragraph,
 2010:               # otherwise remove it because it already broke the paragraph in half
 2011:               if (!defined $left || !$left_needs_p) {
 2012:                 $replacement->appendChild($middle);
 2013:               }
 2014:             } else {
 2015:               fix_paragraphs_inside($n, $all_block);
 2016:               $replacement->appendChild($n);
 2017:             }
 2018:           }
 2019:           last;
 2020:         }
 2021:         $n = $n->firstChild;
 2022:         if (defined $n && defined $n->nextSibling) {
 2023:           die "Error in post_xml.fix_paragraph: block not found";
 2024:         }
 2025:       }
 2026:       
 2027:       if (defined $right) {
 2028:         if ($block->nodeName eq 'p') {
 2029:           # remove attributes on the right paragraph
 2030:           my @attributelist = $right->attributes();
 2031:           foreach my $att (@attributelist) {
 2032:             $right->removeAttribute($att->nodeName);
 2033:           }
 2034:         }
 2035:         if ($right->firstChild->nodeType == XML_TEXT_NODE && $right->firstChild->nodeValue =~ /^[ \t\f\n\r]*$/) {
 2036:           # remove the first text node with whitespace only from the p, it should not trigger the creation of a p
 2037:           # (but take nbsp into account, so we should not use \s here)
 2038:           my $first = $right->firstChild;
 2039:           $right->removeChild($first);
 2040:           $replacement->appendChild($first);
 2041:         }
 2042:         if (defined $right->firstChild) {
 2043:           if (paragraph_needed($right)) {
 2044:             $replacement->appendChild($right);
 2045:             #fix_paragraph($right, $all_block); This is taking way too much memory for blocks with many children
 2046:             # -> loop instead of recurse
 2047:             $loop_right = 1;
 2048:           } else {
 2049:             # this was just blank text, comments or inline responses, it should not create a new paragraph
 2050:             my $next;
 2051:             for (my $child=$right->firstChild; defined $child; $child=$next) {
 2052:               $next = $child->nextSibling;
 2053:               $right->removeChild($child);
 2054:               $replacement->appendChild($child);
 2055:               # fix paragraphs inside, in case one of the descendants can have paragraphs inside (like numericalresponse/hintgroup):
 2056:               if ($child->nodeType == XML_ELEMENT_NODE) {
 2057:                 fix_paragraphs_inside($child, $all_block);
 2058:               }
 2059:             }
 2060:           }
 2061:         }
 2062:       }
 2063:       
 2064:       $p->parentNode->replaceChild($replacement, $p);
 2065:       
 2066:       if ($loop_right) {
 2067:         $p = $right;
 2068:       }
 2069:       
 2070:     } else {
 2071:       # fix paragraphs inside, in case one of the descendants can have paragraphs inside (like numericalresponse/hintgroup):
 2072:       my $next;
 2073:       for (my $child=$p->firstChild; defined $child; $child=$next) {
 2074:         $next = $child->nextSibling;
 2075:         if ($child->nodeType == XML_ELEMENT_NODE) {
 2076:           fix_paragraphs_inside($child, $all_block);
 2077:         }
 2078:       }
 2079:     }
 2080:   }
 2081: }
 2082: 
 2083: sub find_first_block {
 2084:   my ($node, $all_block) = @_;
 2085:   # inline elements that can be split in half if there is a paragraph inside (currently all HTML):
 2086:   my @splitable_inline = ('span', 'a', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'code', 'kbd', 'samp', 'tt', 'ins', 'del', 'var', 'small', 'big', 'font', 'u');
 2087:   for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
 2088:     if ($child->nodeType == XML_ELEMENT_NODE) {
 2089:       if (string_in_array($all_block, $child->nodeName) || $child->nodeName eq 'br') {
 2090:         return($child);
 2091:       }
 2092:       if (string_in_array(\@splitable_inline, $child->nodeName)) {
 2093:         my $block = find_first_block($child, $all_block);
 2094:         if (defined $block) {
 2095:           return($block);
 2096:         }
 2097:       }
 2098:     }
 2099:   }
 2100:   return(undef);
 2101: }
 2102: 
 2103: # Creates clones of the ancestor containing the descendants before the node, at the node, and after the node.
 2104: # returns a hash with: left, middle, right (left and right can be undef)
 2105: sub clone_ancestor_around_node {
 2106:   my ($ancestor, $node) = @_;
 2107:   my $middle_node;
 2108:   my ($left, $middle, $right);
 2109:   for (my $child=$ancestor->firstChild; defined $child; $child=$child->nextSibling) {
 2110:     if ($child == $node || is_ancestor_of($child, $node)) {
 2111:       $middle_node = $child;
 2112:       last;
 2113:     }
 2114:   }
 2115:   if (!defined $middle_node) {
 2116:     die "error in split_ancestor_around_node: middle not found";
 2117:   }
 2118:   if (defined $middle_node->previousSibling) {
 2119:     $left = $ancestor->cloneNode(0);
 2120:     for (my $child=$ancestor->firstChild; $child != $middle_node; $child=$child->nextSibling) {
 2121:       $left->appendChild($child->cloneNode(1));
 2122:     }
 2123:   }
 2124:   $middle = $ancestor->cloneNode(0);
 2125:   if ($middle_node == $node) {
 2126:     $middle->appendChild($middle_node->cloneNode(1));
 2127:   } else {
 2128:     my $subres = clone_ancestor_around_node($middle_node, $node);
 2129:     my $subleft = $subres->{'left'};
 2130:     if (defined $subleft) {
 2131:       if (!defined $left) {
 2132:         $left = $ancestor->cloneNode(0);
 2133:       }
 2134:       $left->appendChild($subleft);
 2135:     }
 2136:     $middle->appendChild($subres->{'middle'});
 2137:     my $subright = $subres->{'right'};
 2138:     if (defined $subright) {
 2139:       $right = $ancestor->cloneNode(0);
 2140:       $right->appendChild($subright);
 2141:     }
 2142:   }
 2143:   if (defined $middle_node->nextSibling) {
 2144:     if (!defined $right) {
 2145:       $right = $ancestor->cloneNode(0);
 2146:     }
 2147:     for (my $child=$middle_node->nextSibling; defined $child; $child=$child->nextSibling) {
 2148:       $right->appendChild($child->cloneNode(1));
 2149:     }
 2150:   }
 2151:   my %result = ();
 2152:   $result{'left'} = $left;
 2153:   $result{'middle'} = $middle;
 2154:   $result{'right'} = $right;
 2155:   return(\%result);
 2156: }
 2157: 
 2158: sub is_ancestor_of {
 2159:   my ($n1, $n2) = @_;
 2160:   my $n = $n2->parentNode;
 2161:   while (defined $n) {
 2162:     if ($n == $n1) {
 2163:       return(1);
 2164:     }
 2165:     $n = $n->parentNode;
 2166:   }
 2167:   return(0);
 2168: }
 2169: 
 2170: # removes empty style elements and replaces the ones with only whitespaces inside by their content
 2171: # also remove hints that have become empty after empty style removal.
 2172: sub remove_empty_style {
 2173:   my ($root) = @_;
 2174:   # actually, preserve some elements like ins when they have whitespace, only remove if they are empty
 2175:   my @remove_if_empty = ('span', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'code', 'kbd', 'samp', 'tt', 'ins', 'del', 'var', 'small', 'big', 'font', 'u', 'hint');
 2176:   my @remove_if_blank = ('span', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'tt', 'var', 'small', 'big', 'font', 'u', 'hint');
 2177:   foreach my $name (@remove_if_empty) {
 2178:     my @nodes = $root->getElementsByTagName($name);
 2179:     while (scalar(@nodes) > 0) {
 2180:       my $node = pop(@nodes);
 2181:       if (!defined $node->firstChild) {
 2182:         my $parent = $node->parentNode;
 2183:         if (defined $node->previousSibling && $node->previousSibling->nodeType == XML_TEXT_NODE &&
 2184:             $node->previousSibling->nodeValue =~ /\$\S*$/) {
 2185:           # case $a<sup></sup>x
 2186:           my $value = $node->previousSibling->nodeValue;
 2187:           $value =~ s/\$(\S*)$/\$\{$1\}/;
 2188:           $node->previousSibling->setData($value);
 2189:         }
 2190:         $parent->removeChild($node);
 2191:         $parent->normalize();
 2192:         # now that we removed the node, check if the parent has become an empty style, and so on
 2193:         while (defined $parent && string_in_array(\@remove_if_empty, $parent->nodeName) && !defined $parent->firstChild) {
 2194:           my $grandparent = $parent->parentNode;
 2195:           $grandparent->removeChild($parent);
 2196:           remove_reference_from_array(\@nodes, $parent);
 2197:           $parent = $grandparent;
 2198:         }
 2199:       }
 2200:     }
 2201:   }
 2202:   foreach my $name (@remove_if_blank) {
 2203:     my @nodes = $root->getElementsByTagName($name);
 2204:     while (scalar(@nodes) > 0) {
 2205:       my $node = pop(@nodes);
 2206:       if (defined $node->firstChild && !defined $node->firstChild->nextSibling && $node->firstChild->nodeType == XML_TEXT_NODE) {
 2207:         # NOTE: careful, with UTF-8, \s matches non-breaking spaces and we want to preserve these
 2208:         if ($node->firstChild->nodeValue =~ /^[\t\n\f\r ]*$/) {
 2209:           my $parent = $node->parentNode;
 2210:           replace_by_children($node);
 2211:           $parent->normalize();
 2212:           # now that we removed the node, check if the parent has become a style with only whitespace, and so on
 2213:           while (defined $parent && string_in_array(\@remove_if_blank, $parent->nodeName) &&
 2214:               (!defined $parent->firstChild ||
 2215:               (!defined $parent->firstChild->nextSibling && $parent->firstChild->nodeType == XML_TEXT_NODE &&
 2216:               $parent->firstChild->nodeValue =~ /^^[\t\n\f\r ]*$/))) {
 2217:             my $grandparent = $parent->parentNode;
 2218:             replace_by_children($parent);
 2219:             remove_reference_from_array(\@nodes, $parent);
 2220:             $parent = $grandparent;
 2221:           }
 2222:         }
 2223:       }
 2224:     }
 2225:   }
 2226: }
 2227: 
 2228: # remove whitespace inside LON-CAPA elements that have an empty content-model (HTML ones are handled by html_to_xml)
 2229: sub fix_empty_lc_elements {
 2230:   my ($node) = @_;
 2231:   my @lcempty = ('arc','axis','backgroundplot','drawoptionlist','drawvectorsum','fill','functionplotrule','functionplotvectorrule','functionplotvectorsumrule','hiddenline','hiddensubmission','key','line','location','organicstructure','parameter','plotobject','plotvector','responseparam','spline','textline');
 2232:   if (string_in_array(\@lcempty, $node->nodeName)) {
 2233:     if (defined $node->firstChild && !defined $node->firstChild->nextSibling &&
 2234:         $node->firstChild->nodeType == XML_TEXT_NODE && $node->firstChild->nodeValue =~ /^\s*$/) {
 2235:       $node->removeChild($node->firstChild);
 2236:     }
 2237:     if (defined $node->firstChild) {
 2238:       if ($warnings) {
 2239:         print "Warning: a ".$node->nodeName." has something inside\n";
 2240:       }
 2241:     }
 2242:     return;
 2243:   }
 2244:   for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
 2245:     if ($child->nodeType == XML_ELEMENT_NODE) {
 2246:       fix_empty_lc_elements($child);
 2247:     }
 2248:   }
 2249: }
 2250: 
 2251: # turn some attribute values into lowercase when they should be
 2252: sub lowercase_attribute_values {
 2253:   my ($root) = @_;
 2254:   my @with_yesno = (['radiobuttonresponse', ['randomize']],
 2255:                     ['optionresponse', ['randomize']],
 2256:                     ['matchresponse', ['randomize']],
 2257:                     ['itemgroup', ['randomize']],
 2258:                     ['rankresponse', ['randomize']],
 2259:                     ['functionplotresponse', ['xaxisvisible', 'yaxisvisible', 'gridvisible']],
 2260:                     ['backgroundplot', ['fixed']],
 2261:                     ['drawvectorsum', ['showvalue']],
 2262:                     ['textline', ['readonly']],
 2263:                     ['hint', ['showoncorrect']],
 2264:                     ['body', ['dir']],
 2265:                     ['img', ['encrypturl']],
 2266:                     ['form', ['method']],
 2267:                     ['input', ['type']]
 2268:                    );
 2269:   foreach my $el_attributes (@with_yesno) {
 2270:     my $el_name = $el_attributes->[0];
 2271:     my @elements = $root->getElementsByTagName($el_name);
 2272:     foreach my $element (@elements) {
 2273:       my $att_list = $el_attributes->[1];
 2274:       foreach my $att_name (@$att_list) {
 2275:         my $att_value = $element->getAttribute($att_name);
 2276:         if (!defined $att_value) {
 2277:           next;
 2278:         }
 2279:         if ($att_value eq 'yes' || $att_value eq 'no') {
 2280:           next;
 2281:         }
 2282:         if ($att_value =~ /\s*yes\s*/i) {
 2283:           $element->setAttribute($att_name, 'yes');
 2284:         } elsif ($att_value =~ /\s*no\s*/i) {
 2285:           $element->setAttribute($att_name, 'no');
 2286:         }
 2287:       }
 2288:     }
 2289:   }
 2290: }
 2291: 
 2292: # fixes spelling mistakes for numericalresponse/@unit
 2293: sub replace_numericalresponse_unit_attribute {
 2294:   my ($root) = @_;
 2295:   my @numericalresponses = $root->getElementsByTagName('numericalresponse');
 2296:   foreach my $numericalresponse (@numericalresponses) {
 2297:     if (defined $numericalresponse->getAttribute('units') && !defined $numericalresponse->getAttribute('unit')) {
 2298:       $numericalresponse->setAttribute('unit', $numericalresponse->getAttribute('units'));
 2299:       $numericalresponse->removeAttribute('units');
 2300:     }
 2301:   }
 2302:   
 2303: }
 2304: 
 2305: # Replaces &format and &prettyprint by <num> whenever possible.
 2306: # Also replaces &chemparse by <chem>.
 2307: # If the function call is enclosed in <display>, the <display> element is removed.
 2308: sub replace_functions_by_elements {
 2309:   my ($root) = @_;
 2310:   my $doc = $root->ownerDocument;
 2311:   my @preserve = ('script','answer','parse','m','tm','dtm','numericalhintscript'); # display is handled later
 2312:   my @all = $root->getElementsByTagName('*');
 2313:   foreach my $element (@all) {
 2314:     if (string_in_array(\@preserve, $element->nodeName)) {
 2315:       next;
 2316:     }
 2317:     my $changed = 0;
 2318:     my $next;
 2319:     for (my $child=$element->firstChild; defined $child; $child=$next) {
 2320:       $next = $child->nextSibling;
 2321:       if ($child->nodeType == XML_TEXT_NODE) {
 2322:         my $value = $child->nodeValue;
 2323:         if ($value =~ /^(.*)&(?:format|prettyprint)\((\$\{?[a-zA-Z0-9]*\}?(?:\[[^\]]*\])?|[0-9.]+)\s?,\s?(["'][,.\$]?[0-9][eEfFgGsS]["']|\$[a-zA-Z0-9]*)\)(.*)$/s) {
 2324:           # NOTE: we don't check for &prettyprint's 3rd argument (target), but it has not been seen outside of script elements.
 2325:           # NOTE: the format options ',' and '$' are not supported by &format in current LON-CAPA since rev 1.81 of default_homework.lcpm,
 2326:           #       but are supported by &prettyprint;
 2327:           #       if we use (like current LON-CAPA) &prettyprint for <num> implementation, it will change a few resulting documents
 2328:           #       (by making them display something they were probably intended to display, but which did not).
 2329:           #       Usage of <num> with &prettyprint instead of &format might also change the display when there is an exponent.
 2330:           my $before = $1;
 2331:           my $number = $2;
 2332:           my $format = $3;
 2333:           my $after = $4;
 2334:           $format =~ s/^['"]|['"]$//g;
 2335:           # do not change this if the parent is <display> and there are other things before or after &format
 2336:           if ($element->nodeName eq 'display' && (defined $child->previousSibling || defined $next ||
 2337:               $before !~ /^\s*$/ || $after !~ /^\s*$/)) {
 2338:             last;
 2339:           }
 2340:           my $replacement = $doc->createDocumentFragment();
 2341:           my $num = $doc->createElement('num');
 2342:           $num->setAttribute('format', $format);
 2343:           $num->appendChild($doc->createTextNode($number));
 2344:           if (length($before) > 0) {
 2345:             $replacement->appendChild($doc->createTextNode($before));
 2346:           }
 2347:           $replacement->appendChild($num);
 2348:           if (length($after) > 0) {
 2349:             $replacement->appendChild($doc->createTextNode($after));
 2350:           }
 2351:           $element->replaceChild($replacement, $child);
 2352:           $changed = 1;
 2353:           $next = $element->firstChild; # start over, there might be another &format in the same text node
 2354:         } elsif ($value =~ /^(.*)&chemparse\(([^'"()]*|'[^']*'|"[^"]*")\)(.*)$/s) {
 2355:           my $before = $1;
 2356:           my $reaction = $2;
 2357:           my $after = $3;
 2358:           $reaction =~ s/^'(.*)'$/$1/;
 2359:           $reaction =~ s/^"(.*)"$/$1/;
 2360:           if ($element->nodeName eq 'display' && (defined $child->previousSibling || defined $next ||
 2361:               $before !~ /^\s*$/ || $after !~ /^\s*$/)) {
 2362:             last;
 2363:           }
 2364:           my $replacement = $doc->createDocumentFragment();
 2365:           my $chem = $doc->createElement('chem');
 2366:           $chem->appendChild($doc->createTextNode($reaction));
 2367:           if (length($before) > 0) {
 2368:             $replacement->appendChild($doc->createTextNode($before));
 2369:           }
 2370:           $replacement->appendChild($chem);
 2371:           if (length($after) > 0) {
 2372:             $replacement->appendChild($doc->createTextNode($after));
 2373:           }
 2374:           $element->replaceChild($replacement, $child);
 2375:           $changed = 1;
 2376:           $next = $element->firstChild;
 2377:         }
 2378:       }
 2379:     }
 2380:     if ($changed && $element->nodeName eq 'display') {
 2381:       my $first = $element->firstChild;
 2382:       if ($first->nodeType == XML_ELEMENT_NODE && string_in_array(['num','chem'], $first->nodeName) &&
 2383:           !defined $first->nextSibling) {
 2384:         # remove useless display element
 2385:         replace_by_children($element);
 2386:       }
 2387:     }
 2388:   }
 2389: }
 2390: 
 2391: # pretty-print using im-memory DOM tree
 2392: sub pretty {
 2393:   my ($node, $all_block, $indent_level) = @_;
 2394:   my $doc = $node->ownerDocument;
 2395:   $indent_level ||= 0;
 2396:   my $type = $node->nodeType;
 2397:   if ($type == XML_ELEMENT_NODE) {
 2398:     my $name = $node->nodeName;
 2399:     if (string_in_array(\@preserve_elements, $name)) {
 2400:       # remove newlines at the beginning and the end of preserve elements
 2401:       if (defined $node->firstChild && ($node->firstChild->nodeType == XML_TEXT_NODE ||
 2402:           $node->firstChild->nodeType == XML_CDATA_SECTION_NODE)) {
 2403:         my $text = $node->firstChild->nodeValue;
 2404:         $text =~ s/^\n+//;
 2405:         if ($text eq '') {
 2406:           $node->removeChild($node->firstChild);
 2407:         } else {
 2408:           $node->firstChild->setData($text);
 2409:         }
 2410:       }
 2411:       if (defined $node->lastChild && ($node->lastChild->nodeType == XML_TEXT_NODE ||
 2412:           $node->lastChild->nodeType == XML_CDATA_SECTION_NODE)) {
 2413:         my $text = $node->lastChild->nodeValue;
 2414:         $text =~ s/\n+$//;
 2415:         if ($text eq '') {
 2416:           $node->removeChild($node->lastChild);
 2417:         } else {
 2418:           $node->lastChild->setData($text);
 2419:         }
 2420:       }
 2421:     } elsif (string_in_array($all_block, $name) || string_in_array(\@inline_like_block, $name)) {
 2422:       # make sure there is a newline at the beginning and at the end if there is anything inside
 2423:       if (defined $node->firstChild && !string_in_array(\@no_newline_inside, $name)) {
 2424:         my $first = $node->firstChild;
 2425:         if ($first->nodeType == XML_TEXT_NODE) {
 2426:           my $text = $first->nodeValue;
 2427:           if ($text !~ /^ *\n/) {
 2428:             $first->setData("\n" . $text);
 2429:           }
 2430:         } else {
 2431:           $node->insertBefore($doc->createTextNode("\n"), $first);
 2432:         }
 2433:         my $last = $node->lastChild;
 2434:         if ($last->nodeType == XML_TEXT_NODE) {
 2435:           my $text = $last->nodeValue;
 2436:           if ($text !~ /\n *$/) {
 2437:             $last->setData($text . "\n");
 2438:           }
 2439:         } else {
 2440:           $node->appendChild($doc->createTextNode("\n"));
 2441:         }
 2442:       }
 2443:       
 2444:       # indent and make sure there is a newline before and after a block element
 2445:       my $newline_indent = "\n".(' ' x (2*($indent_level + 1)));
 2446:       my $newline_indent_last = "\n".(' ' x (2*$indent_level));
 2447:       my $next;
 2448:       for (my $child=$node->firstChild; defined $child; $child=$next) {
 2449:         $next = $child->nextSibling;
 2450:         if ($child->nodeType == XML_ELEMENT_NODE) {
 2451:           if (string_in_array($all_block, $child->nodeName) || string_in_array(\@inline_like_block, $child->nodeName)) {
 2452:             # make sure there is a newline before and after a block element
 2453:             if (defined $child->previousSibling && $child->previousSibling->nodeType == XML_TEXT_NODE) {
 2454:               my $prev = $child->previousSibling;
 2455:               my $text = $prev->nodeValue;
 2456:               if ($text !~ /\n *$/) {
 2457:                 $prev->setData($text . $newline_indent);
 2458:               }
 2459:             } else {
 2460:               $node->insertBefore($doc->createTextNode($newline_indent), $child);
 2461:             }
 2462:             if (defined $next && $next->nodeType == XML_TEXT_NODE) {
 2463:               my $text = $next->nodeValue;
 2464:               if ($text !~ /^ *\n/) {
 2465:                 $next->setData($newline_indent . $text);
 2466:               }
 2467:             } else {
 2468:               $node->insertAfter($doc->createTextNode($newline_indent), $child);
 2469:             }
 2470:           }
 2471:           pretty($child, $all_block, $indent_level+1);
 2472:         } elsif ($child->nodeType == XML_TEXT_NODE) {
 2473:           my $text = $child->nodeValue;
 2474:           # collapse newlines
 2475:           $text =~ s/\n([\t ]*\n)+/\n/g;
 2476:           # indent and remove spaces and tabs before newlines
 2477:           if (defined $next) {
 2478:             $text =~ s/[\t ]*\n[\t ]*/$newline_indent/ge;
 2479:           } else {
 2480:             $text =~ s/[\t ]*\n[\t ]*/$newline_indent/ge;
 2481:             $text =~ s/[\t ]*\n[\t ]*$/$newline_indent_last/e;
 2482:           }
 2483:           $child->setData($text);
 2484:         }
 2485:       }
 2486:       
 2487:       # removes whitespace at the beginning and end of p td, th and li (except for nbsp at the beginning)
 2488:       my @to_trim = ('p','td','th','li');
 2489:       if (string_in_array(\@to_trim, $name) && defined $node->firstChild && $node->firstChild->nodeType == XML_TEXT_NODE) {
 2490:         my $text = $node->firstChild->nodeValue;
 2491:         $text =~ s/^[ \t\f\n\r]*//;
 2492:         if ($text eq '') {
 2493:           $node->removeChild($node->firstChild);
 2494:         } else {
 2495:           $node->firstChild->setData($text);
 2496:         }
 2497:       }
 2498:       if (string_in_array(\@to_trim, $name) && defined $node->lastChild && $node->lastChild->nodeType == XML_TEXT_NODE) {
 2499:         my $text = $node->lastChild->nodeValue;
 2500:         $text =~ s/\s*$//;
 2501:         if ($text eq '') {
 2502:           $node->removeChild($node->lastChild);
 2503:         } else {
 2504:           $node->lastChild->setData($text);
 2505:         }
 2506:       }
 2507:     }
 2508:   }
 2509: }
 2510: 
 2511: sub replace_tm_dtm {
 2512:   my ($root) = @_;
 2513:   my $doc = $root->ownerDocument;
 2514:   my @elements = $root->getElementsByTagName('tm');
 2515:   push(@elements, $root->getElementsByTagName('dtm'));
 2516:   foreach my $element (@elements) {
 2517:     my $first = $element->firstChild;
 2518:     if (defined $first && $first->nodeType == XML_TEXT_NODE) {
 2519:       my $text = $first->nodeValue;
 2520:       if ($element->nodeName eq 'tm') {
 2521:         $first->setData('$'.$text.'$');
 2522:       } else {
 2523:         $first->setData('$$'.$text.'$$');
 2524:       }
 2525:     }
 2526:     $element->setNodeName('m');
 2527:   }
 2528: }
 2529: 
 2530: 
 2531: ######## utilities ########
 2532: 
 2533: ##
 2534: # Trims a string (really, this should be built-in in Perl, this is ridiculous, ugly and slow)
 2535: # @param {string} s - the string to trim
 2536: # @returns the trimmed string
 2537: ##
 2538: sub trim {
 2539:   my ($s) = @_;
 2540:   $s =~ s/^\s+//;
 2541:   $s =~ s/\s+$//;
 2542:   return($s);
 2543: }
 2544: 
 2545: ##
 2546: # Tests if a string is in an array (using eq) (to avoid Smartmatch warnings with $value ~~ @array)
 2547: # @param {Array<string>} array - reference to the array of strings
 2548: # @param {string} value - the string to look for
 2549: # @returns 1 if found, 0 otherwise
 2550: ##
 2551: sub string_in_array {
 2552:   my ($array, $value) = @_;
 2553:   # NOTE: would it be faster to use "any { $value eq $_ } @{$array}" from List::Util ?
 2554:   foreach my $v (@{$array}) {
 2555:     if ($v eq $value) {
 2556:       return 1;
 2557:     }
 2558:   }
 2559:   return 0;
 2560: }
 2561: 
 2562: ##
 2563: # Tests if an object is in an array (using ==)
 2564: # @param {Array<Object>} array - reference to the array of references
 2565: # @param {Object} ref - the reference to look for
 2566: # @returns 1 if found, 0 otherwise
 2567: ##
 2568: sub reference_in_array {
 2569:   my ($array, $ref) = @_;
 2570:   foreach my $v (@{$array}) {
 2571:     if ($v == $ref) {
 2572:       return 1;
 2573:     }
 2574:   }
 2575:   return 0;
 2576: }
 2577: 
 2578: ##
 2579: # returns the index of a string in an array
 2580: # @param {Array<Object>} array - reference to the array of strings
 2581: # @param {string} s - the string to look for (using eq)
 2582: # @returns the index if found, -1 otherwise
 2583: ##
 2584: sub index_of_string {
 2585:   my ($array, $s) = @_;
 2586:   for (my $i=0; $i<scalar(@{$array}); $i++) {
 2587:     if ($array->[$i] eq $s) {
 2588:       return $i;
 2589:     }
 2590:   }
 2591:   return -1;
 2592: }
 2593: 
 2594: ##
 2595: # returns the index of a reference in an array
 2596: # @param {Array<Object>} array - reference to the array of references
 2597: # @param {Object} ref - the reference to look for
 2598: # @returns the index if found, -1 otherwise
 2599: ##
 2600: sub index_of_reference {
 2601:   my ($array, $ref) = @_;
 2602:   for (my $i=0; $i<scalar(@{$array}); $i++) {
 2603:     if ($array->[$i] == $ref) {
 2604:       return $i;
 2605:     }
 2606:   }
 2607:   return -1;
 2608: }
 2609: 
 2610: ##
 2611: # if found, removes a string from an array, otherwise do nothing
 2612: # @param {Array<string>} array - reference to the array of string
 2613: # @param {string} s - the string to look for (using eq)
 2614: ##
 2615: sub remove_string_from_array {
 2616:   my ($array, $s) = @_;
 2617:   my $index = index_of_string($array, $s);
 2618:   if ($index != -1) {
 2619:     splice(@$array, $index, 1);
 2620:   }
 2621: }
 2622: 
 2623: ##
 2624: # if found, removes a reference from an array, otherwise do nothing
 2625: # @param {Array<Object>} array - reference to the array of references
 2626: # @param {Object} ref - the reference to look for
 2627: ##
 2628: sub remove_reference_from_array {
 2629:   my ($array, $ref) = @_;
 2630:   my $index = index_of_reference($array, $ref);
 2631:   if ($index != -1) {
 2632:     splice(@$array, $index, 1);
 2633:   }
 2634: }
 2635: 
 2636: ##
 2637: # replaces a node by its children
 2638: # @param {Node} node - the DOM node
 2639: ##
 2640: sub replace_by_children {
 2641:   my ($node) = @_;
 2642:   my $parent = $node->parentNode;
 2643:   my $next;
 2644:   my $previous;
 2645:   for (my $child=$node->firstChild; defined $child; $child=$next) {
 2646:     $next = $child->nextSibling;
 2647:     if ((!defined $previous || !defined $next) &&
 2648:         $child->nodeType == XML_TEXT_NODE && $child->nodeValue =~ /^\s*$/) {
 2649:       next; # do not keep first and last whitespace nodes
 2650:     } else {
 2651:       if (!defined $previous && $child->nodeType == XML_TEXT_NODE) {
 2652:         # remove whitespace at the beginning
 2653:         my $value = $child->nodeValue;
 2654:         $value =~ s/^\s+//;
 2655:         $child->setData($value);
 2656:       }
 2657:       if (!defined $next && $child->nodeType == XML_TEXT_NODE) {
 2658:         # and at the end
 2659:         my $value = $child->nodeValue;
 2660:         $value =~ s/\s+$//;
 2661:         $child->setData($value);
 2662:       }
 2663:     }
 2664:     $node->removeChild($child);
 2665:     $parent->insertBefore($child, $node);
 2666:     $previous = $child;
 2667:   }
 2668:   $parent->removeChild($node);
 2669: }
 2670: 
 2671: ##
 2672: # returns the trimmed attribute value if the attribute exists and is not blank, undef otherwise
 2673: # @param {Node} node - the DOM node
 2674: # @param {string} attribute_name - the attribute name
 2675: ##
 2676: sub get_non_empty_attribute {
 2677:   my ($node, $attribute_name) = @_;
 2678:   my $value = $node->getAttribute($attribute_name);
 2679:   if (defined $value && $value !~ /^\s*$/) {
 2680:     $value = trim($value);
 2681:     return($value);
 2682:   }
 2683:   return(undef);
 2684: }
 2685: 
 2686: ##
 2687: # Returns a CSS property value from the style attribute of the element, or undef if not defined
 2688: # @param {Element} el - the DOM element
 2689: # @param {string} property_name - the CSS property name
 2690: ##
 2691: sub get_css_property {
 2692:   my ($el, $property_name) = @_;
 2693:   my $style = $el->getAttribute('style');
 2694:   if (defined $style) {
 2695:     $style =~ s/^\s*;\s*//;
 2696:     $style =~ s/\s*;\s*$//;
 2697:   } else {
 2698:     $style = '';
 2699:   }
 2700:   my @pairs = split(';', $style);
 2701:   foreach my $pair (@pairs) {
 2702:     my @name_value = split(':', $pair);
 2703:     if (scalar(@name_value) != 2) {
 2704:       next;
 2705:     }
 2706:     my $name = trim($name_value[0]);
 2707:     my $value = trim($name_value[1]);
 2708:     if (lc($name) eq $property_name) {
 2709:       return($value); # return the first one found
 2710:     }
 2711:   }
 2712:   return(undef);
 2713: }
 2714: 
 2715: ##
 2716: # Returns the reference to a hash CSS property name => value from the style attribute of the element.
 2717: # Returns an empty list if the style attribute is not defined,
 2718: # @param {Element} el - the DOM element
 2719: # @return {Hash<string, string>} reference to the hash  property name => property value
 2720: ##
 2721: sub get_css_properties {
 2722:   my ($el) = @_;
 2723:   my $style = $el->getAttribute('style');
 2724:   if (defined $style) {
 2725:     $style =~ s/^\s*;\s*//;
 2726:     $style =~ s/\s*;\s*$//;
 2727:   } else {
 2728:     $style = '';
 2729:   }
 2730:   my @pairs = split(';', $style);
 2731:   tie (my %hash, 'Tie::IxHash', ());
 2732:   foreach my $pair (@pairs) {
 2733:     my @name_value = split(':', $pair);
 2734:     if (scalar(@name_value) != 2) {
 2735:       next;
 2736:     }
 2737:     my $name = trim($name_value[0]);
 2738:     my $value = trim($name_value[1]);
 2739:     if (defined $hash{$name}) {
 2740:       # duplicate property in the style attribute: keep only the last one
 2741:       delete $hash{$name};
 2742:     }
 2743:     $hash{$name} = $value;
 2744:   }
 2745:   return(\%hash);
 2746: }
 2747: 
 2748: ##
 2749: # Sets a CSS property in the style attribute of an element
 2750: # @param {Element} el - the DOM element
 2751: # @param {string} property_name - the CSS property name
 2752: # @param {string} property_value - the CSS property value
 2753: ##
 2754: sub set_css_property {
 2755:   my ($el, $property_name, $property_value) = @_;
 2756:   my $hash_ref = { $property_name => $property_value };
 2757:   set_css_properties($el, $hash_ref);
 2758: }
 2759: 
 2760: ##
 2761: # Sets several CSS properties in the style attribute of an element
 2762: # @param {Element} el - the DOM element
 2763: # @param {Hash<string, string>} properties - reference to the hash property name => property value
 2764: ##
 2765: sub set_css_properties {
 2766:   my ($el, $properties) = @_;
 2767:   my $hash = get_css_properties($el);
 2768:   foreach my $property_name (keys %$properties) {
 2769:     my $property_value = $properties->{$property_name};
 2770:     if (defined $hash->{$property_name}) {
 2771:       delete $hash->{$property_name}; # to add the new one at the end
 2772:     }
 2773:     $hash->{$property_name} = $property_value;
 2774:   }
 2775:   my $style = '';
 2776:   foreach my $key (keys %$hash) {
 2777:     $style .= $key.':'.$hash->{$key}.'; ';
 2778:   }
 2779:   $style =~ s/; $//;
 2780:   $el->setAttribute('style', $style);
 2781: }
 2782: 
 2783: 1;
 2784: __END__

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>