Annotation of loncom/homework/cleanxml/xml_to_loncapa.pm, revision 1.6

1.1       damieng     1: # The LearningOnline Network
                      2: # convert_file takes a well-formed XML file content and converts it to LON-CAPA syntax.
                      3: #
1.6     ! damieng     4: # $Id: xml_to_loncapa.pm,v 1.5 2016/01/06 16:44:32 damieng Exp $
1.1       damieng     5: #
                      6: # Copyright Michigan State University Board of Trustees
                      7: #
                      8: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
                      9: #
                     10: # LON-CAPA is free software; you can redistribute it and/or modify
                     11: # it under the terms of the GNU General Public License as published by
                     12: # the Free Software Foundation; either version 2 of the License, or
                     13: # (at your option) any later version.
                     14: #
                     15: # LON-CAPA is distributed in the hope that it will be useful,
                     16: # but WITHOUT ANY WARRANTY; without even the implied warranty of
                     17: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     18: # GNU General Public License for more details.
                     19: #
                     20: # You should have received a copy of the GNU General Public License
                     21: # along with LON-CAPA; if not, write to the Free Software
                     22: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
                     23: #
                     24: # /home/httpd/html/adm/gpl.txt
                     25: #
                     26: # http://www.lon-capa.org/
                     27: #
                     28: ###
                     29: 
                     30: #!/usr/bin/perl
                     31: 
                     32: package Apache::xml_to_loncapa;
                     33: 
                     34: use strict;
                     35: use utf8;
                     36: use warnings;
                     37: 
                     38: use XML::LibXML;
                     39: 
                     40: 
                     41: my @loncapa_block = ('parameter','location','answer','foil','image','polygon','rectangle','text','conceptgroup','itemgroup','item','label','data','function','array','unit','answergroup','functionplotresponse','functionplotruleset','functionplotelements','functionplotcustomrule','essayresponse','hintpart','formulahint','numericalhint','reactionhint','organichint','optionhint','radiobuttonhint','stringhint','customhint','mathhint','formulahintcondition','numericalhintcondition','reactionhintcondition','organichintcondition','optionhintcondition','radiobuttonhintcondition','stringhintcondition','customhintcondition','mathhintcondition','imageresponse','foilgroup','datasubmission','textfield','hiddensubmission','radiobuttonresponse','rankresponse','matchresponse','import','style','script','window','block','library','notsolved','part','postanswerdate','preduedate','problem','problemtype','randomlabel','bgimg','labelgroup','randomlist','solved','while','tex','print','web','gnuplot','curve','Task','IntroParagraph','ClosingParagraph','Question','QuestionText','Setup','Instance','InstanceText','Criteria','CriteriaText','GraderNote','languageblock','translated','lang','instructorcomment','dataresponse','togglebox','standalone','comment','drawimage','allow','displayduedate','displaytitle','responseparam','organicstructure','scriptlib','parserlib','drawoptionlist','spline','backgroundplot','plotobject','plotvector','drawvectorsum','functionplotrule','functionplotvectorrule','functionplotvectorsumrule','axis','key','xtics','ytics','title','xlabel','ylabel','hiddenline','dtm','stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse', 'hint', 'hintgroup');
                     42: 
                     43: my @loncapa_inline = ('display','m','lm','chem','num','parse','algebra','displayweight','displaystudentphoto'); # not textline
                     44: 
                     45: # HTML elements that trigger the addition of startouttext/endouttext
                     46: my @html_trigger = ('header','footer','aside','h1','h2','h3','h4','h5','h6','li','dd','dt','tbody','tr','caption','thead','tfoot','td','th','span','a','em','strong','b','i','sup','sub','pre','code','kbd','samp','cite','q','tt','ins','del','var','small','big','br','hr','address','blockquote','img','figure','figcaption','object','param','embed','applet','video','source','audio','map','area','canvas','form','input','select','optgroup','option','textarea','fieldset','legend','button','iframe','section','div','p','ul','ol','dl','table');
                     47: 
                     48: my @simple_data = ('polygon', 'rectangle', 'vector', 'value', 'answer', 'title', 'data', 'function', 'xlabel', 'ylabel', 'tic', 'parserlib', 'scriptlib', 'import', 'tex', 'text', 'image', 'display', 'm', 'lm', 'num', 'algebra', 'chem', 'parse', 'title', 'style', 'script', 'ins', 'del', 'label', 'option', 'textarea', 'legend' );
                     49: 
                     50: my @inline_responses = ('stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse');
                     51: 
1.4       damieng    52: # see http://www.w3.org/TR/html-polyglot/#empty-elements
                     53: # and http://tiffanybbrown.com/2011/03/23/html5-does-not-allow-self-closing-tags/
                     54: # HTML elements that do not have an empty content, and must never use a self-closing tag:
                     55: my @non_empty_html = ('title','style','script','noscript','body','section','header','footer','article','aside','nav','h1','h2','h3','h4','h5','h6','div','p','li','dt','dd','caption','td','th','span','a','em','strong','b','i','sup','sub','pre','code','kbd','samp','cite','q','tt','ins','del','var','small','big','address','blockquote','bdo','ruby','rb','rp','rt','rtc','figure','figcaption','object','applet','video','audio','canvas','label','option','textarea','fieldset','legend','button','iframe');
                     56: 
1.1       damieng    57: 
                     58: # Converts a file and return the modified contents
                     59: sub convert_file {
                     60:   my ($contents) = @_;
                     61: 
                     62:   my $dom_doc = XML::LibXML->load_xml(string => $contents);
1.5       damieng    63:   my $root = $dom_doc->documentElement();
                     64:   if (defined $root && $root->nodeName ne 'html') {
                     65:     add_outtext($dom_doc);
                     66:   }
1.1       damieng    67:   return node_to_string($dom_doc);
                     68: }
                     69: 
                     70: 
                     71: sub node_to_string {
                     72:   my ($node) = @_;
                     73:   
                     74:   if ($node->nodeType == XML_DOCUMENT_NODE) {
                     75:     my $root = $node->documentElement();
                     76:     return node_to_string($root);
                     77:   } elsif ($node->nodeType == XML_TEXT_NODE || $node->nodeType == XML_CDATA_SECTION_NODE) {
                     78:     my $parent = $node->parentNode;
                     79:     my $parent_name = $parent->nodeName;
                     80:     my $grandparent_name;
                     81:     if (defined $parent->parentNode) {
                     82:       $grandparent_name = $parent->parentNode->nodeName;
                     83:     }
1.6     ! damieng    84:     my @no_escape = ('m', 'script', 'style', 'display', 'parse', 'answer');
1.1       damieng    85:     if (string_in_array(\@no_escape, $parent_name) &&
                     86:         ($parent_name ne 'answer' ||
                     87:         (defined $grandparent_name &&
                     88:         $grandparent_name ne 'numericalresponse' &&
                     89:         $grandparent_name ne 'formularesponse'))) {
                     90:       return $node->nodeValue;
                     91:     } else {
                     92:       return $node->toString();
                     93:     }
                     94:   } elsif ($node->nodeType == XML_ELEMENT_NODE) {
                     95:     my $s = '';
                     96:     my $tag = $node->nodeName;
                     97:     $s .= "<$tag";
                     98:     my @attributes = $node->attributes();
                     99:     foreach my $attribute (@attributes) {
                    100:       $s .= ' ';
                    101:       $s .= $attribute->nodeName;
                    102:       $s .= '="';
1.2       damieng   103:       $s .= escape_attribute($attribute->nodeValue);
1.1       damieng   104:       $s .= '"';
                    105:     }
1.4       damieng   106:     if ($node->hasChildNodes() || string_in_array(\@non_empty_html, $tag)) {
1.1       damieng   107:       $s .= '>';
                    108:       foreach my $child ($node->childNodes) {
                    109:         $s .= node_to_string($child);
                    110:       }
                    111:       $s .= "</$tag>";
                    112:     } else {
                    113:       $s .= '/>';
                    114:     }
                    115:     return $s;
                    116:   } else {
                    117:     return $node->toString();
                    118:   }
                    119: }
                    120: 
1.2       damieng   121: # Escapes an attribute value
                    122: sub escape_attribute {
1.1       damieng   123:   my ($s) = @_;
1.2       damieng   124:   # normal XML escapes do not work with LON-CAPA, for instance with reactionresponse
                    125:   #$s =~ s/&/&amp;/sg;
                    126:   #$s =~ s/</&lt;/sg;
                    127:   #$s =~ s/>/&gt;/sg;
                    128:   $s =~ s/"/&quot;/sg;
1.1       damieng   129:   return $s;
                    130: }
                    131: 
                    132: # Adds startouttext and endouttext where useful for the colorful editor
                    133: sub add_outtext {
                    134:   my ($node) = @_;
                    135:   
                    136:   if ($node->nodeType == XML_DOCUMENT_NODE) {
                    137:     my $root = $node->documentElement();
                    138:     add_outtext($root);
                    139:     return;
                    140:   }
                    141:   if ($node->nodeType != XML_ELEMENT_NODE) {
                    142:     return;
                    143:   }
                    144:   if (string_in_array(\@simple_data, $node->nodeName)) {
                    145:     return;
                    146:   }
                    147:   convert_paragraphs($node);
                    148:   my $next;
                    149:   my $in_outtext = 0;
                    150:   for (my $child=$node->firstChild; defined $child; $child=$next) {
                    151:     $next = $child->nextSibling;
                    152:     if (!$in_outtext && inside_outtext($child)) {
                    153:       add_startouttext($node, $child);
                    154:       $in_outtext = 1;
                    155:     } elsif ($in_outtext && !continue_outtext($child)) {
                    156:       add_endouttext($node, $child);
                    157:       $in_outtext = 0;
                    158:     }
                    159:     if (!$in_outtext) {
                    160:       add_outtext($child);
                    161:     }
                    162:   }
                    163:   if ($in_outtext) {
                    164:     add_endouttext($node);
                    165:   }
                    166: }
                    167: 
                    168: # Returns 1 if this node should trigger the addition of startouttext before it
                    169: sub inside_outtext {
                    170:   my ($node) = @_;
                    171:   if ($node->nodeType == XML_TEXT_NODE && $node->nodeValue !~ /^\s*$/) {
                    172:     return 1;
                    173:   }
                    174:   if ($node->nodeType == XML_ELEMENT_NODE && string_in_array(\@html_trigger, $node->nodeName)) {
                    175:     if (contains_loncapa_block($node)) {
                    176:       return 0;
                    177:     } else {
                    178:       return 1;
                    179:     }
                    180:   }
                    181:   if ($node->nodeType == XML_ELEMENT_NODE && string_in_array(\@loncapa_inline, $node->nodeName)) {
                    182:     return 1;
                    183:   }
                    184:   return 0;
                    185: }
                    186: 
                    187: # Returns 1 if the outtext environment can continue with this node
                    188: sub continue_outtext {
                    189:   my ($node) = @_;
                    190:   if (inside_outtext($node)) {
                    191:     return 1;
                    192:   }
                    193:   if ($node->nodeType == XML_TEXT_NODE) {
                    194:     return 1; # continue even if this is just spaces
                    195:   }
                    196:   return 0;
                    197: }
                    198: 
                    199: # Returns 1 if the node contains a LON-CAPA block in a descendant.
                    200: sub contains_loncapa_block {
                    201:   my ($node) = @_;
                    202:   foreach my $child ($node->childNodes) {
                    203:     if ($child->nodeType == XML_ELEMENT_NODE) {
                    204:       if (string_in_array(\@loncapa_block, $child->nodeName)) {
                    205:         return 1;
                    206:       }
                    207:       if (contains_loncapa_block($child)) {
                    208:         return 1;
                    209:       }
                    210:     }
                    211:   }
                    212:   return 0;
                    213: }
                    214: 
                    215: sub add_startouttext {
                    216:   my ($parent, $before_node) = @_;
                    217:   my $doc = $parent->ownerDocument;
                    218:   if ($before_node->nodeType == XML_TEXT_NODE) {
                    219:     # split space at the beginning of the node
                    220:     if ($before_node->nodeValue =~ /^(\s+)(.*?)$/s) {
                    221:       my $space_node = $doc->createTextNode($1);
                    222:       $before_node->setData($2);
                    223:       $parent->insertBefore($space_node, $before_node);
                    224:     }
                    225:   }
                    226:   my $startouttext = $doc->createElement('startouttext');
                    227:   $parent->insertBefore($startouttext, $before_node);
                    228: }
                    229: 
                    230: sub add_endouttext {
                    231:   my ($parent, $before_node) = @_;
                    232:   my $doc = $parent->ownerDocument;
                    233:   my $endouttext = $doc->createElement('endouttext');
                    234:   my $before_before;
                    235:   if (defined $before_node) {
                    236:     $before_before = $before_node->previousSibling;
                    237:   } else {
                    238:     $before_before = $parent->lastChild;
                    239:   }
                    240:   if (defined $before_before && $before_before->nodeType == XML_TEXT_NODE) {
                    241:     # split space at the end of the node
                    242:     if ($before_before->nodeValue =~ /^(.*?)(\s+)$/s) {
                    243:       $before_before->setData($1);
                    244:       my $space_node = $doc->createTextNode($2);
                    245:       if (defined $before_node) {
                    246:         $parent->insertBefore($space_node, $before_node);
                    247:       } else {
                    248:         $parent->appendChild($space_node);
                    249:       }
                    250:       $before_node = $space_node;
                    251:     }
                    252:   }
                    253:   if (defined $before_node) {
                    254:     $parent->insertBefore($endouttext, $before_node);
                    255:   } else {
                    256:     $parent->appendChild($endouttext);
                    257:   }
                    258: }
                    259: 
                    260: # Convert paragraph children when one contains an inline response into content + <br>
                    261: # (the colorful editor does not support paragraphs containing inline responses)
                    262: sub convert_paragraphs {
                    263:   my ($parent) = @_;
                    264:   my $p_child_with_inline_response = 0;
                    265:   foreach my $child ($parent->childNodes) {
                    266:     if ($child->nodeType == XML_ELEMENT_NODE && $child->nodeName eq 'p') {
                    267:       foreach my $child2 ($child->childNodes) {
                    268:         if ($child2->nodeType == XML_ELEMENT_NODE) {
                    269:           if (string_in_array(\@inline_responses, $child2->nodeName)) {
                    270:             $p_child_with_inline_response = 1;
                    271:             last;
                    272:           }
                    273:         }
                    274:       }
                    275:     }
                    276:     if ($p_child_with_inline_response) {
                    277:       last;
                    278:     }
                    279:   }
                    280:   if ($p_child_with_inline_response) {
                    281:     my $doc = $parent->ownerDocument;
                    282:     my $next;
                    283:     for (my $child=$parent->firstChild; defined $child; $child=$next) {
                    284:       $next = $child->nextSibling;
                    285:       if ($child->nodeType == XML_ELEMENT_NODE && $child->nodeName eq 'p') {
                    286:         replace_by_children($child);
                    287:         if (defined $next && (defined $next->nextSibling || $next->nodeType != XML_TEXT_NODE ||
                    288:             $next->nodeValue !~ /^\s*$/)) {
                    289:           # we only add a br if there is something after
                    290:           my $br = $doc->createElement('br');
                    291:           $parent->insertBefore($br, $next);
                    292:         }
                    293:       }
                    294:     }
                    295:   }
                    296: }
                    297: 
                    298: ##
                    299: # Tests if a string is in an array (using eq) (to avoid Smartmatch warnings with $value ~~ @array)
                    300: # @param {Array<string>} array - reference to the array of strings
                    301: # @param {string} value - the string to look for
                    302: # @returns 1 if found, 0 otherwise
                    303: ##
                    304: sub string_in_array {
                    305:   my ($array, $value) = @_;
                    306:   foreach my $v (@{$array}) {
                    307:     if ($v eq $value) {
                    308:       return 1;
                    309:     }
                    310:   }
                    311:   return 0;
                    312: }
                    313: 
                    314: ##
                    315: # replaces a node by its children
                    316: # @param {Node} node - the DOM node
                    317: ##
                    318: sub replace_by_children {
                    319:   my ($node) = @_;
                    320:   my $parent = $node->parentNode;
                    321:   my $next;
                    322:   my $previous;
                    323:   for (my $child=$node->firstChild; defined $child; $child=$next) {
                    324:     $next = $child->nextSibling;
                    325:     if ((!defined $previous || !defined $next) &&
                    326:         $child->nodeType == XML_TEXT_NODE && $child->nodeValue =~ /^\s*$/) {
                    327:       next; # do not keep first and last whitespace nodes
                    328:     } else {
                    329:       if (!defined $previous && $child->nodeType == XML_TEXT_NODE) {
                    330:         # remove whitespace at the beginning
                    331:         my $value = $child->nodeValue;
                    332:         $value =~ s/^\s+//;
                    333:         $child->setData($value);
                    334:       }
                    335:       if (!defined $next && $child->nodeType == XML_TEXT_NODE) {
                    336:         # and at the end
                    337:         my $value = $child->nodeValue;
                    338:         $value =~ s/\s+$//;
                    339:         $child->setData($value);
                    340:       }
                    341:     }
                    342:     $node->removeChild($child);
                    343:     $parent->insertBefore($child, $node);
                    344:     $previous = $child;
                    345:   }
                    346:   $parent->removeChild($node);
                    347: }
                    348: 
                    349: 1;
                    350: __END__

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>