Annotation of loncom/homework/cleanxml/xml_to_loncapa.pm, revision 1.9

1.1       damieng     1: # The LearningOnline Network
                      2: # convert_file takes a well-formed XML file content and converts it to LON-CAPA syntax.
                      3: #
1.9     ! damieng     4: # $Id: xml_to_loncapa.pm,v 1.8 2016/01/20 21:24:22 damieng Exp $
1.1       damieng     5: #
                      6: # Copyright Michigan State University Board of Trustees
                      7: #
                      8: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
                      9: #
                     10: # LON-CAPA is free software; you can redistribute it and/or modify
                     11: # it under the terms of the GNU General Public License as published by
                     12: # the Free Software Foundation; either version 2 of the License, or
                     13: # (at your option) any later version.
                     14: #
                     15: # LON-CAPA is distributed in the hope that it will be useful,
                     16: # but WITHOUT ANY WARRANTY; without even the implied warranty of
                     17: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     18: # GNU General Public License for more details.
                     19: #
                     20: # You should have received a copy of the GNU General Public License
                     21: # along with LON-CAPA; if not, write to the Free Software
                     22: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
                     23: #
                     24: # /home/httpd/html/adm/gpl.txt
                     25: #
                     26: # http://www.lon-capa.org/
                     27: #
                     28: ###
                     29: 
                     30: #!/usr/bin/perl
                     31: 
                     32: package Apache::xml_to_loncapa;
                     33: 
                     34: use strict;
                     35: use utf8;
                     36: use warnings;
                     37: 
                     38: use XML::LibXML;
                     39: 
                     40: 
1.8       damieng    41: # LON-CAPA block elements that cannot be found within startouttext/endouttext
                     42: my @loncapa_block = ('parameter','location','answer','foil','image','polygon','rectangle','text','conceptgroup','itemgroup','item','label','data','function','array','unit','answergroup','functionplotresponse','functionplotruleset','functionplotelements','functionplotcustomrule','essayresponse','hintpart','formulahint','numericalhint','reactionhint','organichint','optionhint','radiobuttonhint','stringhint','customhint','mathhint','formulahintcondition','numericalhintcondition','reactionhintcondition','organichintcondition','optionhintcondition','radiobuttonhintcondition','stringhintcondition','customhintcondition','mathhintcondition','imageresponse','foilgroup','datasubmission','textfield','hiddensubmission','radiobuttonresponse','rankresponse','matchresponse','import','style','script','window','block','library','notsolved','part','postanswerdate','preduedate','problem','problemtype','randomlabel','bgimg','labelgroup','randomlist','solved','while','tex','print','web','gnuplot','curve','Task','IntroParagraph','ClosingParagraph','Question','QuestionText','Setup','Instance','InstanceText','Criteria','CriteriaText','GraderNote','languageblock','instructorcomment','dataresponse','togglebox','standalone','comment','drawimage','allow','displayduedate','displaytitle','responseparam','organicstructure','scriptlib','parserlib','drawoptionlist','spline','backgroundplot','plotobject','plotvector','drawvectorsum','functionplotrule','functionplotvectorrule','functionplotvectorsumrule','axis','key','xtics','ytics','title','xlabel','ylabel','hiddenline','dtm','stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse', 'hint', 'hintgroup');
1.1       damieng    43: 
1.8       damieng    44: # LON-CAPA elements that can be found within startouttext/endouttext:
                     45: my @loncapa_in_text = ('display','m','lm','chem','num','parse','algebra','displayweight','displaystudentphoto','translated','lang'); # not textline
1.1       damieng    46: 
                     47: # HTML elements that trigger the addition of startouttext/endouttext
1.9     ! damieng    48: my @html_trigger = ('header','footer','aside','h1','h2','h3','h4','h5','h6','li','dd','dt','tbody','tr','caption','thead','tfoot','td','th','span','a','em','strong','b','i','sup','sub','pre','code','kbd','samp','cite','q','tt','ins','del','var','small','big','br','hr','address','blockquote','figure','figcaption','object','param','embed','applet','video','source','audio','map','area','canvas','form','input','select','optgroup','option','textarea','fieldset','legend','button','iframe','section','div','p','ul','ol','dl','table');
1.1       damieng    49: 
1.9     ! damieng    50: my @simple_data = ('polygon', 'rectangle', 'vector', 'value', 'answer', 'title', 'data', 'function', 'xlabel', 'ylabel', 'tic', 'parserlib', 'scriptlib', 'import', 'tex', 'text', 'image', 'display', 'm', 'lm', 'num', 'algebra', 'chem', 'parse', 'title', 'style', 'script', 'ins', 'del', 'label', 'option', 'textarea', 'legend','comment');
1.1       damieng    51: 
                     52: my @inline_responses = ('stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse');
                     53: 
1.4       damieng    54: # see http://www.w3.org/TR/html-polyglot/#empty-elements
                     55: # and http://tiffanybbrown.com/2011/03/23/html5-does-not-allow-self-closing-tags/
                     56: # HTML elements that do not have an empty content, and must never use a self-closing tag:
                     57: my @non_empty_html = ('title','style','script','noscript','body','section','header','footer','article','aside','nav','h1','h2','h3','h4','h5','h6','div','p','li','dt','dd','caption','td','th','span','a','em','strong','b','i','sup','sub','pre','code','kbd','samp','cite','q','tt','ins','del','var','small','big','address','blockquote','bdo','ruby','rb','rp','rt','rtc','figure','figcaption','object','applet','video','audio','canvas','label','option','textarea','fieldset','legend','button','iframe');
                     58: 
1.1       damieng    59: 
                     60: # Converts a file and return the modified contents
                     61: sub convert_file {
                     62:   my ($contents) = @_;
                     63: 
                     64:   my $dom_doc = XML::LibXML->load_xml(string => $contents);
1.5       damieng    65:   my $root = $dom_doc->documentElement();
                     66:   if (defined $root && $root->nodeName ne 'html') {
                     67:     add_outtext($dom_doc);
                     68:   }
1.1       damieng    69:   return node_to_string($dom_doc);
                     70: }
                     71: 
                     72: 
                     73: sub node_to_string {
                     74:   my ($node) = @_;
                     75:   
                     76:   if ($node->nodeType == XML_DOCUMENT_NODE) {
                     77:     my $root = $node->documentElement();
                     78:     return node_to_string($root);
                     79:   } elsif ($node->nodeType == XML_TEXT_NODE || $node->nodeType == XML_CDATA_SECTION_NODE) {
                     80:     my $parent = $node->parentNode;
                     81:     my $parent_name = $parent->nodeName;
                     82:     my $grandparent_name;
                     83:     if (defined $parent->parentNode) {
                     84:       $grandparent_name = $parent->parentNode->nodeName;
                     85:     }
1.6       damieng    86:     my @no_escape = ('m', 'script', 'style', 'display', 'parse', 'answer');
1.1       damieng    87:     if (string_in_array(\@no_escape, $parent_name) &&
                     88:         ($parent_name ne 'answer' ||
                     89:         (defined $grandparent_name &&
                     90:         $grandparent_name ne 'numericalresponse' &&
                     91:         $grandparent_name ne 'formularesponse'))) {
                     92:       return $node->nodeValue;
                     93:     } else {
                     94:       return $node->toString();
                     95:     }
                     96:   } elsif ($node->nodeType == XML_ELEMENT_NODE) {
                     97:     my $s = '';
                     98:     my $tag = $node->nodeName;
                     99:     $s .= "<$tag";
                    100:     my @attributes = $node->attributes();
                    101:     foreach my $attribute (@attributes) {
                    102:       $s .= ' ';
                    103:       $s .= $attribute->nodeName;
                    104:       $s .= '="';
1.2       damieng   105:       $s .= escape_attribute($attribute->nodeValue);
1.1       damieng   106:       $s .= '"';
                    107:     }
1.4       damieng   108:     if ($node->hasChildNodes() || string_in_array(\@non_empty_html, $tag)) {
1.1       damieng   109:       $s .= '>';
                    110:       foreach my $child ($node->childNodes) {
                    111:         $s .= node_to_string($child);
                    112:       }
                    113:       $s .= "</$tag>";
                    114:     } else {
                    115:       $s .= '/>';
                    116:     }
                    117:     return $s;
                    118:   } else {
                    119:     return $node->toString();
                    120:   }
                    121: }
                    122: 
1.2       damieng   123: # Escapes an attribute value
                    124: sub escape_attribute {
1.1       damieng   125:   my ($s) = @_;
1.2       damieng   126:   # normal XML escapes do not work with LON-CAPA, for instance with reactionresponse
                    127:   #$s =~ s/&/&amp;/sg;
                    128:   #$s =~ s/</&lt;/sg;
                    129:   #$s =~ s/>/&gt;/sg;
                    130:   $s =~ s/"/&quot;/sg;
1.1       damieng   131:   return $s;
                    132: }
                    133: 
                    134: # Adds startouttext and endouttext where useful for the colorful editor
                    135: sub add_outtext {
                    136:   my ($node) = @_;
                    137:   
                    138:   if ($node->nodeType == XML_DOCUMENT_NODE) {
                    139:     my $root = $node->documentElement();
                    140:     add_outtext($root);
                    141:     return;
                    142:   }
                    143:   if ($node->nodeType != XML_ELEMENT_NODE) {
                    144:     return;
                    145:   }
                    146:   if (string_in_array(\@simple_data, $node->nodeName)) {
                    147:     return;
                    148:   }
                    149:   convert_paragraphs($node);
1.8       damieng   150:   if ($node->nodeName eq 'hintgroup' && !defined $node->firstChild) {
                    151:     # empty hintgroup: colorful editor needs start/end outtext
                    152:     add_endouttext($node, undef);
                    153:     add_startouttext($node, $node->firstChild);
                    154:   }
1.1       damieng   155:   my $next;
                    156:   my $in_outtext = 0;
                    157:   for (my $child=$node->firstChild; defined $child; $child=$next) {
                    158:     $next = $child->nextSibling;
                    159:     if (!$in_outtext && inside_outtext($child)) {
                    160:       add_startouttext($node, $child);
                    161:       $in_outtext = 1;
                    162:     } elsif ($in_outtext && !continue_outtext($child)) {
                    163:       add_endouttext($node, $child);
                    164:       $in_outtext = 0;
                    165:     }
                    166:     if (!$in_outtext) {
                    167:       add_outtext($child);
                    168:     }
                    169:   }
                    170:   if ($in_outtext) {
                    171:     add_endouttext($node);
                    172:   }
                    173: }
                    174: 
                    175: # Returns 1 if this node should trigger the addition of startouttext before it
                    176: sub inside_outtext {
                    177:   my ($node) = @_;
                    178:   if ($node->nodeType == XML_TEXT_NODE && $node->nodeValue !~ /^\s*$/) {
                    179:     return 1;
                    180:   }
                    181:   if ($node->nodeType == XML_ELEMENT_NODE && string_in_array(\@html_trigger, $node->nodeName)) {
                    182:     if (contains_loncapa_block($node)) {
                    183:       return 0;
                    184:     } else {
                    185:       return 1;
                    186:     }
                    187:   }
1.8       damieng   188:   if ($node->nodeType == XML_ELEMENT_NODE && string_in_array(\@loncapa_in_text, $node->nodeName)) {
1.1       damieng   189:     return 1;
                    190:   }
                    191:   return 0;
                    192: }
                    193: 
                    194: # Returns 1 if the outtext environment can continue with this node
                    195: sub continue_outtext {
                    196:   my ($node) = @_;
                    197:   if (inside_outtext($node)) {
                    198:     return 1;
                    199:   }
                    200:   if ($node->nodeType == XML_TEXT_NODE) {
                    201:     return 1; # continue even if this is just spaces
                    202:   }
                    203:   return 0;
                    204: }
                    205: 
                    206: # Returns 1 if the node contains a LON-CAPA block in a descendant.
                    207: sub contains_loncapa_block {
                    208:   my ($node) = @_;
                    209:   foreach my $child ($node->childNodes) {
                    210:     if ($child->nodeType == XML_ELEMENT_NODE) {
                    211:       if (string_in_array(\@loncapa_block, $child->nodeName)) {
                    212:         return 1;
                    213:       }
                    214:       if (contains_loncapa_block($child)) {
                    215:         return 1;
                    216:       }
                    217:     }
                    218:   }
                    219:   return 0;
                    220: }
                    221: 
                    222: sub add_startouttext {
                    223:   my ($parent, $before_node) = @_;
                    224:   my $doc = $parent->ownerDocument;
                    225:   if ($before_node->nodeType == XML_TEXT_NODE) {
                    226:     # split space at the beginning of the node
                    227:     if ($before_node->nodeValue =~ /^(\s+)(.*?)$/s) {
                    228:       my $space_node = $doc->createTextNode($1);
                    229:       $before_node->setData($2);
                    230:       $parent->insertBefore($space_node, $before_node);
                    231:     }
                    232:   }
                    233:   my $startouttext = $doc->createElement('startouttext');
                    234:   $parent->insertBefore($startouttext, $before_node);
                    235: }
                    236: 
                    237: sub add_endouttext {
                    238:   my ($parent, $before_node) = @_;
                    239:   my $doc = $parent->ownerDocument;
                    240:   my $endouttext = $doc->createElement('endouttext');
                    241:   my $before_before;
                    242:   if (defined $before_node) {
                    243:     $before_before = $before_node->previousSibling;
                    244:   } else {
                    245:     $before_before = $parent->lastChild;
                    246:   }
                    247:   if (defined $before_before && $before_before->nodeType == XML_TEXT_NODE) {
                    248:     # split space at the end of the node
                    249:     if ($before_before->nodeValue =~ /^(.*?)(\s+)$/s) {
                    250:       $before_before->setData($1);
                    251:       my $space_node = $doc->createTextNode($2);
                    252:       if (defined $before_node) {
                    253:         $parent->insertBefore($space_node, $before_node);
                    254:       } else {
                    255:         $parent->appendChild($space_node);
                    256:       }
                    257:       $before_node = $space_node;
                    258:     }
                    259:   }
                    260:   if (defined $before_node) {
                    261:     $parent->insertBefore($endouttext, $before_node);
                    262:   } else {
                    263:     $parent->appendChild($endouttext);
                    264:   }
1.7       damieng   265:   # replace spaces afterwards by a \n + indentation
                    266:   my $next = $endouttext->nextSibling;
                    267:   if (defined $next && $next->nodeType == XML_TEXT_NODE) {
                    268:     my $v = $next->nodeValue;
                    269:     if ($v =~ /^ /) {
                    270:       $v =~ s/^ +//;
                    271:       if ($parent->firstChild->nodeType == XML_TEXT_NODE &&
                    272:           $parent->firstChild->nodeValue =~ /^\n +$/) {
                    273:         $v = $parent->firstChild->nodeValue.$v;
                    274:       } else {
                    275:         $v = "\n".$v;
                    276:       }
                    277:       $next->setData($v);
                    278:     }
                    279:   }
1.1       damieng   280: }
                    281: 
                    282: # Convert paragraph children when one contains an inline response into content + <br>
                    283: # (the colorful editor does not support paragraphs containing inline responses)
                    284: sub convert_paragraphs {
                    285:   my ($parent) = @_;
                    286:   my $p_child_with_inline_response = 0;
                    287:   foreach my $child ($parent->childNodes) {
                    288:     if ($child->nodeType == XML_ELEMENT_NODE && $child->nodeName eq 'p') {
                    289:       foreach my $child2 ($child->childNodes) {
                    290:         if ($child2->nodeType == XML_ELEMENT_NODE) {
                    291:           if (string_in_array(\@inline_responses, $child2->nodeName)) {
                    292:             $p_child_with_inline_response = 1;
                    293:             last;
                    294:           }
                    295:         }
                    296:       }
                    297:     }
                    298:     if ($p_child_with_inline_response) {
                    299:       last;
                    300:     }
                    301:   }
                    302:   if ($p_child_with_inline_response) {
                    303:     my $doc = $parent->ownerDocument;
                    304:     my $next;
                    305:     for (my $child=$parent->firstChild; defined $child; $child=$next) {
                    306:       $next = $child->nextSibling;
                    307:       if ($child->nodeType == XML_ELEMENT_NODE && $child->nodeName eq 'p') {
                    308:         replace_by_children($child);
                    309:         if (defined $next && (defined $next->nextSibling || $next->nodeType != XML_TEXT_NODE ||
                    310:             $next->nodeValue !~ /^\s*$/)) {
                    311:           # we only add a br if there is something after
                    312:           my $br = $doc->createElement('br');
                    313:           $parent->insertBefore($br, $next);
                    314:         }
                    315:       }
                    316:     }
                    317:   }
                    318: }
                    319: 
                    320: ##
                    321: # Tests if a string is in an array (using eq) (to avoid Smartmatch warnings with $value ~~ @array)
                    322: # @param {Array<string>} array - reference to the array of strings
                    323: # @param {string} value - the string to look for
                    324: # @returns 1 if found, 0 otherwise
                    325: ##
                    326: sub string_in_array {
                    327:   my ($array, $value) = @_;
                    328:   foreach my $v (@{$array}) {
                    329:     if ($v eq $value) {
                    330:       return 1;
                    331:     }
                    332:   }
                    333:   return 0;
                    334: }
                    335: 
                    336: ##
                    337: # replaces a node by its children
                    338: # @param {Node} node - the DOM node
                    339: ##
                    340: sub replace_by_children {
                    341:   my ($node) = @_;
                    342:   my $parent = $node->parentNode;
                    343:   my $next;
                    344:   my $previous;
                    345:   for (my $child=$node->firstChild; defined $child; $child=$next) {
                    346:     $next = $child->nextSibling;
                    347:     if ((!defined $previous || !defined $next) &&
                    348:         $child->nodeType == XML_TEXT_NODE && $child->nodeValue =~ /^\s*$/) {
                    349:       next; # do not keep first and last whitespace nodes
                    350:     } else {
                    351:       if (!defined $previous && $child->nodeType == XML_TEXT_NODE) {
                    352:         # remove whitespace at the beginning
                    353:         my $value = $child->nodeValue;
                    354:         $value =~ s/^\s+//;
                    355:         $child->setData($value);
                    356:       }
                    357:       if (!defined $next && $child->nodeType == XML_TEXT_NODE) {
                    358:         # and at the end
                    359:         my $value = $child->nodeValue;
                    360:         $value =~ s/\s+$//;
                    361:         $child->setData($value);
                    362:       }
                    363:     }
                    364:     $node->removeChild($child);
                    365:     $parent->insertBefore($child, $node);
                    366:     $previous = $child;
                    367:   }
                    368:   $parent->removeChild($node);
                    369: }
                    370: 
                    371: 1;
                    372: __END__

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>