Annotation of modules/damieng/clean_xml/xml_to_loncapa.pl, revision 1.3

1.1       damieng     1: #!/usr/bin/perl
                      2: 
                      3: # This takes a well-formed XML file as input, and converts it to LON-CAPA syntax.
                      4: 
                      5: use strict;
                      6: use utf8;
                      7: use warnings;
                      8: 
                      9: use XML::LibXML;
                     10: 
1.2       damieng    11: 
                     12: my @loncapa_block = ('parameter','location','answer','foil','image','polygon','rectangle','text','conceptgroup','itemgroup','item','label','data','function','array','unit','answergroup','functionplotresponse','functionplotruleset','functionplotelements','functionplotcustomrule','essayresponse','hintpart','formulahint','numericalhint','reactionhint','organichint','optionhint','radiobuttonhint','stringhint','customhint','mathhint','formulahintcondition','numericalhintcondition','reactionhintcondition','organichintcondition','optionhintcondition','radiobuttonhintcondition','stringhintcondition','customhintcondition','mathhintcondition','imageresponse','foilgroup','datasubmission','textfield','hiddensubmission','radiobuttonresponse','rankresponse','matchresponse','import','style','script','window','block','library','notsolved','part','postanswerdate','preduedate','problem','problemtype','randomlabel','bgimg','labelgroup','randomlist','solved','while','tex','print','web','gnuplot','curve','Task','IntroParagraph','ClosingParagraph','Question','QuestionText','Setup','Instance','InstanceText','Criteria','CriteriaText','GraderNote','languageblock','translated','lang','instructorcomment','dataresponse','togglebox','standalone','comment','drawimage','allow','displayduedate','displaytitle','responseparam','organicstructure','scriptlib','parserlib','drawoptionlist','spline','backgroundplot','plotobject','plotvector','drawvectorsum','functionplotrule','functionplotvectorrule','functionplotvectorsumrule','axis','key','xtics','ytics','title','xlabel','ylabel','hiddenline','dtm','stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse', 'hint', 'hintgroup');
                     13: 
                     14: my @loncapa_inline = ('display','m','lm','chem','num','parse','algebra','displayweight','displaystudentphoto'); # not textline
                     15: 
1.3     ! damieng    16: # HTML elements that trigger the addition of startouttext/endouttext
        !            17: my @html_trigger = ('header','footer','aside','h1','h2','h3','h4','h5','h6','li','dd','dt','tbody','tr','caption','thead','tfoot','td','th','span','a','em','strong','b','i','sup','sub','pre','code','kbd','samp','cite','q','tt','ins','del','var','small','big','br','hr','address','blockquote','img','figure','figcaption','object','param','embed','applet','video','source','audio','map','area','canvas','form','input','select','optgroup','option','textarea','fieldset','legend','button','iframe','section','div','p','ul','ol','dl','table');
1.2       damieng    18: 
1.3     ! damieng    19: my @simple_data = ('polygon', 'rectangle', 'vector', 'value', 'answer', 'title', 'data', 'function', 'xlabel', 'ylabel', 'tic', 'parserlib', 'scriptlib', 'import', 'tex', 'text', 'image', 'display', 'm', 'lm', 'num', 'algebra', 'chem', 'parse', 'title', 'style', 'script', 'ins', 'del', 'label', 'option', 'textarea', 'legend' );
        !            20: 
        !            21: my @inline_responses = ('stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse');
1.2       damieng    22: 
                     23: 
1.1       damieng    24: binmode(STDOUT, ':encoding(UTF-8)');
                     25: 
                     26: if (scalar(@ARGV) != 1) {
                     27:   print STDERR "Usage: perl xml_to_loncapa.pl file.xml\n";
                     28:   exit(1);
                     29: }
                     30: 
                     31: # find the command-line argument encoding
                     32: use I18N::Langinfo qw(langinfo CODESET);
                     33: my $codeset = langinfo(CODESET);
                     34: use Encode qw(decode);
                     35: @ARGV = map { decode $codeset, $_ } @ARGV;
                     36: 
                     37: my $pathname = "$ARGV[0]";
                     38: if (-f $pathname) {
                     39:   convert_file($pathname);
                     40: }
                     41: 
                     42: # Converts a file, creating a .loncapa file in the same directory.
                     43: # TODO: use the right extension based on content (or just ouput content)
                     44: sub convert_file {
                     45:   my ($pathname) = @_;
                     46: 
                     47:   # create a name for the new file
                     48:   my $newpath = $pathname.'.loncapa';
                     49: 
                     50:   print "converting $pathname...\n";
                     51:   
                     52:   my $dom_doc = XML::LibXML->load_xml(location => $pathname);
                     53:   
                     54:   open my $out, '>:encoding(UTF-8)', $newpath;
1.2       damieng    55:   add_outtext($dom_doc);
1.1       damieng    56:   print $out node_to_string($dom_doc);
                     57:   close $out;
                     58: }
                     59: 
                     60: sub node_to_string {
                     61:   my ($node) = @_;
                     62:   
                     63:   if ($node->nodeType == XML_DOCUMENT_NODE) {
                     64:     my $root = $node->documentElement();
                     65:     return node_to_string($root);
                     66:   } elsif ($node->nodeType == XML_TEXT_NODE || $node->nodeType == XML_CDATA_SECTION_NODE) {
                     67:     my $parent = $node->parentNode;
                     68:     my $parent_name = $parent->nodeName;
                     69:     my $grandparent_name;
                     70:     if (defined $parent->parentNode) {
                     71:       $grandparent_name = $parent->parentNode->nodeName;
                     72:     }
                     73:     my @no_escape = ('m', 'script', 'display', 'parse', 'answer');
                     74:     if (string_in_array(\@no_escape, $parent_name) &&
                     75:         ($parent_name ne 'answer' ||
                     76:         (defined $grandparent_name &&
                     77:         $grandparent_name ne 'numericalresponse' &&
                     78:         $grandparent_name ne 'formularesponse'))) {
                     79:       return $node->nodeValue;
                     80:     } else {
                     81:       return $node->toString();
                     82:     }
                     83:   } elsif ($node->nodeType == XML_ELEMENT_NODE) {
                     84:     my $s = '';
                     85:     my $tag = $node->nodeName;
                     86:     $s .= "<$tag";
                     87:     my @attributes = $node->attributes();
                     88:     foreach my $attribute (@attributes) {
                     89:       $s .= ' ';
                     90:       $s .= $attribute->nodeName;
                     91:       $s .= '="';
                     92:       $s .= escape($attribute->nodeValue);
                     93:       $s .= '"';
                     94:     }
                     95:     if ($node->hasChildNodes()) {
                     96:       $s .= '>';
                     97:       foreach my $child ($node->childNodes) {
                     98:         $s .= node_to_string($child);
                     99:       }
                    100:       $s .= "</$tag>";
                    101:     } else {
                    102:       $s .= '/>';
                    103:     }
                    104:     return $s;
                    105:   } else {
                    106:     return $node->toString();
                    107:   }
                    108: }
                    109: 
                    110: # Escapes a string for LON-CAPA output (used for text nodes, not attribute values)
                    111: sub escape {
                    112:   my ($s) = @_;
                    113:   $s =~ s/&/&amp;/sg;
                    114:   $s =~ s/</&lt;/sg;
                    115:   $s =~ s/>/&gt;/sg;
                    116:   # quot and apos do not need to be escaped outside attribute values
                    117:   return $s;
                    118: }
                    119: 
1.3     ! damieng   120: # Adds startouttext and endouttext where useful for the colorful editor
1.2       damieng   121: sub add_outtext {
                    122:   my ($node) = @_;
                    123:   
                    124:   if ($node->nodeType == XML_DOCUMENT_NODE) {
                    125:     my $root = $node->documentElement();
                    126:     add_outtext($root);
                    127:     return;
                    128:   }
                    129:   if ($node->nodeType != XML_ELEMENT_NODE) {
                    130:     return;
                    131:   }
                    132:   if (string_in_array(\@simple_data, $node->nodeName)) {
                    133:     return;
                    134:   }
1.3     ! damieng   135:   convert_paragraphs($node);
1.2       damieng   136:   my $next;
                    137:   my $in_outtext = 0;
                    138:   for (my $child=$node->firstChild; defined $child; $child=$next) {
                    139:     $next = $child->nextSibling;
                    140:     if (!$in_outtext && inside_outtext($child)) {
1.3     ! damieng   141:       add_startouttext($node, $child);
1.2       damieng   142:       $in_outtext = 1;
                    143:     } elsif ($in_outtext && !continue_outtext($child)) {
                    144:       add_endouttext($node, $child);
                    145:       $in_outtext = 0;
                    146:     }
                    147:     if (!$in_outtext) {
                    148:       add_outtext($child);
                    149:     }
                    150:   }
                    151:   if ($in_outtext) {
                    152:     add_endouttext($node);
                    153:   }
                    154: }
                    155: 
                    156: # Returns 1 if this node should trigger the addition of startouttext before it
                    157: sub inside_outtext {
                    158:   my ($node) = @_;
                    159:   if ($node->nodeType == XML_TEXT_NODE && $node->nodeValue !~ /^\s*$/) {
                    160:     return 1;
                    161:   }
1.3     ! damieng   162:   if ($node->nodeType == XML_ELEMENT_NODE && string_in_array(\@html_trigger, $node->nodeName)) {
1.2       damieng   163:     if (contains_loncapa_block($node)) {
                    164:       return 0;
                    165:     } else {
                    166:       return 1;
                    167:     }
                    168:   }
                    169:   if ($node->nodeType == XML_ELEMENT_NODE && string_in_array(\@loncapa_inline, $node->nodeName)) {
                    170:     return 1;
                    171:   }
                    172:   return 0;
                    173: }
                    174: 
                    175: # Returns 1 if the outtext environment can continue with this node
                    176: sub continue_outtext {
                    177:   my ($node) = @_;
                    178:   if (inside_outtext($node)) {
                    179:     return 1;
                    180:   }
                    181:   if ($node->nodeType == XML_TEXT_NODE) {
                    182:     return 1; # continue even if this is just spaces
                    183:   }
                    184:   return 0;
                    185: }
                    186: 
                    187: # Returns 1 if the node contains a LON-CAPA block in a descendant.
                    188: sub contains_loncapa_block {
                    189:   my ($node) = @_;
                    190:   foreach my $child ($node->childNodes) {
                    191:     if ($child->nodeType == XML_ELEMENT_NODE) {
                    192:       if (string_in_array(\@loncapa_block, $child->nodeName)) {
                    193:         return 1;
                    194:       }
                    195:       if (contains_loncapa_block($child)) {
                    196:         return 1;
                    197:       }
                    198:     }
                    199:   }
                    200:   return 0;
                    201: }
                    202: 
1.3     ! damieng   203: sub add_startouttext {
        !           204:   my ($parent, $before_node) = @_;
        !           205:   my $doc = $parent->ownerDocument;
        !           206:   if ($before_node->nodeType == XML_TEXT_NODE) {
        !           207:     # split space at the beginning of the node
        !           208:     if ($before_node->nodeValue =~ /^(\s+)(.*?)$/s) {
        !           209:       my $space_node = $doc->createTextNode($1);
        !           210:       $before_node->setData($2);
        !           211:       $parent->insertBefore($space_node, $before_node);
        !           212:     }
        !           213:   }
        !           214:   my $startouttext = $doc->createElement('startouttext');
        !           215:   $parent->insertBefore($startouttext, $before_node);
        !           216: }
        !           217: 
1.2       damieng   218: sub add_endouttext {
                    219:   my ($parent, $before_node) = @_;
                    220:   my $doc = $parent->ownerDocument;
                    221:   my $endouttext = $doc->createElement('endouttext');
                    222:   my $before_before;
                    223:   if (defined $before_node) {
                    224:     $before_before = $before_node->previousSibling;
                    225:   } else {
                    226:     $before_before = $parent->lastChild;
                    227:   }
                    228:   if (defined $before_before && $before_before->nodeType == XML_TEXT_NODE) {
1.3     ! damieng   229:     # split space at the end of the node
1.2       damieng   230:     if ($before_before->nodeValue =~ /^(.*?)(\s+)$/s) {
                    231:       $before_before->setData($1);
                    232:       my $space_node = $doc->createTextNode($2);
                    233:       if (defined $before_node) {
                    234:         $parent->insertBefore($space_node, $before_node);
                    235:       } else {
                    236:         $parent->appendChild($space_node);
                    237:       }
                    238:       $before_node = $space_node;
                    239:     }
                    240:   }
                    241:   if (defined $before_node) {
                    242:     $parent->insertBefore($endouttext, $before_node);
                    243:   } else {
                    244:     $parent->appendChild($endouttext);
                    245:   }
                    246: }
                    247: 
1.3     ! damieng   248: # Convert paragraph children when one contains an inline response into content + <br>
        !           249: # (the colorful editor does not support paragraphs containing inline responses)
        !           250: sub convert_paragraphs {
        !           251:   my ($parent) = @_;
        !           252:   my $p_child_with_inline_response = 0;
        !           253:   foreach my $child ($parent->childNodes) {
        !           254:     if ($child->nodeType == XML_ELEMENT_NODE && $child->nodeName eq 'p') {
        !           255:       foreach my $child2 ($child->childNodes) {
        !           256:         if ($child2->nodeType == XML_ELEMENT_NODE) {
        !           257:           if (string_in_array(\@inline_responses, $child2->nodeName)) {
        !           258:             $p_child_with_inline_response = 1;
        !           259:             last;
        !           260:           }
        !           261:         }
        !           262:       }
        !           263:     }
        !           264:     if ($p_child_with_inline_response) {
        !           265:       last;
        !           266:     }
        !           267:   }
        !           268:   if ($p_child_with_inline_response) {
        !           269:     my $doc = $parent->ownerDocument;
        !           270:     my $next;
        !           271:     for (my $child=$parent->firstChild; defined $child; $child=$next) {
        !           272:       $next = $child->nextSibling;
        !           273:       if ($child->nodeType == XML_ELEMENT_NODE && $child->nodeName eq 'p') {
        !           274:         replace_by_children($child);
        !           275:         if (defined $next && (defined $next->nextSibling || $next->nodeType != XML_TEXT_NODE ||
        !           276:             $next->nodeValue !~ /^\s*$/)) {
        !           277:           # we only add a br if there is something after
        !           278:           my $br = $doc->createElement('br');
        !           279:           $parent->insertBefore($br, $next);
        !           280:         }
        !           281:       }
        !           282:     }
        !           283:   }
        !           284: }
        !           285: 
1.1       damieng   286: ##
                    287: # Tests if a string is in an array (using eq) (to avoid Smartmatch warnings with $value ~~ @array)
                    288: # @param {Array<string>} array - reference to the array of strings
                    289: # @param {string} value - the string to look for
                    290: # @returns 1 if found, 0 otherwise
                    291: ##
                    292: sub string_in_array {
                    293:   my ($array, $value) = @_;
                    294:   foreach my $v (@{$array}) {
                    295:     if ($v eq $value) {
                    296:       return 1;
                    297:     }
                    298:   }
                    299:   return 0;
                    300: }
1.3     ! damieng   301: 
        !           302: ##
        !           303: # replaces a node by its children
        !           304: # @param {Node} node - the DOM node
        !           305: ##
        !           306: sub replace_by_children {
        !           307:   my ($node) = @_;
        !           308:   my $parent = $node->parentNode;
        !           309:   my $next;
        !           310:   my $previous;
        !           311:   for (my $child=$node->firstChild; defined $child; $child=$next) {
        !           312:     $next = $child->nextSibling;
        !           313:     if ((!defined $previous || !defined $next) &&
        !           314:         $child->nodeType == XML_TEXT_NODE && $child->nodeValue =~ /^\s*$/) {
        !           315:       next; # do not keep first and last whitespace nodes
        !           316:     } else {
        !           317:       if (!defined $previous && $child->nodeType == XML_TEXT_NODE) {
        !           318:         # remove whitespace at the beginning
        !           319:         my $value = $child->nodeValue;
        !           320:         $value =~ s/^\s+//;
        !           321:         $child->setData($value);
        !           322:       }
        !           323:       if (!defined $next && $child->nodeType == XML_TEXT_NODE) {
        !           324:         # and at the end
        !           325:         my $value = $child->nodeValue;
        !           326:         $value =~ s/\s+$//;
        !           327:         $child->setData($value);
        !           328:       }
        !           329:     }
        !           330:     $node->removeChild($child);
        !           331:     $parent->insertBefore($child, $node);
        !           332:     $previous = $child;
        !           333:   }
        !           334:   $parent->removeChild($node);
        !           335: }
        !           336: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>