File:  [LON-CAPA] / modules / damieng / clean_xml / xml_to_loncapa.pl
Revision 1.3: download - view: text, annotated - select for diffs
Tue May 19 15:36:01 2015 UTC (9 years ago) by damieng
Branches: MAIN
CVS tags: HEAD
improved addition of startouttext/endouttext, converting paragraphs when the colorful editor cannot handle them

    1: #!/usr/bin/perl
    2: 
    3: # This takes a well-formed XML file as input, and converts it to LON-CAPA syntax.
    4: 
    5: use strict;
    6: use utf8;
    7: use warnings;
    8: 
    9: use XML::LibXML;
   10: 
   11: 
   12: my @loncapa_block = ('parameter','location','answer','foil','image','polygon','rectangle','text','conceptgroup','itemgroup','item','label','data','function','array','unit','answergroup','functionplotresponse','functionplotruleset','functionplotelements','functionplotcustomrule','essayresponse','hintpart','formulahint','numericalhint','reactionhint','organichint','optionhint','radiobuttonhint','stringhint','customhint','mathhint','formulahintcondition','numericalhintcondition','reactionhintcondition','organichintcondition','optionhintcondition','radiobuttonhintcondition','stringhintcondition','customhintcondition','mathhintcondition','imageresponse','foilgroup','datasubmission','textfield','hiddensubmission','radiobuttonresponse','rankresponse','matchresponse','import','style','script','window','block','library','notsolved','part','postanswerdate','preduedate','problem','problemtype','randomlabel','bgimg','labelgroup','randomlist','solved','while','tex','print','web','gnuplot','curve','Task','IntroParagraph','ClosingParagraph','Question','QuestionText','Setup','Instance','InstanceText','Criteria','CriteriaText','GraderNote','languageblock','translated','lang','instructorcomment','dataresponse','togglebox','standalone','comment','drawimage','allow','displayduedate','displaytitle','responseparam','organicstructure','scriptlib','parserlib','drawoptionlist','spline','backgroundplot','plotobject','plotvector','drawvectorsum','functionplotrule','functionplotvectorrule','functionplotvectorsumrule','axis','key','xtics','ytics','title','xlabel','ylabel','hiddenline','dtm','stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse', 'hint', 'hintgroup');
   13: 
   14: my @loncapa_inline = ('display','m','lm','chem','num','parse','algebra','displayweight','displaystudentphoto'); # not textline
   15: 
   16: # HTML elements that trigger the addition of startouttext/endouttext
   17: my @html_trigger = ('header','footer','aside','h1','h2','h3','h4','h5','h6','li','dd','dt','tbody','tr','caption','thead','tfoot','td','th','span','a','em','strong','b','i','sup','sub','pre','code','kbd','samp','cite','q','tt','ins','del','var','small','big','br','hr','address','blockquote','img','figure','figcaption','object','param','embed','applet','video','source','audio','map','area','canvas','form','input','select','optgroup','option','textarea','fieldset','legend','button','iframe','section','div','p','ul','ol','dl','table');
   18: 
   19: my @simple_data = ('polygon', 'rectangle', 'vector', 'value', 'answer', 'title', 'data', 'function', 'xlabel', 'ylabel', 'tic', 'parserlib', 'scriptlib', 'import', 'tex', 'text', 'image', 'display', 'm', 'lm', 'num', 'algebra', 'chem', 'parse', 'title', 'style', 'script', 'ins', 'del', 'label', 'option', 'textarea', 'legend' );
   20: 
   21: my @inline_responses = ('stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse');
   22: 
   23: 
   24: binmode(STDOUT, ':encoding(UTF-8)');
   25: 
   26: if (scalar(@ARGV) != 1) {
   27:   print STDERR "Usage: perl xml_to_loncapa.pl file.xml\n";
   28:   exit(1);
   29: }
   30: 
   31: # find the command-line argument encoding
   32: use I18N::Langinfo qw(langinfo CODESET);
   33: my $codeset = langinfo(CODESET);
   34: use Encode qw(decode);
   35: @ARGV = map { decode $codeset, $_ } @ARGV;
   36: 
   37: my $pathname = "$ARGV[0]";
   38: if (-f $pathname) {
   39:   convert_file($pathname);
   40: }
   41: 
   42: # Converts a file, creating a .loncapa file in the same directory.
   43: # TODO: use the right extension based on content (or just ouput content)
   44: sub convert_file {
   45:   my ($pathname) = @_;
   46: 
   47:   # create a name for the new file
   48:   my $newpath = $pathname.'.loncapa';
   49: 
   50:   print "converting $pathname...\n";
   51:   
   52:   my $dom_doc = XML::LibXML->load_xml(location => $pathname);
   53:   
   54:   open my $out, '>:encoding(UTF-8)', $newpath;
   55:   add_outtext($dom_doc);
   56:   print $out node_to_string($dom_doc);
   57:   close $out;
   58: }
   59: 
   60: sub node_to_string {
   61:   my ($node) = @_;
   62:   
   63:   if ($node->nodeType == XML_DOCUMENT_NODE) {
   64:     my $root = $node->documentElement();
   65:     return node_to_string($root);
   66:   } elsif ($node->nodeType == XML_TEXT_NODE || $node->nodeType == XML_CDATA_SECTION_NODE) {
   67:     my $parent = $node->parentNode;
   68:     my $parent_name = $parent->nodeName;
   69:     my $grandparent_name;
   70:     if (defined $parent->parentNode) {
   71:       $grandparent_name = $parent->parentNode->nodeName;
   72:     }
   73:     my @no_escape = ('m', 'script', 'display', 'parse', 'answer');
   74:     if (string_in_array(\@no_escape, $parent_name) &&
   75:         ($parent_name ne 'answer' ||
   76:         (defined $grandparent_name &&
   77:         $grandparent_name ne 'numericalresponse' &&
   78:         $grandparent_name ne 'formularesponse'))) {
   79:       return $node->nodeValue;
   80:     } else {
   81:       return $node->toString();
   82:     }
   83:   } elsif ($node->nodeType == XML_ELEMENT_NODE) {
   84:     my $s = '';
   85:     my $tag = $node->nodeName;
   86:     $s .= "<$tag";
   87:     my @attributes = $node->attributes();
   88:     foreach my $attribute (@attributes) {
   89:       $s .= ' ';
   90:       $s .= $attribute->nodeName;
   91:       $s .= '="';
   92:       $s .= escape($attribute->nodeValue);
   93:       $s .= '"';
   94:     }
   95:     if ($node->hasChildNodes()) {
   96:       $s .= '>';
   97:       foreach my $child ($node->childNodes) {
   98:         $s .= node_to_string($child);
   99:       }
  100:       $s .= "</$tag>";
  101:     } else {
  102:       $s .= '/>';
  103:     }
  104:     return $s;
  105:   } else {
  106:     return $node->toString();
  107:   }
  108: }
  109: 
  110: # Escapes a string for LON-CAPA output (used for text nodes, not attribute values)
  111: sub escape {
  112:   my ($s) = @_;
  113:   $s =~ s/&/&amp;/sg;
  114:   $s =~ s/</&lt;/sg;
  115:   $s =~ s/>/&gt;/sg;
  116:   # quot and apos do not need to be escaped outside attribute values
  117:   return $s;
  118: }
  119: 
  120: # Adds startouttext and endouttext where useful for the colorful editor
  121: sub add_outtext {
  122:   my ($node) = @_;
  123:   
  124:   if ($node->nodeType == XML_DOCUMENT_NODE) {
  125:     my $root = $node->documentElement();
  126:     add_outtext($root);
  127:     return;
  128:   }
  129:   if ($node->nodeType != XML_ELEMENT_NODE) {
  130:     return;
  131:   }
  132:   if (string_in_array(\@simple_data, $node->nodeName)) {
  133:     return;
  134:   }
  135:   convert_paragraphs($node);
  136:   my $next;
  137:   my $in_outtext = 0;
  138:   for (my $child=$node->firstChild; defined $child; $child=$next) {
  139:     $next = $child->nextSibling;
  140:     if (!$in_outtext && inside_outtext($child)) {
  141:       add_startouttext($node, $child);
  142:       $in_outtext = 1;
  143:     } elsif ($in_outtext && !continue_outtext($child)) {
  144:       add_endouttext($node, $child);
  145:       $in_outtext = 0;
  146:     }
  147:     if (!$in_outtext) {
  148:       add_outtext($child);
  149:     }
  150:   }
  151:   if ($in_outtext) {
  152:     add_endouttext($node);
  153:   }
  154: }
  155: 
  156: # Returns 1 if this node should trigger the addition of startouttext before it
  157: sub inside_outtext {
  158:   my ($node) = @_;
  159:   if ($node->nodeType == XML_TEXT_NODE && $node->nodeValue !~ /^\s*$/) {
  160:     return 1;
  161:   }
  162:   if ($node->nodeType == XML_ELEMENT_NODE && string_in_array(\@html_trigger, $node->nodeName)) {
  163:     if (contains_loncapa_block($node)) {
  164:       return 0;
  165:     } else {
  166:       return 1;
  167:     }
  168:   }
  169:   if ($node->nodeType == XML_ELEMENT_NODE && string_in_array(\@loncapa_inline, $node->nodeName)) {
  170:     return 1;
  171:   }
  172:   return 0;
  173: }
  174: 
  175: # Returns 1 if the outtext environment can continue with this node
  176: sub continue_outtext {
  177:   my ($node) = @_;
  178:   if (inside_outtext($node)) {
  179:     return 1;
  180:   }
  181:   if ($node->nodeType == XML_TEXT_NODE) {
  182:     return 1; # continue even if this is just spaces
  183:   }
  184:   return 0;
  185: }
  186: 
  187: # Returns 1 if the node contains a LON-CAPA block in a descendant.
  188: sub contains_loncapa_block {
  189:   my ($node) = @_;
  190:   foreach my $child ($node->childNodes) {
  191:     if ($child->nodeType == XML_ELEMENT_NODE) {
  192:       if (string_in_array(\@loncapa_block, $child->nodeName)) {
  193:         return 1;
  194:       }
  195:       if (contains_loncapa_block($child)) {
  196:         return 1;
  197:       }
  198:     }
  199:   }
  200:   return 0;
  201: }
  202: 
  203: sub add_startouttext {
  204:   my ($parent, $before_node) = @_;
  205:   my $doc = $parent->ownerDocument;
  206:   if ($before_node->nodeType == XML_TEXT_NODE) {
  207:     # split space at the beginning of the node
  208:     if ($before_node->nodeValue =~ /^(\s+)(.*?)$/s) {
  209:       my $space_node = $doc->createTextNode($1);
  210:       $before_node->setData($2);
  211:       $parent->insertBefore($space_node, $before_node);
  212:     }
  213:   }
  214:   my $startouttext = $doc->createElement('startouttext');
  215:   $parent->insertBefore($startouttext, $before_node);
  216: }
  217: 
  218: sub add_endouttext {
  219:   my ($parent, $before_node) = @_;
  220:   my $doc = $parent->ownerDocument;
  221:   my $endouttext = $doc->createElement('endouttext');
  222:   my $before_before;
  223:   if (defined $before_node) {
  224:     $before_before = $before_node->previousSibling;
  225:   } else {
  226:     $before_before = $parent->lastChild;
  227:   }
  228:   if (defined $before_before && $before_before->nodeType == XML_TEXT_NODE) {
  229:     # split space at the end of the node
  230:     if ($before_before->nodeValue =~ /^(.*?)(\s+)$/s) {
  231:       $before_before->setData($1);
  232:       my $space_node = $doc->createTextNode($2);
  233:       if (defined $before_node) {
  234:         $parent->insertBefore($space_node, $before_node);
  235:       } else {
  236:         $parent->appendChild($space_node);
  237:       }
  238:       $before_node = $space_node;
  239:     }
  240:   }
  241:   if (defined $before_node) {
  242:     $parent->insertBefore($endouttext, $before_node);
  243:   } else {
  244:     $parent->appendChild($endouttext);
  245:   }
  246: }
  247: 
  248: # Convert paragraph children when one contains an inline response into content + <br>
  249: # (the colorful editor does not support paragraphs containing inline responses)
  250: sub convert_paragraphs {
  251:   my ($parent) = @_;
  252:   my $p_child_with_inline_response = 0;
  253:   foreach my $child ($parent->childNodes) {
  254:     if ($child->nodeType == XML_ELEMENT_NODE && $child->nodeName eq 'p') {
  255:       foreach my $child2 ($child->childNodes) {
  256:         if ($child2->nodeType == XML_ELEMENT_NODE) {
  257:           if (string_in_array(\@inline_responses, $child2->nodeName)) {
  258:             $p_child_with_inline_response = 1;
  259:             last;
  260:           }
  261:         }
  262:       }
  263:     }
  264:     if ($p_child_with_inline_response) {
  265:       last;
  266:     }
  267:   }
  268:   if ($p_child_with_inline_response) {
  269:     my $doc = $parent->ownerDocument;
  270:     my $next;
  271:     for (my $child=$parent->firstChild; defined $child; $child=$next) {
  272:       $next = $child->nextSibling;
  273:       if ($child->nodeType == XML_ELEMENT_NODE && $child->nodeName eq 'p') {
  274:         replace_by_children($child);
  275:         if (defined $next && (defined $next->nextSibling || $next->nodeType != XML_TEXT_NODE ||
  276:             $next->nodeValue !~ /^\s*$/)) {
  277:           # we only add a br if there is something after
  278:           my $br = $doc->createElement('br');
  279:           $parent->insertBefore($br, $next);
  280:         }
  281:       }
  282:     }
  283:   }
  284: }
  285: 
  286: ##
  287: # Tests if a string is in an array (using eq) (to avoid Smartmatch warnings with $value ~~ @array)
  288: # @param {Array<string>} array - reference to the array of strings
  289: # @param {string} value - the string to look for
  290: # @returns 1 if found, 0 otherwise
  291: ##
  292: sub string_in_array {
  293:   my ($array, $value) = @_;
  294:   foreach my $v (@{$array}) {
  295:     if ($v eq $value) {
  296:       return 1;
  297:     }
  298:   }
  299:   return 0;
  300: }
  301: 
  302: ##
  303: # replaces a node by its children
  304: # @param {Node} node - the DOM node
  305: ##
  306: sub replace_by_children {
  307:   my ($node) = @_;
  308:   my $parent = $node->parentNode;
  309:   my $next;
  310:   my $previous;
  311:   for (my $child=$node->firstChild; defined $child; $child=$next) {
  312:     $next = $child->nextSibling;
  313:     if ((!defined $previous || !defined $next) &&
  314:         $child->nodeType == XML_TEXT_NODE && $child->nodeValue =~ /^\s*$/) {
  315:       next; # do not keep first and last whitespace nodes
  316:     } else {
  317:       if (!defined $previous && $child->nodeType == XML_TEXT_NODE) {
  318:         # remove whitespace at the beginning
  319:         my $value = $child->nodeValue;
  320:         $value =~ s/^\s+//;
  321:         $child->setData($value);
  322:       }
  323:       if (!defined $next && $child->nodeType == XML_TEXT_NODE) {
  324:         # and at the end
  325:         my $value = $child->nodeValue;
  326:         $value =~ s/\s+$//;
  327:         $child->setData($value);
  328:       }
  329:     }
  330:     $node->removeChild($child);
  331:     $parent->insertBefore($child, $node);
  332:     $previous = $child;
  333:   }
  334:   $parent->removeChild($node);
  335: }
  336: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>