File:  [LON-CAPA] / modules / damieng / clean_xml / xml_to_loncapa.pl
Revision 1.2: download - view: text, annotated - select for diffs
Sat May 16 20:17:13 2015 UTC (9 years ago) by damieng
Branches: MAIN
CVS tags: HEAD
adding startouttext/endouttext for colorful editor compatibility

    1: #!/usr/bin/perl
    2: 
    3: # This takes a well-formed XML file as input, and converts it to LON-CAPA syntax.
    4: 
    5: use strict;
    6: use utf8;
    7: use warnings;
    8: 
    9: use XML::LibXML;
   10: 
   11: 
   12: my @loncapa_block = ('parameter','location','answer','foil','image','polygon','rectangle','text','conceptgroup','itemgroup','item','label','data','function','array','unit','answergroup','functionplotresponse','functionplotruleset','functionplotelements','functionplotcustomrule','essayresponse','hintpart','formulahint','numericalhint','reactionhint','organichint','optionhint','radiobuttonhint','stringhint','customhint','mathhint','formulahintcondition','numericalhintcondition','reactionhintcondition','organichintcondition','optionhintcondition','radiobuttonhintcondition','stringhintcondition','customhintcondition','mathhintcondition','imageresponse','foilgroup','datasubmission','textfield','hiddensubmission','radiobuttonresponse','rankresponse','matchresponse','import','style','script','window','block','library','notsolved','part','postanswerdate','preduedate','problem','problemtype','randomlabel','bgimg','labelgroup','randomlist','solved','while','tex','print','web','gnuplot','curve','Task','IntroParagraph','ClosingParagraph','Question','QuestionText','Setup','Instance','InstanceText','Criteria','CriteriaText','GraderNote','languageblock','translated','lang','instructorcomment','dataresponse','togglebox','standalone','comment','drawimage','allow','displayduedate','displaytitle','responseparam','organicstructure','scriptlib','parserlib','drawoptionlist','spline','backgroundplot','plotobject','plotvector','drawvectorsum','functionplotrule','functionplotvectorrule','functionplotvectorsumrule','axis','key','xtics','ytics','title','xlabel','ylabel','hiddenline','dtm','stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse', 'hint', 'hintgroup');
   13: 
   14: my @loncapa_inline = ('display','m','lm','chem','num','parse','algebra','displayweight','displaystudentphoto'); # not textline
   15: 
   16: my @html_elements = ('html','meta','head','title','base','link','style','noscript','body','header','footer','aside','h1','h2','h3','h4','h5','h6','li','dd','dt','tbody','tr','caption','thead','tfoot','td','th','span','a','em','strong','b','i','sup','sub','pre','code','kbd','samp','cite','q','tt','ins','del','var','small','big','br','hr','address','blockquote','img','figure','figcaption','object','param','embed','applet','video','source','audio','map','area','canvas','form','label','input','select','optgroup','option','textarea','fieldset','legend','button','iframe','section','div','p','ul','ol','dl','table'); # without script
   17: 
   18: my @simple_data = ('polygon', 'rectangle', 'vector', 'value', 'answer', 'title', 'xlabel', 'ylabel', 'tic', 'parserlib', 'scriptlib', 'import', 'tex', 'text', 'image', 'display', 'm', 'lm', 'num', 'algebra', 'chem', 'parse', 'title', 'style', 'script', 'ins', 'del', 'label', 'option', 'textarea', 'legend' );
   19: 
   20: 
   21: binmode(STDOUT, ':encoding(UTF-8)');
   22: 
   23: if (scalar(@ARGV) != 1) {
   24:   print STDERR "Usage: perl xml_to_loncapa.pl file.xml\n";
   25:   exit(1);
   26: }
   27: 
   28: # find the command-line argument encoding
   29: use I18N::Langinfo qw(langinfo CODESET);
   30: my $codeset = langinfo(CODESET);
   31: use Encode qw(decode);
   32: @ARGV = map { decode $codeset, $_ } @ARGV;
   33: 
   34: my $pathname = "$ARGV[0]";
   35: if (-f $pathname) {
   36:   convert_file($pathname);
   37: }
   38: 
   39: # Converts a file, creating a .loncapa file in the same directory.
   40: # TODO: use the right extension based on content (or just ouput content)
   41: sub convert_file {
   42:   my ($pathname) = @_;
   43: 
   44:   # create a name for the new file
   45:   my $newpath = $pathname.'.loncapa';
   46: 
   47:   print "converting $pathname...\n";
   48:   
   49:   my $dom_doc = XML::LibXML->load_xml(location => $pathname);
   50:   
   51:   open my $out, '>:encoding(UTF-8)', $newpath;
   52:   add_outtext($dom_doc);
   53:   print $out node_to_string($dom_doc);
   54:   close $out;
   55: }
   56: 
   57: sub node_to_string {
   58:   my ($node) = @_;
   59:   
   60:   if ($node->nodeType == XML_DOCUMENT_NODE) {
   61:     my $root = $node->documentElement();
   62:     return node_to_string($root);
   63:   } elsif ($node->nodeType == XML_TEXT_NODE || $node->nodeType == XML_CDATA_SECTION_NODE) {
   64:     my $parent = $node->parentNode;
   65:     my $parent_name = $parent->nodeName;
   66:     my $grandparent_name;
   67:     if (defined $parent->parentNode) {
   68:       $grandparent_name = $parent->parentNode->nodeName;
   69:     }
   70:     my @no_escape = ('m', 'script', 'display', 'parse', 'answer');
   71:     if (string_in_array(\@no_escape, $parent_name) &&
   72:         ($parent_name ne 'answer' ||
   73:         (defined $grandparent_name &&
   74:         $grandparent_name ne 'numericalresponse' &&
   75:         $grandparent_name ne 'formularesponse'))) {
   76:       return $node->nodeValue;
   77:     } else {
   78:       return $node->toString();
   79:     }
   80:   } elsif ($node->nodeType == XML_ELEMENT_NODE) {
   81:     my $s = '';
   82:     my $tag = $node->nodeName;
   83:     $s .= "<$tag";
   84:     my @attributes = $node->attributes();
   85:     foreach my $attribute (@attributes) {
   86:       $s .= ' ';
   87:       $s .= $attribute->nodeName;
   88:       $s .= '="';
   89:       $s .= escape($attribute->nodeValue);
   90:       $s .= '"';
   91:     }
   92:     if ($node->hasChildNodes()) {
   93:       $s .= '>';
   94:       foreach my $child ($node->childNodes) {
   95:         $s .= node_to_string($child);
   96:       }
   97:       $s .= "</$tag>";
   98:     } else {
   99:       $s .= '/>';
  100:     }
  101:     return $s;
  102:   } else {
  103:     return $node->toString();
  104:   }
  105: }
  106: 
  107: # Escapes a string for LON-CAPA output (used for text nodes, not attribute values)
  108: sub escape {
  109:   my ($s) = @_;
  110:   $s =~ s/&/&amp;/sg;
  111:   $s =~ s/</&lt;/sg;
  112:   $s =~ s/>/&gt;/sg;
  113:   # quot and apos do not need to be escaped outside attribute values
  114:   return $s;
  115: }
  116: 
  117: # Adds startouttext and endouttext where useful for the colorfull editor
  118: sub add_outtext {
  119:   my ($node) = @_;
  120:   
  121:   if ($node->nodeType == XML_DOCUMENT_NODE) {
  122:     my $root = $node->documentElement();
  123:     add_outtext($root);
  124:     return;
  125:   }
  126:   if ($node->nodeType != XML_ELEMENT_NODE) {
  127:     return;
  128:   }
  129:   if (string_in_array(\@simple_data, $node->nodeName)) {
  130:     return;
  131:   }
  132:   my $next;
  133:   my $in_outtext = 0;
  134:   for (my $child=$node->firstChild; defined $child; $child=$next) {
  135:     $next = $child->nextSibling;
  136:     if (!$in_outtext && inside_outtext($child)) {
  137:       # Add startouttext
  138:       my $doc = $node->ownerDocument;
  139:       my $startouttext = $doc->createElement('startouttext');
  140:       $node->insertBefore($startouttext, $child);
  141:       $in_outtext = 1;
  142:     } elsif ($in_outtext && !continue_outtext($child)) {
  143:       # Add endouttext
  144:       add_endouttext($node, $child);
  145:       $in_outtext = 0;
  146:     }
  147:     if (!$in_outtext) {
  148:       add_outtext($child);
  149:     }
  150:   }
  151:   if ($in_outtext) {
  152:     # Add endouttext
  153:     add_endouttext($node);
  154:   }
  155: }
  156: 
  157: # Returns 1 if this node should trigger the addition of startouttext before it
  158: sub inside_outtext {
  159:   my ($node) = @_;
  160:   if ($node->nodeType == XML_TEXT_NODE && $node->nodeValue !~ /^\s*$/) {
  161:     return 1;
  162:   }
  163:   if ($node->nodeType == XML_ELEMENT_NODE && string_in_array(\@html_elements, $node->nodeName)) {
  164:     if (contains_loncapa_block($node)) {
  165:       return 0;
  166:     } else {
  167:       return 1;
  168:     }
  169:   }
  170:   if ($node->nodeType == XML_ELEMENT_NODE && string_in_array(\@loncapa_inline, $node->nodeName)) {
  171:     return 1;
  172:   }
  173:   return 0;
  174: }
  175: 
  176: # Returns 1 if the outtext environment can continue with this node
  177: sub continue_outtext {
  178:   my ($node) = @_;
  179:   if (inside_outtext($node)) {
  180:     return 1;
  181:   }
  182:   if ($node->nodeType == XML_TEXT_NODE) {
  183:     return 1; # continue even if this is just spaces
  184:   }
  185:   return 0;
  186: }
  187: 
  188: # Returns 1 if the node contains a LON-CAPA block in a descendant.
  189: sub contains_loncapa_block {
  190:   my ($node) = @_;
  191:   foreach my $child ($node->childNodes) {
  192:     if ($child->nodeType == XML_ELEMENT_NODE) {
  193:       if (string_in_array(\@loncapa_block, $child->nodeName)) {
  194:         return 1;
  195:       }
  196:       if (contains_loncapa_block($child)) {
  197:         return 1;
  198:       }
  199:     }
  200:   }
  201:   return 0;
  202: }
  203: 
  204: sub add_endouttext {
  205:   my ($parent, $before_node) = @_;
  206:   my $doc = $parent->ownerDocument;
  207:   my $endouttext = $doc->createElement('endouttext');
  208:   my $before_before;
  209:   if (defined $before_node) {
  210:     $before_before = $before_node->previousSibling;
  211:   } else {
  212:     $before_before = $parent->lastChild;
  213:   }
  214:   if (defined $before_before && $before_before->nodeType == XML_TEXT_NODE) {
  215:     # split space at the end of the node before endouttext
  216:     if ($before_before->nodeValue =~ /^(.*?)(\s+)$/s) {
  217:       $before_before->setData($1);
  218:       my $space_node = $doc->createTextNode($2);
  219:       if (defined $before_node) {
  220:         $parent->insertBefore($space_node, $before_node);
  221:       } else {
  222:         $parent->appendChild($space_node);
  223:       }
  224:       $before_node = $space_node;
  225:     }
  226:   }
  227:   if (defined $before_node) {
  228:     $parent->insertBefore($endouttext, $before_node);
  229:   } else {
  230:     $parent->appendChild($endouttext);
  231:   }
  232: }
  233: 
  234: ##
  235: # Tests if a string is in an array (using eq) (to avoid Smartmatch warnings with $value ~~ @array)
  236: # @param {Array<string>} array - reference to the array of strings
  237: # @param {string} value - the string to look for
  238: # @returns 1 if found, 0 otherwise
  239: ##
  240: sub string_in_array {
  241:   my ($array, $value) = @_;
  242:   foreach my $v (@{$array}) {
  243:     if ($v eq $value) {
  244:       return 1;
  245:     }
  246:   }
  247:   return 0;
  248: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>