Annotation of modules/damieng/clean_xml/xml_to_loncapa.pl, revision 1.2

1.1       damieng     1: #!/usr/bin/perl
                      2: 
                      3: # This takes a well-formed XML file as input, and converts it to LON-CAPA syntax.
                      4: 
                      5: use strict;
                      6: use utf8;
                      7: use warnings;
                      8: 
                      9: use XML::LibXML;
                     10: 
1.2     ! damieng    11: 
        !            12: my @loncapa_block = ('parameter','location','answer','foil','image','polygon','rectangle','text','conceptgroup','itemgroup','item','label','data','function','array','unit','answergroup','functionplotresponse','functionplotruleset','functionplotelements','functionplotcustomrule','essayresponse','hintpart','formulahint','numericalhint','reactionhint','organichint','optionhint','radiobuttonhint','stringhint','customhint','mathhint','formulahintcondition','numericalhintcondition','reactionhintcondition','organichintcondition','optionhintcondition','radiobuttonhintcondition','stringhintcondition','customhintcondition','mathhintcondition','imageresponse','foilgroup','datasubmission','textfield','hiddensubmission','radiobuttonresponse','rankresponse','matchresponse','import','style','script','window','block','library','notsolved','part','postanswerdate','preduedate','problem','problemtype','randomlabel','bgimg','labelgroup','randomlist','solved','while','tex','print','web','gnuplot','curve','Task','IntroParagraph','ClosingParagraph','Question','QuestionText','Setup','Instance','InstanceText','Criteria','CriteriaText','GraderNote','languageblock','translated','lang','instructorcomment','dataresponse','togglebox','standalone','comment','drawimage','allow','displayduedate','displaytitle','responseparam','organicstructure','scriptlib','parserlib','drawoptionlist','spline','backgroundplot','plotobject','plotvector','drawvectorsum','functionplotrule','functionplotvectorrule','functionplotvectorsumrule','axis','key','xtics','ytics','title','xlabel','ylabel','hiddenline','dtm','stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse', 'hint', 'hintgroup');
        !            13: 
        !            14: my @loncapa_inline = ('display','m','lm','chem','num','parse','algebra','displayweight','displaystudentphoto'); # not textline
        !            15: 
        !            16: my @html_elements = ('html','meta','head','title','base','link','style','noscript','body','header','footer','aside','h1','h2','h3','h4','h5','h6','li','dd','dt','tbody','tr','caption','thead','tfoot','td','th','span','a','em','strong','b','i','sup','sub','pre','code','kbd','samp','cite','q','tt','ins','del','var','small','big','br','hr','address','blockquote','img','figure','figcaption','object','param','embed','applet','video','source','audio','map','area','canvas','form','label','input','select','optgroup','option','textarea','fieldset','legend','button','iframe','section','div','p','ul','ol','dl','table'); # without script
        !            17: 
        !            18: my @simple_data = ('polygon', 'rectangle', 'vector', 'value', 'answer', 'title', 'xlabel', 'ylabel', 'tic', 'parserlib', 'scriptlib', 'import', 'tex', 'text', 'image', 'display', 'm', 'lm', 'num', 'algebra', 'chem', 'parse', 'title', 'style', 'script', 'ins', 'del', 'label', 'option', 'textarea', 'legend' );
        !            19: 
        !            20: 
1.1       damieng    21: binmode(STDOUT, ':encoding(UTF-8)');
                     22: 
                     23: if (scalar(@ARGV) != 1) {
                     24:   print STDERR "Usage: perl xml_to_loncapa.pl file.xml\n";
                     25:   exit(1);
                     26: }
                     27: 
                     28: # find the command-line argument encoding
                     29: use I18N::Langinfo qw(langinfo CODESET);
                     30: my $codeset = langinfo(CODESET);
                     31: use Encode qw(decode);
                     32: @ARGV = map { decode $codeset, $_ } @ARGV;
                     33: 
                     34: my $pathname = "$ARGV[0]";
                     35: if (-f $pathname) {
                     36:   convert_file($pathname);
                     37: }
                     38: 
                     39: # Converts a file, creating a .loncapa file in the same directory.
                     40: # TODO: use the right extension based on content (or just ouput content)
                     41: sub convert_file {
                     42:   my ($pathname) = @_;
                     43: 
                     44:   # create a name for the new file
                     45:   my $newpath = $pathname.'.loncapa';
                     46: 
                     47:   print "converting $pathname...\n";
                     48:   
                     49:   my $dom_doc = XML::LibXML->load_xml(location => $pathname);
                     50:   
                     51:   open my $out, '>:encoding(UTF-8)', $newpath;
1.2     ! damieng    52:   add_outtext($dom_doc);
1.1       damieng    53:   print $out node_to_string($dom_doc);
                     54:   close $out;
                     55: }
                     56: 
                     57: sub node_to_string {
                     58:   my ($node) = @_;
                     59:   
                     60:   if ($node->nodeType == XML_DOCUMENT_NODE) {
                     61:     my $root = $node->documentElement();
                     62:     return node_to_string($root);
                     63:   } elsif ($node->nodeType == XML_TEXT_NODE || $node->nodeType == XML_CDATA_SECTION_NODE) {
                     64:     my $parent = $node->parentNode;
                     65:     my $parent_name = $parent->nodeName;
                     66:     my $grandparent_name;
                     67:     if (defined $parent->parentNode) {
                     68:       $grandparent_name = $parent->parentNode->nodeName;
                     69:     }
                     70:     my @no_escape = ('m', 'script', 'display', 'parse', 'answer');
                     71:     if (string_in_array(\@no_escape, $parent_name) &&
                     72:         ($parent_name ne 'answer' ||
                     73:         (defined $grandparent_name &&
                     74:         $grandparent_name ne 'numericalresponse' &&
                     75:         $grandparent_name ne 'formularesponse'))) {
                     76:       return $node->nodeValue;
                     77:     } else {
                     78:       return $node->toString();
                     79:     }
                     80:   } elsif ($node->nodeType == XML_ELEMENT_NODE) {
                     81:     my $s = '';
                     82:     my $tag = $node->nodeName;
                     83:     $s .= "<$tag";
                     84:     my @attributes = $node->attributes();
                     85:     foreach my $attribute (@attributes) {
                     86:       $s .= ' ';
                     87:       $s .= $attribute->nodeName;
                     88:       $s .= '="';
                     89:       $s .= escape($attribute->nodeValue);
                     90:       $s .= '"';
                     91:     }
                     92:     if ($node->hasChildNodes()) {
                     93:       $s .= '>';
                     94:       foreach my $child ($node->childNodes) {
                     95:         $s .= node_to_string($child);
                     96:       }
                     97:       $s .= "</$tag>";
                     98:     } else {
                     99:       $s .= '/>';
                    100:     }
                    101:     return $s;
                    102:   } else {
                    103:     return $node->toString();
                    104:   }
                    105: }
                    106: 
                    107: # Escapes a string for LON-CAPA output (used for text nodes, not attribute values)
                    108: sub escape {
                    109:   my ($s) = @_;
                    110:   $s =~ s/&/&amp;/sg;
                    111:   $s =~ s/</&lt;/sg;
                    112:   $s =~ s/>/&gt;/sg;
                    113:   # quot and apos do not need to be escaped outside attribute values
                    114:   return $s;
                    115: }
                    116: 
1.2     ! damieng   117: # Adds startouttext and endouttext where useful for the colorfull editor
        !           118: sub add_outtext {
        !           119:   my ($node) = @_;
        !           120:   
        !           121:   if ($node->nodeType == XML_DOCUMENT_NODE) {
        !           122:     my $root = $node->documentElement();
        !           123:     add_outtext($root);
        !           124:     return;
        !           125:   }
        !           126:   if ($node->nodeType != XML_ELEMENT_NODE) {
        !           127:     return;
        !           128:   }
        !           129:   if (string_in_array(\@simple_data, $node->nodeName)) {
        !           130:     return;
        !           131:   }
        !           132:   my $next;
        !           133:   my $in_outtext = 0;
        !           134:   for (my $child=$node->firstChild; defined $child; $child=$next) {
        !           135:     $next = $child->nextSibling;
        !           136:     if (!$in_outtext && inside_outtext($child)) {
        !           137:       # Add startouttext
        !           138:       my $doc = $node->ownerDocument;
        !           139:       my $startouttext = $doc->createElement('startouttext');
        !           140:       $node->insertBefore($startouttext, $child);
        !           141:       $in_outtext = 1;
        !           142:     } elsif ($in_outtext && !continue_outtext($child)) {
        !           143:       # Add endouttext
        !           144:       add_endouttext($node, $child);
        !           145:       $in_outtext = 0;
        !           146:     }
        !           147:     if (!$in_outtext) {
        !           148:       add_outtext($child);
        !           149:     }
        !           150:   }
        !           151:   if ($in_outtext) {
        !           152:     # Add endouttext
        !           153:     add_endouttext($node);
        !           154:   }
        !           155: }
        !           156: 
        !           157: # Returns 1 if this node should trigger the addition of startouttext before it
        !           158: sub inside_outtext {
        !           159:   my ($node) = @_;
        !           160:   if ($node->nodeType == XML_TEXT_NODE && $node->nodeValue !~ /^\s*$/) {
        !           161:     return 1;
        !           162:   }
        !           163:   if ($node->nodeType == XML_ELEMENT_NODE && string_in_array(\@html_elements, $node->nodeName)) {
        !           164:     if (contains_loncapa_block($node)) {
        !           165:       return 0;
        !           166:     } else {
        !           167:       return 1;
        !           168:     }
        !           169:   }
        !           170:   if ($node->nodeType == XML_ELEMENT_NODE && string_in_array(\@loncapa_inline, $node->nodeName)) {
        !           171:     return 1;
        !           172:   }
        !           173:   return 0;
        !           174: }
        !           175: 
        !           176: # Returns 1 if the outtext environment can continue with this node
        !           177: sub continue_outtext {
        !           178:   my ($node) = @_;
        !           179:   if (inside_outtext($node)) {
        !           180:     return 1;
        !           181:   }
        !           182:   if ($node->nodeType == XML_TEXT_NODE) {
        !           183:     return 1; # continue even if this is just spaces
        !           184:   }
        !           185:   return 0;
        !           186: }
        !           187: 
        !           188: # Returns 1 if the node contains a LON-CAPA block in a descendant.
        !           189: sub contains_loncapa_block {
        !           190:   my ($node) = @_;
        !           191:   foreach my $child ($node->childNodes) {
        !           192:     if ($child->nodeType == XML_ELEMENT_NODE) {
        !           193:       if (string_in_array(\@loncapa_block, $child->nodeName)) {
        !           194:         return 1;
        !           195:       }
        !           196:       if (contains_loncapa_block($child)) {
        !           197:         return 1;
        !           198:       }
        !           199:     }
        !           200:   }
        !           201:   return 0;
        !           202: }
        !           203: 
        !           204: sub add_endouttext {
        !           205:   my ($parent, $before_node) = @_;
        !           206:   my $doc = $parent->ownerDocument;
        !           207:   my $endouttext = $doc->createElement('endouttext');
        !           208:   my $before_before;
        !           209:   if (defined $before_node) {
        !           210:     $before_before = $before_node->previousSibling;
        !           211:   } else {
        !           212:     $before_before = $parent->lastChild;
        !           213:   }
        !           214:   if (defined $before_before && $before_before->nodeType == XML_TEXT_NODE) {
        !           215:     # split space at the end of the node before endouttext
        !           216:     if ($before_before->nodeValue =~ /^(.*?)(\s+)$/s) {
        !           217:       $before_before->setData($1);
        !           218:       my $space_node = $doc->createTextNode($2);
        !           219:       if (defined $before_node) {
        !           220:         $parent->insertBefore($space_node, $before_node);
        !           221:       } else {
        !           222:         $parent->appendChild($space_node);
        !           223:       }
        !           224:       $before_node = $space_node;
        !           225:     }
        !           226:   }
        !           227:   if (defined $before_node) {
        !           228:     $parent->insertBefore($endouttext, $before_node);
        !           229:   } else {
        !           230:     $parent->appendChild($endouttext);
        !           231:   }
        !           232: }
        !           233: 
1.1       damieng   234: ##
                    235: # Tests if a string is in an array (using eq) (to avoid Smartmatch warnings with $value ~~ @array)
                    236: # @param {Array<string>} array - reference to the array of strings
                    237: # @param {string} value - the string to look for
                    238: # @returns 1 if found, 0 otherwise
                    239: ##
                    240: sub string_in_array {
                    241:   my ($array, $value) = @_;
                    242:   foreach my $v (@{$array}) {
                    243:     if ($v eq $value) {
                    244:       return 1;
                    245:     }
                    246:   }
                    247:   return 0;
                    248: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>