--- loncom/homework/cleanxml/xml_to_loncapa.pm 2015/12/23 20:33:10 1.3 +++ loncom/homework/cleanxml/xml_to_loncapa.pm 2016/01/20 00:41:31 1.7 @@ -1,7 +1,7 @@ # The LearningOnline Network # convert_file takes a well-formed XML file content and converts it to LON-CAPA syntax. # -# $Id: xml_to_loncapa.pm,v 1.3 2015/12/23 20:33:10 damieng Exp $ +# $Id: xml_to_loncapa.pm,v 1.7 2016/01/20 00:41:31 damieng Exp $ # # Copyright Michigan State University Board of Trustees # @@ -49,13 +49,21 @@ my @simple_data = ('polygon', 'rectangle my @inline_responses = ('stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse'); +# see http://www.w3.org/TR/html-polyglot/#empty-elements +# and http://tiffanybbrown.com/2011/03/23/html5-does-not-allow-self-closing-tags/ +# HTML elements that do not have an empty content, and must never use a self-closing tag: +my @non_empty_html = ('title','style','script','noscript','body','section','header','footer','article','aside','nav','h1','h2','h3','h4','h5','h6','div','p','li','dt','dd','caption','td','th','span','a','em','strong','b','i','sup','sub','pre','code','kbd','samp','cite','q','tt','ins','del','var','small','big','address','blockquote','bdo','ruby','rb','rp','rt','rtc','figure','figcaption','object','applet','video','audio','canvas','label','option','textarea','fieldset','legend','button','iframe'); + # Converts a file and return the modified contents sub convert_file { my ($contents) = @_; my $dom_doc = XML::LibXML->load_xml(string => $contents); - add_outtext($dom_doc); + my $root = $dom_doc->documentElement(); + if (defined $root && $root->nodeName ne 'html') { + add_outtext($dom_doc); + } return node_to_string($dom_doc); } @@ -73,7 +81,7 @@ sub node_to_string { if (defined $parent->parentNode) { $grandparent_name = $parent->parentNode->nodeName; } - my @no_escape = ('m', 'script', 'display', 'parse', 'answer'); + my @no_escape = ('m', 'script', 'style', 'display', 'parse', 'answer'); if (string_in_array(\@no_escape, $parent_name) && ($parent_name ne 'answer' || (defined $grandparent_name && @@ -95,7 +103,7 @@ sub node_to_string { $s .= escape_attribute($attribute->nodeValue); $s .= '"'; } - if ($node->hasChildNodes() || $tag eq 'script') { + if ($node->hasChildNodes() || string_in_array(\@non_empty_html, $tag)) { $s .= '>'; foreach my $child ($node->childNodes) { $s .= node_to_string($child); @@ -247,6 +255,21 @@ sub add_endouttext { } else { $parent->appendChild($endouttext); } + # replace spaces afterwards by a \n + indentation + my $next = $endouttext->nextSibling; + if (defined $next && $next->nodeType == XML_TEXT_NODE) { + my $v = $next->nodeValue; + if ($v =~ /^ /) { + $v =~ s/^ +//; + if ($parent->firstChild->nodeType == XML_TEXT_NODE && + $parent->firstChild->nodeValue =~ /^\n +$/) { + $v = $parent->firstChild->nodeValue.$v; + } else { + $v = "\n".$v; + } + $next->setData($v); + } + } } # Convert paragraph children when one contains an inline response into content +