# The LearningOnline Network with CAPA # gateway for html input/output to be properly parsed and handled # # $Id: lonhtmlgateway.pm,v 1.5 2010/05/24 23:47:22 raeburn Exp $ # # Copyright Michigan State University Board of Trustees # # This file is part of the LearningOnline Network with CAPA (LON-CAPA). # # LON-CAPA is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # LON-CAPA is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with LON-CAPA; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # /home/httpd/html/adm/gpl.txt # # http://www.lon-capa.org/ # ###################################################################### ###################################################################### =pod =head1 NAME Apache::lonhtmlgateway - properly parse and handle HTML input and output =head1 SYNOPSIS This is part of the LearningOnline Network with CAPA project described at http://www.lon-capa.org. =head1 INTRODUCTION lonhtmlgateway is an object-oriented module used to parse and correct malformed HTML input from the client, as well as to perform processing of custom LON-CAPA HTML output before it is sent along to the end-user. It replaces a number of subroutines in various modules, and adds new code to tidy and process malformed HTML using XML::LibXML. This module is intended to be used for all non-authoring perspectives in the system. New to LON-CAPA version 3.0. =head2 Example Usage Below is intended code to be invoked and called for use outside of this module: $gateway = Apache::lonhtmlgateway->new(); $gateway = Apache::lonhtmlgateway->new($target); $xhtml = $gateway->process_incoming_html($html); $xhtml = $gateway->process_incoming_html($html, $legacy); $xml = $gateway->process_html_to_xml($html); $xhtml = $gateway->process_xml_to_html($xml); $bool = Apache::lonhtmlgateway->contains_block_level_tags($input); =head1 GLOBAL VARIABLES =over 4 =cut ###################################################################### ###################################################################### package Apache::lonhtmlgateway; use strict; use utf8; use Time::Local; use Time::HiRes; use Apache::lonlocal; use Apache::lonnet; use Apache::lonhtmlcommon; use Apache::lonxml; use Apache::lontexconvert; use lib '/home/httpd/lib/perl/'; use LONCAPA; use XML::LibXML; use Encode; use HTML::Entities; use HTML::LCParser(); use Safe(); local $XML::LibXML::skipXMLDeclaration = 1; local $XML::LibXML::skipDTD = 1; local $XML::LibXML::setTagCompression = 1; ############################################## ############################################## =item %LONCAPA_ALLOWED_STANDARD_TAGS This is a hash of all tags, both HTML and custom LON-CAPA tags that are allowed in non-authoring spaces. Examples of this include course documents, bulletin boards, discussion posts, templated pages, etc. In addition, in the event of rich text editing, the WYSIWYG editor needs to know how to display LON-CAPA custom tags as either inline-level () or block-level (
). Therefore, the hash is set up with uppercase tag names as keys ("H1"), and the corresponding entry an integer constant indicating that tag's role or purpose: =over 4 =item 0 = Tag is explictly not allowed. Currently not used anywhere in this module, but reserved for the future in case certain tags would like to be explicitly blacklisted. =item 1 = Tag is allowed, and in cases where it is unclear, is rendered as an inline-level element. Example: should be rendered as an inline element. =item 2 = Tag is allowed, and in cases where it is unclear, is rendered as a block-level element. Example: should be rendered as a block element. =back =back =cut ############################################## ############################################## our %LONCAPA_ALLOWED_STANDARD_TAGS = ( # standard html header tags H1 => 2, H2 => 2, H3 => 2, H4 => 2, H5 => 2, H6 => 2, # basic inline formatting and phrases B => 1, I => 1, U => 1, STRONG => 1, EM => 1, STRIKE => 1, BIG => 1, SMALL => 1, INS => 1, DEL => 1, S => 1, Q => 1, DFN => 1, CODE => 1, SAMP => 1, KBD => 1, VAR => 1, SUB => 1, SUP => 1, # linking and embedding A => 1, IMG => 1, # block level tags P => 2, DIV => 2, OL => 2, UL => 2, LI => 2, ADDRESS => 2, BR => 2, HR => 2, BLOCKQUOTE => 2, PRE => 2, # table-related tags TABLE => 2, CAPTION => 2, TBODY => 2, TR => 2, TD => 2, TH => 2, # LON-CAPA custom tags M => 1, MI => 1, MD => 2, ALGEBRA => 1, CHEM => 1 ); ############################################## ############################################## =head1 PARSING LON-CAPA CUSTOM TAGS This module maintains a hash %custom_tag_parsers, containing lowercase tag names as keys and function references as entries. Convention used here names the actual parsing function whose reference is stored here to be of the name &parse_tagname_tag(). These functions are called during the processing of outgoing HTML output in the &process_outgoing_html() function. Each of these functions is passed the following arguments: =over 4 =item self Reference to Apache::lonhtmlgateway object calling the function. =item input Textual context extracted between the and tags. Note that this text I contain HTML entities. Thus, for functions that cannot handle entitized input, &HTML::Entities::decode_entities() should be called on this data before further handing it off. =back Example hash entry: mi => \&parse_mi_tag, =head2 Currently Supported Custom Tags =over 4 =item Intended to convert and simplify simple algebraic functions into readable output. Corrects cases such as double negatives or eliminates coefficients of 1 where appropriate. The actual handling of content contained in this tag takes place inside L, which in turn uses the AlgParser module to actually process the input. Usage: 2*x+(-5) =item Formatter for chemical equations, adding superscripts, subscripts, and appropriate arrow characters as appropriate. This parser is wholly contained inside this module, but is a copy of a routine found in homework/default_homework.lcpm. Usage: CH3CO2H + H2O <=> CH3CO2- + H3O+ =back =head3 Math Mode Tags These tags are intended for LaTeX math mode input, in order to produce complex mathematical and scientific constructs, which normal HTML cannot produce. The output is later rendered by a user-defined TeX engine in web target, or handled directly in the case of tex target. The only difference between the tags below is determining the author's intent on how to appropriately render the contents within the tag - this intent is important in preserving the What You See Is What You Get philosophy of the rich text editor. =over 4 =item Inline math mode tag. Content is surrounded by "$" characters and passed to the parser for the tag. I. =item Display block math mode tag. Content is surrounded by "\[" and "\]" characters and passed to the parser for the tag. I. =item Math mode tag. Allows author to fully specify the display of their TeX input, and contain mixed inline-and-block content within a single tag. Due to tools such as the rich text editor needing to know whether a custom tag is block-level or inline-level on render, the use of this tag is discouraged starting with LON-CAPA 3.0 although it will continue to function. Fully compatible with legacy LON-CAPA 2.x content. =back =cut ############################################## ############################################## my %custom_tag_parsers = ( mi => \&parse_mi_tag, md => \&parse_md_tag, m => \&parse_m_tag, algebra => \&parse_algebra_tag, chem => \&parse_chem_tag ); ############################################## ############################################## =head1 CLASS OBJECT CONSTRUCTOR =over 4 =item new $gateway = Apache::libhtmlgateway->new(); $gateway = Apache::libhtmlgateway->new($target); Constructs and returns a new gateway object. An optional argument allows one to specify the target of the output, defaults to 'web'. Behind the scenes, a single XML::LibXML parser object is created behind the scenes. On destroy, this parser object is destroyed as well. =back =cut ############################################## ############################################## sub new { my $invocant = shift; my $class = ref($invocant) || $invocant; my $target = shift; # create a new parser instance for libxml my $self = { parser => XML::LibXML->new(), target => ($target) ? $target : 'web' }; # options for the libxml parser $self->{parser}->recover(1); $self->{parser}->recover_silently(1); bless($self, $class); # bless = pray that it works return $self; } sub DESTROY { my $self = shift; my $parser = $self->{parser}; undef $parser; # destroy the parser instance } ############################################## ############################################## =head1 PUBLIC OBJECT METHODS =over 4 =item process_html_to_xml $xml = $gateway->process_html_to_xml($html); Takes presumably-malformed HTML, encodes ampersands characters and passes the result to the Xml::LibXML parser, which creates a DOM tree in memory of the content. This parse is as error-tolerant as can be set, and libxml attempts to recover from any errors as much as possible. This DOM tree is then taken and serialized, eliminating unbalanced and malformed tags along the way. This XML code (without any header tags) is then returned to the caller. =cut ############################################## ############################################## sub process_html_to_xml { my $self = shift; my $input = shift; my $parser = $self->{parser}; if (length($input) < 1) { return ""; } # only encode ampersands -- brackets may be valid tags my $encoded = &HTML::Entities::encode_entities($input, '&'); # for the tag, we want the strings "<=>", "<-", "->" to be properly # entitized so the parser doesn't destroy it $encoded =~ s/(\<\s*chem\s*>.*)\<\=\>(.*\<\s*\/chem\s*>)/$1\<\;\&\#61\;\>\;$2/gi; $encoded =~ s/(\<\s*chem\s*>.*)\-\>(.*\<\s*\/chem\s*>)/$1\-\>\;$2/gi; $encoded =~ s/(\<\s*chem\s*>.*)\<\-(.*\<\s*\/chem\s*>)/$1\<\;\-$2/gi; # parse into libXML to tidy tags, we suppress any errors # because otherwise the parser complains about non-HTML # tags to STDERR and the Apache error logs my $dom = $parser->parse_html_string($encoded, { suppress_errors => 1, suppress_warnings => 1, recover => 2 } ); # the dom returns a full structure, so just get # all the child nodes of the tag and put them together my @body_nodes = $dom->findnodes('/html/body'); my @body_children = $body_nodes[0]->childNodes; my $xml = ""; foreach my $child (@body_children) { $xml .= $child->toString(); } # entities passed into $input are in the form of '&lt;' # they are double entities return $xml; } ############################################## ############################################## =item process_xml_to_html $xhtml = $gateway->process_xml_to_html($xml); Takes XML input, decodes ampersands characters and passes the result then to the caller. =cut ############################################## ############################################## sub process_xml_to_html { my $self = shift; my $input = shift; # decode one level of entities (XML) such that the # output is returned to the original level of entities # $input "<" --> $xml "&lt;" --> "<" my $xhtml = &HTML::Entities::decode_entities($input); # now we have valid XHTML that can be stored and parsed return $xhtml; } ############################################## ############################################## =item process_incoming_html $xhtml = $gateway->process_incoming_html($html); $xhtml = $gateway->process_incoming_html($html, $legacy); Designed to be called for all raw HTML inputs from the client side before storing or rendering data. Decodes UTF-8 data, trims leading and trailing "\n" and "
" tags. Processes the result through the XML parser, converts this back to balanced well-formed XHTML, re-encodes the result as UTF-8, and returns the result to the caller. =over 4 =item legacy $legacy = 0; $legacy = 1; I<(optional)> If true, adds additional processing intended to emulate LON-CAPA 2.x parsing of the content. =back =cut ############################################## ############################################## sub process_incoming_html { # this should be called by all HTML inputs before storing # data --> for consistency's sake, call process_html_to_xml # afterwards if you need to embed this in XML later on my $self = shift; my $input = shift; my $legacy = shift; # no idea why i have to call this to get unicode characters # working, but i do, so here it is. $input = &Encode::decode_utf8($input); # trim leading and trailing whitespace and HTML breaks chomp($input); $input =~ s/\s+$//s; $input =~ s/^\s+//s; $input =~ s/\$//s; my $no_p_input = (length($input) > 0 && $input !~ m/.*\<[\s]*p[\s]*\>.*/is); my $xml = $self->process_html_to_xml($input); if ($legacy && !&contains_block_level_tags($input)) { # the xml returns content inside a

tag # if there are no block tags... thus to preserve # old behavior, we strip out that

if ($no_p_input) { $xml =~ s/^\(.*)\<\/p\>/$1/si; } } my $xhtml = $self->process_xml_to_html($xml); # see above unicode encoding comment $xhtml = &Encode::encode_utf8($xhtml); return $xhtml; } ############################################## ############################################## =item process_outgoing_html $html = $gateway->process_outgoing_html($xhtml); $html = $gateway->process_outgoing_html($xhtml, $legacy); Designed to be called for all HTML outputs to the client side before rendering data. This entitizes all non-allowed tags, as was previously done in Apache::lonfeedback, and processes and converts all LON-CAPA supported custom tags (see above) to their respective output HTML. =over 4 =item legacy $legacy = 0; $legacy = 1; I<(optional)> If true, adds additional processing intended to emulate LON-CAPA 2.x parsing of the content. This includes behavior to convert "\n" to "
" if there are no block-level tags detected in the input. In addition, raw URLs are converted automatically to links. =back =back =cut ############################################## ############################################## sub process_outgoing_html { # this should be called on all HTML outputs before displaying # because it will filter out all non-HTML+LONCAPA tags. # tags are not filtered at input stage for greater backwards # compatibility. note that this disregards course preference. my $self = shift; my $input = shift; my $legacy = shift; my %html = %Apache::lonhtmlgateway::LONCAPA_ALLOWED_STANDARD_TAGS; # entitize all tags that are not explicitly allowed $input =~ s/\<(\/?\s*(\w+)[^\>\<]*)/ {($html{uc($2)}&&(length($1)<1000))?"\<$1":"\<$1"}/ge; $input =~ s/(\]*)\>/ {($html{uc($2)}&&(length($1)<1000))?"$1\>":"$1\>"}/ge; if ($legacy) { unless (&contains_block_level_tags($self, $input)) { $input = $self->legacy_newline_to_br($input); } $input = $self->legacy_raw_href_to_link($input); } # at this point, we need to convert our own custom tags # into the appropriate output # see above for supported tags my $output = ""; my $parser = HTML::LCParser->new(\$input); while (my $token = $parser->get_token()) { if ($token->[0] eq 'T') { if ($self->{target} ne 'tex') { $output .= &Apache::lontexconvert::smiley($token->[1]); } else { my $t = $token->[1]; $t =~ s/([^\n\r\t &<>!\#%\(-;=?-~])/num_entity($1)/ge; $output .= $t; } } elsif ($token->[0] eq 'D' || $token->[0] eq 'C') { $output .= $token->[1]; } elsif ($token->[0] eq 'PI' || $token->[0] eq 'E') { $output .= $token->[2]; } elsif ($token->[0] eq 'S') { my $tag = lc($token->[1]); if (exists($custom_tag_parsers{$tag})) { my $text = $parser->get_text(); $output .= $custom_tag_parsers{$tag}( $self, $text, $self->{target}); } else { $output .= $token->[4]; } } } return $output; } ############################################## ############################################## =head1 STATIC CLASS METHODS The following are static class methods that can be called by any object. =over 4 =item contains_block_level_tags $bool = Apache::lonhtmlgateway::contains_block_level_tags($input); Uses a regular expression to find, in the input data, any tags described in %LONCAPA_ALLOWED_STANDARD_TAGS as block-level. Returns 1 if true, 0 if false. =cut ############################################## ############################################## sub contains_block_level_tags { my $class = shift; my $input = shift; my @block_level_tags = @{&get_block_level_tags($class)}; foreach my $tag (@block_level_tags) { if ($input =~ m/\<\/?\s*$tag[^\>\<]*/gi) { # if your input loves this regular expression # as much as i do, then return true. # it searches for either a or return 1; } } return 0; } ############################################## ############################################## =item get_block_level_tags @tags = Apache::lonhtmlgateway::get_block_level_tags(); Return an array with any tags described in %LONCAPA_ALLOWED_STANDARD_TAGS as block-level. Note that these tags are returned in no particular order, and the tag names are returned in uppercase. =cut ############################################## ############################################## sub get_block_level_tags { my $class = shift; my %html = %Apache::lonhtmlgateway::LONCAPA_ALLOWED_STANDARD_TAGS; my @block = []; foreach my $tag (keys(%html)) { if ($html{$tag} == 2) { push(@block, $tag); } } return \@block; } sub num_entity { sprintf "&#x%X;", ord($_[0]); } ############################################## ############################################## =head2 Legacy Functions These functions are intended to process input in the same or a similar way to how it was processed in LON-CAPA 2.x. =item legacy_newline_to_br I<(formerly Apache::lonfeedback::newline_to_br)> $converted = Apache::lonhtmlgateway::legacy_newline_to_br($input); Parse the input using HTML::LCParser, and in any text nodes which contain "\n" characters, replace those characters with an HTML "
" tag. =cut ############################################## ############################################## sub legacy_newline_to_br { my $class = shift; my $input = shift; my $output; my $parser = HTML::LCParser->new(\$input); while (my $token = $parser->get_token()) { if ($token->[0] eq 'T') { my $text = $token->[1]; $text =~ s/\n/\
/g; $output .= $text; } elsif ($token->[0] eq 'D' || $token->[0] eq 'C') { $output .= $token->[1]; } elsif ($token->[0] eq 'PI' || $token->[0] eq 'E') { $output .= $token->[2]; } elsif ($token->[0] eq 'S') { $output .= $token->[4]; } } return $output; } ############################################## ############################################## =item legacy_raw_href_to_link I<(formerly Apache::lonhtmlcommon::raw_href_to_link)> $converted = Apache::lonhtmlgateway::legacy_raw_href_to_link($input); Search for any links/URLs within the input text, and convert them to
tags whose content is embedded inside a tag. =back =cut ############################################## ############################################## sub legacy_raw_href_to_link { my $class = shift; my $input = shift; $input =~ s/(https?\:\/\/[^\s\'\"\<]+)([\s\<]|$)/$1<\/tt><\/a>$2/gi; return $input; } sub parse_algebra_tag { my $self = shift; my $input = shift; # the parser does NOT handle entities, # unlike the general parser; thus we run # the content of this tag through HTML::Entities, # decoding it first. we also just get the tex, and # feed it through as if it were an tag. $input = &HTML::Entities::decode($input); my $algebra = &Apache::lontexconvert::algebra($input,'tex',undef,undef,undef,'tth'); return &parse_m_tag($self, $algebra); } sub parse_mi_tag { my $self = shift; my $input = shift; return &parse_m_tag($self, '\ensuremath{'.$input.'}'); } sub parse_md_tag { my $self = shift; my $input = shift; return &parse_m_tag($self, '\['.$input.'\]'); } sub parse_m_tag { my $self = shift; my $input = shift; if ($self->{target} ne 'tex') { return &Apache::lontexconvert::to_convert($input, $self->{target}); } else { return ''.$input.''; } } sub parse_chem_tag { my $self = shift; my $input = shift; my $target = $self->{target}; # as with the tag, some portions of the # input may be coming in encoded, especially # arrows -- so decode it in HTML::Entities $input = &HTML::Entities::decode($input); my @tokens = split(/(\s\+|\->|<=>|<\-|\.)/,$input); my $formula = ''; foreach my $token (@tokens) { if ($token eq '->' ) { if ($target eq 'web') { $formula .= '→ '; } else { $formula .= '\ensuremath{\rightarrow} '; } next; } if ($token eq '<-' ) { if ($target eq 'web') { $formula .= '← '; } else { $formula .= '\ensuremath{\leftarrow} '; } next; } if ($token eq '<=>') { if ($target eq 'web') { $formula .= '⇌ '; } else { $formula .= '\ensuremath{\rightleftharpoons} '; } next; } if ($token eq '.') { $formula =~ s/(\ \;| )$//; $formula .= '·'; next; } $token =~ /^\s*([\d|\/]*(?:&frac\d\d)?)(.*)/; $formula .= $1 if ($1 ne '1'); # stoichiometric coefficient my $molecule = $2; # subscripts $molecule =~ s|(?<=[a-zA-Z\)\]\s])(\d+)|$1|g; # superscripts $molecule =~ s|\^(\d*[+\-]*)|$1|g; # strip whitespace $molecule =~ s/\s*//g; # forced space $molecule =~ s/_/ /g; $molecule =~ s/-/−/g; $formula .= $molecule.' '; } # get rid of trailing space $formula =~ s/(\ \;| )$//; return $formula; } ############################################## ############################################## =head1 AUTHORS Phil Fazio =head1 VERSION $Id: lonhtmlgateway.pm,v 1.5 2010/05/24 23:47:22 raeburn Exp $ =cut ############################################## ##############################################