File:  [LON-CAPA] / loncom / interface / lonhtmlgateway.pm
Revision 1.5: download - view: text, annotated - select for diffs
Mon May 24 23:47:22 2010 UTC (13 years, 11 months ago) by raeburn
Branches: MAIN
CVS tags: version_2_12_X, version_2_11_X, version_2_11_4_uiuc, version_2_11_4_msu, version_2_11_4, version_2_11_3_uiuc, version_2_11_3_msu, version_2_11_3, version_2_11_2_uiuc, version_2_11_2_msu, version_2_11_2_educog, version_2_11_2, version_2_11_1, version_2_11_0_RC3, version_2_11_0_RC2, version_2_11_0, language_hyphenation_merge, language_hyphenation, HEAD, BZ4492-merge, BZ4492-feature_horizontal_radioresponse
- 'tth' is sixth arg in lontexconvert::algebra().

    1: # The LearningOnline Network with CAPA
    2: # gateway for html input/output to be properly parsed and handled
    3: #
    4: # $Id: lonhtmlgateway.pm,v 1.5 2010/05/24 23:47:22 raeburn Exp $
    5: #
    6: # Copyright Michigan State University Board of Trustees
    7: #
    8: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
    9: #
   10: # LON-CAPA is free software; you can redistribute it and/or modify
   11: # it under the terms of the GNU General Public License as published by
   12: # the Free Software Foundation; either version 2 of the License, or
   13: # (at your option) any later version.
   14: #
   15: # LON-CAPA is distributed in the hope that it will be useful,
   16: # but WITHOUT ANY WARRANTY; without even the implied warranty of
   17: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   18: # GNU General Public License for more details.
   19: #
   20: # You should have received a copy of the GNU General Public License
   21: # along with LON-CAPA; if not, write to the Free Software
   22: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   23: #
   24: # /home/httpd/html/adm/gpl.txt
   25: #
   26: # http://www.lon-capa.org/
   27: #
   28: ######################################################################
   29: ######################################################################
   30: 
   31: =pod
   32: 
   33: =head1 NAME
   34: 
   35: Apache::lonhtmlgateway - properly parse and handle HTML input and output
   36: 
   37: =head1 SYNOPSIS
   38: 
   39: This is part of the LearningOnline Network with CAPA project
   40: described at http://www.lon-capa.org.
   41: 
   42: =head1 INTRODUCTION
   43: 
   44: lonhtmlgateway is an object-oriented module used to parse and correct
   45: malformed HTML input from the client, as well as to perform processing
   46: of custom LON-CAPA HTML output before it is sent along to the end-user.
   47: It replaces a number of subroutines in various modules, and adds new
   48: code to tidy and process malformed HTML using XML::LibXML. 
   49: 
   50: This module is intended to be used for all non-authoring perspectives
   51: in the system.
   52: 
   53: New to LON-CAPA version 3.0.
   54: 
   55: =head2 Example Usage
   56: 
   57: Below is intended code to be invoked and called for use outside 
   58: of this module:
   59: 
   60:     $gateway = Apache::lonhtmlgateway->new();
   61:     $gateway = Apache::lonhtmlgateway->new($target);
   62:     
   63:     $xhtml = $gateway->process_incoming_html($html);
   64:     $xhtml = $gateway->process_incoming_html($html, $legacy);
   65:     
   66:     $xml = $gateway->process_html_to_xml($html);
   67:     $xhtml = $gateway->process_xml_to_html($xml); 
   68:     
   69:     $bool = Apache::lonhtmlgateway->contains_block_level_tags($input);
   70:     
   71: =head1 GLOBAL VARIABLES
   72: 
   73: =over 4
   74: 
   75: =cut 
   76: 
   77: ######################################################################
   78: ######################################################################
   79: 
   80: package Apache::lonhtmlgateway;
   81: 
   82: use strict;
   83: use utf8;
   84: use Time::Local;
   85: use Time::HiRes;
   86: use Apache::lonlocal;
   87: use Apache::lonnet;
   88: use Apache::lonhtmlcommon;
   89: use Apache::lonxml;
   90: use Apache::lontexconvert;
   91: use lib '/home/httpd/lib/perl/';
   92: use LONCAPA;
   93: use XML::LibXML;
   94: use Encode;
   95: use HTML::Entities;
   96: use HTML::LCParser();
   97: use Safe();
   98: 
   99: local $XML::LibXML::skipXMLDeclaration = 1;
  100: local $XML::LibXML::skipDTD = 1;
  101: local $XML::LibXML::setTagCompression = 1;
  102: 
  103: ##############################################
  104: ##############################################
  105: 
  106: =item %LONCAPA_ALLOWED_STANDARD_TAGS
  107: 
  108: This is a hash of all tags, both HTML and custom LON-CAPA tags that
  109: are allowed in non-authoring spaces.  Examples of this include
  110: course documents, bulletin boards, discussion posts, templated pages,
  111: etc.  In addition, in the event of rich text editing, the WYSIWYG
  112: editor needs to know how to display LON-CAPA custom tags as either
  113: inline-level (<span>) or block-level (<div>). Therefore, the hash is
  114: set up with uppercase tag names as keys ("H1"), and the corresponding
  115: entry an integer constant indicating that tag's role or purpose:
  116: 
  117: =over 4
  118: 
  119: =item 0 =
  120: 
  121: Tag is explicitly not allowed.  Currently not used anywhere in this
  122: module, but reserved for the future in case certain tags would like
  123: to be explicitly blacklisted.
  124: 
  125: =item 1 =
  126: 
  127: Tag is allowed, and in cases where it is unclear, is rendered as an
  128: inline-level element.  Example: <algebra> should be rendered as an 
  129: inline element.
  130: 
  131: =item 2 =
  132: 
  133: Tag is allowed, and in cases where it is unclear, is rendered as a
  134: block-level element.  Example: <md> should be rendered as a block
  135: element.
  136: 
  137: =back
  138: 
  139: =back
  140: 
  141: =cut
  142: 
  143: ##############################################
  144: ##############################################
  145: 
  146: our %LONCAPA_ALLOWED_STANDARD_TAGS = (
  147:     # standard html header tags
  148:     H1 => 2, H2 => 2, H3 => 2, H4 => 2, H5 => 2, H6 => 2,
  149:     # basic inline formatting and phrases
  150:     B => 1, I => 1, U => 1, STRONG => 1, EM => 1, STRIKE => 1,
  151:     BIG => 1, SMALL => 1, INS => 1, DEL => 1, S => 1,
  152:     Q => 1, DFN => 1, CODE => 1, SAMP => 1, KBD => 1, VAR => 1,
  153:     SUB => 1, SUP => 1,
  154:     # linking and embedding
  155:     A => 1, IMG => 1, 
  156:     # block level tags
  157:     P => 2, DIV => 2, OL => 2, UL => 2, LI => 2, ADDRESS => 2,
  158:     BR => 2, HR => 2, BLOCKQUOTE => 2, PRE => 2, 
  159:     # table-related tags
  160:     TABLE => 2, CAPTION => 2, TBODY => 2, TR => 2, TD => 2,
  161:     TH => 2, 
  162:     # LON-CAPA custom tags
  163:     M => 1, MI => 1, MD => 2, ALGEBRA => 1,
  164:     CHEM => 1
  165: );
  166: 
  167: ##############################################
  168: ##############################################
  169: 
  170: =head1 PARSING LON-CAPA CUSTOM TAGS
  171: 
  172: This module maintains a hash %custom_tag_parsers, containing 
  173: lowercase tag names as keys and function references as entries.
  174: Convention used here names the actual parsing function whose
  175: reference is stored here to be of the name &parse_tagname_tag().
  176: These functions are called during the processing of outgoing 
  177: HTML output in the &process_outgoing_html() function.
  178: 
  179: Each of these functions is passed the following arguments:
  180: 
  181: =over 4
  182: 
  183: =item self
  184: 
  185: Reference to Apache::lonhtmlgateway object calling the function.
  186: 
  187: =item input
  188: 
  189: Textual context extracted between the <tag> and </tag> tags.
  190: Note that this text I<could> contain HTML entities.  Thus, for 
  191: functions that cannot handle entitized input, 
  192: &HTML::Entities::decode_entities() should be called on this data
  193: before further handing it off.
  194: 
  195: =back
  196: 
  197: Example hash entry:
  198: 
  199:     mi => \&parse_mi_tag,
  200: 
  201: =head2 Currently Supported Custom Tags
  202: 
  203: =over 4
  204: 
  205: =item <algebra>
  206: 
  207: Intended to convert and simplify simple algebraic functions into
  208: readable output.  Corrects cases such as double negatives or 
  209: eliminates coefficients of 1 where appropriate.  The actual
  210: handling of content contained in this tag takes place inside
  211: L<Apache::lontexconvert>, which in turn uses the AlgParser 
  212: module to actually process the input.
  213: 
  214: Usage:
  215:     <algebra>2*x+(-5)</algebra>
  216: 
  217: =item <chem>
  218: 
  219: Formatter for chemical equations, adding superscripts, subscripts,
  220: and arrow characters as appropriate.  This parser is
  221: wholly contained inside this module, but is a copy of a routine
  222: found in homework/default_homework.lcpm.
  223: 
  224: Usage:
  225:     <chem>CH3CO2H + H2O <=> CH3CO2- + H3O+</chem>
  226: 
  227: =back
  228: 
  229: =head3 Math Mode Tags
  230: 
  231: These tags are intended for LaTeX math mode input, in order to
  232: produce complex mathematical and scientific constructs, which
  233: normal HTML cannot produce.  The output is later rendered by
  234: a user-defined TeX engine in web target, or handled directly
  235: in the case of tex target.  The only difference between the tags
  236: below is determining the author's intent on how to appropriately
  237: render the contents within the tag - this intent is
  238: important in preserving the What You See Is What You Get philosophy
  239: of the rich text editor.
  240: 
  241: =over 4
  242: 
  243: =item <mi>
  244: 
  245: Inline math mode tag.  Content is wrapped in "\ensuremath{...}" and
  246: passed to the parser for the <m> tag.
  247: 
  248: I<New for LON-CAPA 3.0>.
  249: 
  250: =item <md>
  251: 
  252: Display block math mode tag.  Content is surrounded by "\[" and 
  253: "\]" characters and passed to the parser for the <m> tag.
  254: 
  255: I<New for LON-CAPA 3.0>.
  256: 
  257: =item <m>
  258: 
  259: Math mode tag.  Allows author to fully specify the display of their
  260: TeX input, and contain mixed inline-and-block content within a single
  261: tag.  
  262: 
  263: Due to tools such as the rich text editor needing to know whether a
  264: custom tag is block-level or inline-level on render, the use of this
  265: tag is discouraged starting with LON-CAPA 3.0 although it will continue
  266: to function.  Fully compatible with legacy LON-CAPA 2.x content.
  267: 
  268: =back
  269: 
  270: =cut
  271: 
  272: ##############################################
  273: ##############################################
  274: 
  275: my %custom_tag_parsers = (
  276:     mi => \&parse_mi_tag,
  277:     md => \&parse_md_tag,
  278:     m => \&parse_m_tag,
  279:     algebra => \&parse_algebra_tag,
  280:     chem => \&parse_chem_tag
  281: );
  282: 
  283: ##############################################
  284: ##############################################
  285: 
  286: =head1 CLASS OBJECT CONSTRUCTOR
  287: 
  288: =over 4
  289: 
  290: =item new
  291: 
  292:     $gateway = Apache::lonhtmlgateway->new();
  293:     $gateway = Apache::lonhtmlgateway->new($target);
  294: 
  295: Constructs and returns a new gateway object.  An optional argument
  296: allows one to specify the target of the output, defaults to 'web'.
  297: Behind the scenes, a single XML::LibXML parser object is created
  298: and stored in the object.  On destroy, this parser object is destroyed
  299: as well.
  300: 
  301: =back
  302: 
  303: =cut
  304: 
  305: ##############################################
  306: ##############################################
  307: 
  308: sub new {
  309:     my $invocant = shift;
  310:     my $class = ref($invocant) || $invocant;
  311:     my $target = shift;
  312:     # create a new parser instance for libxml
  313:     my $self = {
  314:         parser => XML::LibXML->new(),
  315:         target => ($target) ? $target : 'web'
  316:     };
  317:     # options for the libxml parser
  318:     $self->{parser}->recover(1);
  319:     $self->{parser}->recover_silently(1);
  320:     bless($self, $class);  # bless = pray that it works
  321:     return $self;
  322: }
  323: 
  324: sub DESTROY {
  325:     my $self = shift;
  326:     my $parser = $self->{parser};
  327:     undef $parser;  # destroy the parser instance
  328: }
  329: 
  330: ##############################################
  331: ##############################################
  332: 
  333: =head1 PUBLIC OBJECT METHODS
  334: 
  335: =over 4
  336: 
  337: =item process_html_to_xml
  338: 
  339:     $xml = $gateway->process_html_to_xml($html);
  340: 
  341: Takes presumably-malformed HTML, encodes ampersand characters
  342: and passes the result to the XML::LibXML parser, which creates
  343: a DOM tree in memory of the content.  This parse is as error-tolerant
  344: as can be set, and libxml attempts to recover from any errors as much
  345: as possible. This DOM tree is then taken and serialized,
  346: eliminating unbalanced and malformed tags along the way. This
  347: XML code (without any header tags) is then returned to the caller.
  348: 
  349: =cut
  350: 
  351: ##############################################
  352: ##############################################
  353: 
  354: sub process_html_to_xml {
  355:     my $self = shift;
  356:     my $input = shift;
  357:     my $parser = $self->{parser};
  358: 
  359:     if (length($input) < 1) { return ""; }
  360:     
  361:     # only encode ampersands -- brackets may be valid tags
  362:     my $encoded = &HTML::Entities::encode_entities($input, '&');
  363:     
  364:     # for the <chem> tag, we want the strings "<=>", "<-", "->" to be properly
  365:     # entitized so the parser doesn't destroy it
  366:     $encoded =~ s/(\<\s*chem\s*>.*)\<\=\>(.*\<\s*\/chem\s*>)/$1\&lt\;\&\#61\;\&gt\;$2/gi;
  367:     $encoded =~ s/(\<\s*chem\s*>.*)\-\>(.*\<\s*\/chem\s*>)/$1\-\&gt\;$2/gi;
  368:     $encoded =~ s/(\<\s*chem\s*>.*)\<\-(.*\<\s*\/chem\s*>)/$1\&lt\;\-$2/gi;
  369:     
  370:     # parse into libXML to tidy tags, we suppress any errors
  371:     # because otherwise the parser complains about non-HTML
  372:     # tags to STDERR and the Apache error logs
  373:     my $dom = $parser->parse_html_string($encoded,
  374:         {
  375:             suppress_errors => 1,
  376:             suppress_warnings => 1,
  377:             recover => 2
  378:         }
  379:     );
  380:     # the dom returns a full <html> structure, so just get
  381:     # all the child nodes of the <body> tag and put them together
  382:     my @body_nodes = $dom->findnodes('/html/body');
  383:     my @body_children = $body_nodes[0]->childNodes;
  384:     my $xml = "";
  385:     foreach my $child (@body_children) {
  386:         $xml .= $child->toString();
  387:     }
  388:     # entities passed into $input are in the form of '&amp;lt;'
  389:     # they are double entities
  390:     return $xml;    
  391: }
  392: 
  393: ##############################################
  394: ##############################################
  395: 
  396: =item process_xml_to_html
  397: 
  398:     $xhtml = $gateway->process_xml_to_html($xml);
  399: 
  400: Takes XML input, decodes one level of HTML entities,
  401: and returns the result to the caller.
  402: 
  403: =cut
  404: 
  405: ##############################################
  406: ##############################################
  407: 
  408: sub process_xml_to_html {
  409:     my $self = shift;
  410:     my $input = shift;
  411:     # decode one level of entities (XML) such that the
  412:     # output is returned to the original level of entities
  413:     # $input "&lt;" --> $xml "&amp;lt;" --> "&lt;"
  414:     my $xhtml = &HTML::Entities::decode_entities($input);
  415:     # now we have valid XHTML that can be stored and parsed
  416:     return $xhtml;
  417: }
  418: 
  419: ##############################################
  420: ##############################################
  421: 
  422: =item process_incoming_html
  423: 
  424:     $xhtml = $gateway->process_incoming_html($html);
  425:     $xhtml = $gateway->process_incoming_html($html, $legacy);
  426: 
  427: Designed to be called for all raw HTML inputs from the client
  428: side before storing or rendering data.  Decodes UTF-8 data,
  429: trims leading and trailing whitespace and one trailing "<br />" tag.  Processes
  430: the result through the XML parser, converts this back to
  431: balanced well-formed XHTML, re-encodes the result as UTF-8,
  432: and returns the result to the caller.
  433: 
  434: =over 4
  435: 
  436: =item legacy
  437: 
  438:     $legacy = 0;
  439:     $legacy = 1; 
  440: 
  441: I<(optional)> If true, adds additional processing intended
  442: to emulate LON-CAPA 2.x parsing of the content.
  443: 
  444: =back
  445: 
  446: =cut
  447: 
  448: ##############################################
  449: ##############################################
  450: 
  451: sub process_incoming_html {
  452:     # this should be called by all HTML inputs before storing
  453:     # data --> for consistency's sake, call process_html_to_xml
  454:     # afterwards if you need to embed this in XML later on
  455:     my $self = shift;
  456:     my $input = shift;
  457:     my $legacy = shift;
  458:     
  459:     # no idea why i have to call this to get unicode characters
  460:     # working, but i do, so here it is.
  461:     $input = &Encode::decode_utf8($input);
  462:     
  463:     # trim leading and trailing whitespace and HTML breaks
  464:     chomp($input);
  465:     $input =~ s/\s+$//s;
  466:     $input =~ s/^\s+//s;
  467:     $input =~ s/\<br\s*\/*\>$//s;
  468:     my $no_p_input = (length($input) > 0 && $input !~ m/.*\<[\s]*p[\s]*\>.*/is);
  469:     my $xml = $self->process_html_to_xml($input);
  470:     if ($legacy && !&contains_block_level_tags($input)) {
  471:         # the xml returns content inside a <p> tag
  472:         # if there are no block tags... thus to preserve
  473:         # old behavior, we strip out that <p></p>
  474:         if ($no_p_input) {
  475:             $xml =~ s/^\<p\>(.*)\<\/p\>/$1/si;
  476:         }
  477:     }
  478:     my $xhtml = $self->process_xml_to_html($xml);
  479:     # see above unicode encoding comment
  480:     $xhtml = &Encode::encode_utf8($xhtml);
  481:     return $xhtml;
  482: }
  483: 
  484: ##############################################
  485: ##############################################
  486: 
  487: =item process_outgoing_html
  488: 
  489:     $html = $gateway->process_outgoing_html($xhtml);
  490:     $html = $gateway->process_outgoing_html($xhtml, $legacy);
  491: 
  492: Designed to be called for all HTML outputs to the client
  493: side before rendering data.  This entitizes all non-allowed
  494: tags, as was previously done in Apache::lonfeedback, and
  495: processes and converts all LON-CAPA supported custom tags (see
  496: above) to their respective output HTML.
  497: 
  498: =over 4
  499: 
  500: =item legacy
  501: 
  502:     $legacy = 0;
  503:     $legacy = 1; 
  504: 
  505: I<(optional)> If true, adds additional processing intended
  506: to emulate LON-CAPA 2.x parsing of the content.  This includes
  507: behavior to convert "\n" to "<br />" if there are no block-level
  508: tags detected in the input.  In addition, raw URLs are converted
  509: automatically to <a> links.
  510: 
  511: =back
  512: 
  513: =back
  514: 
  515: =cut
  516: 
  517: ##############################################
  518: ##############################################
  519: 
  520: sub process_outgoing_html {
  521:     # this should be called on all HTML outputs before displaying
  522:     # because it will filter out all non-HTML+LONCAPA tags.
  523:     # tags are not filtered at input stage for greater backwards
  524:     # compatibility.  note that this disregards course preference.
  525:     my $self = shift;
  526:     my $input = shift;
  527:     my $legacy = shift;
  528:     
  529:     my %html = %Apache::lonhtmlgateway::LONCAPA_ALLOWED_STANDARD_TAGS;
  530:     # entitize all tags that are not explicitly allowed
  531:     $input =~ s/\<(\/?\s*(\w+)[^\>\<]*)/
  532:         {($html{uc($2)}&&(length($1)<1000))?"\<$1":"\&lt;$1"}/ge;
  533:     $input =~ s/(\<?\s*(\w+)[^\<\>]*)\>/
  534:         {($html{uc($2)}&&(length($1)<1000))?"$1\>":"$1\&gt;"}/ge;
  535:     if ($legacy) {
  536:         unless (&contains_block_level_tags($self, $input)) {
  537:             $input = $self->legacy_newline_to_br($input); 
  538:         }
  539:         $input = $self->legacy_raw_href_to_link($input);
  540:     }
  541:     # at this point, we need to convert our own custom tags
  542:     # into the appropriate output
  543:     # see above for supported tags
  544:     my $output = "";
  545:     my $parser = HTML::LCParser->new(\$input);
  546:     while (my $token = $parser->get_token()) {
  547:     	if ($token->[0] eq 'T') {
  548:             if ($self->{target} ne 'tex') {
  549:     	        $output .= &Apache::lontexconvert::smiley($token->[1]);
  550:     	    } else {
  551:                 my $t = $token->[1];
  552:                 $t =~ s/([^\n\r\t &<>!\#%\(-;=?-~])/num_entity($1)/ge;
  553:                 $output .= $t;
  554:             }
  555:         } elsif ($token->[0] eq 'D' || $token->[0] eq 'C') {
  556:     	    $output .= $token->[1];
  557:     	} elsif ($token->[0] eq 'PI' || $token->[0] eq 'E') {
  558:     	    $output .= $token->[2];
  559:     	} elsif ($token->[0] eq 'S') {
  560:     	    my $tag = lc($token->[1]);
  561:     	    if (exists($custom_tag_parsers{$tag})) {
  562:     	        my $text = $parser->get_text();
  563:     	        $output .= $custom_tag_parsers{$tag}(
  564:     	            $self, $text, $self->{target});
  565:     	    } else {
  566:     	        $output .= $token->[4];
  567:     	    }
  568:     	}
  569:     }
  570:     return $output;
  571: }
  572: 
  573: ##############################################
  574: ##############################################
  575: 
  576: =head1 STATIC CLASS METHODS
  577: 
  578: The following are static class methods that can be called
  579: by any object.
  580: 
  581: =over 4
  582: 
  583: =item contains_block_level_tags
  584: 
  585:     $bool = Apache::lonhtmlgateway->contains_block_level_tags($input);
  586:     
  587: Uses a regular expression to find, in the input data, any tags 
  588: described in %LONCAPA_ALLOWED_STANDARD_TAGS as block-level.
  589: Returns 1 if true, 0 if false. 
  590: 
  591: =cut
  592: 
  593: ##############################################
  594: ##############################################
  595: 
  596: sub contains_block_level_tags {
  597:     my $class = shift;
  598:     my $input = shift;
  599:     my @block_level_tags = @{&get_block_level_tags($class)};
  600:     foreach my $tag (@block_level_tags) {
  601:         if ($input =~ m/\<\/?\s*$tag[^\>\<]*/gi) {
  602:             # if your input loves this regular expression
  603:             # as much as i do, then return true.
  604:             # it searches for either a <tag> or <tag />
  605:             return 1;
  606:         }
  607:     }
  608:     return 0;
  609: }
  610: 
  611: ##############################################
  612: ##############################################
  613: 
  614: =item get_block_level_tags
  615: 
  616:     $tags = Apache::lonhtmlgateway->get_block_level_tags();
  617:     
  618: Returns a reference to an array of any tags described in
  619: %LONCAPA_ALLOWED_STANDARD_TAGS as block-level. Note that these
  620: tags are returned in no particular order, and the tag names
  621: are returned in uppercase.
  622: 
  623: 
  624: =cut
  625: 
  626: ##############################################
  627: ##############################################
  628: 
  629: sub get_block_level_tags {
  630:     my $class = shift;
  631:     my %html = %Apache::lonhtmlgateway::LONCAPA_ALLOWED_STANDARD_TAGS;
  632:     my @block = [];
  633:     foreach my $tag (keys(%html)) {
  634:         if ($html{$tag} == 2) {
  635:             push(@block, $tag);
  636:         }
  637:     }
  638:     return \@block;
  639: }
  640: 
  641: sub num_entity {
  642:     sprintf "&#x%X;", ord($_[0]);
  643: }
  644: 
  645: ##############################################
  646: ##############################################
  647: 
  648: =head2 Legacy Functions
  649: 
  650: These functions are intended to process input in the same or
  651: a similar way to how it was processed in LON-CAPA 2.x.
  652: 
  653: =item legacy_newline_to_br
  654: 
  655: I<(formerly Apache::lonfeedback::newline_to_br)>
  656: 
  657:     $converted = Apache::lonhtmlgateway->legacy_newline_to_br($input);
  658:     
  659: Parse the input using HTML::LCParser, and in any text nodes
  660: which contain "\n" characters, replace those characters with
  661: an HTML "<br />" tag.
  662: 
  663: =cut
  664: 
  665: ##############################################
  666: ##############################################
  667: 
  668: sub legacy_newline_to_br {
  669:     my $class = shift;
  670:     my $input = shift;
  671:     my $output;
  672:     my $parser = HTML::LCParser->new(\$input);
  673:     while (my $token = $parser->get_token()) {
  674:     	if ($token->[0] eq 'T') {
  675:     	    my $text = $token->[1];
  676:     	    $text =~ s/\n/\<br \/\>/g;
  677:     	    $output .= $text;
  678:     	} elsif ($token->[0] eq 'D' || $token->[0] eq 'C') {
  679:     	    $output .= $token->[1];
  680:     	} elsif ($token->[0] eq 'PI' || $token->[0] eq 'E') {
  681:     	    $output .= $token->[2];
  682:     	} elsif ($token->[0] eq 'S') {
  683:     	    $output .= $token->[4];
  684:     	}
  685:     }
  686:     return $output;
  687: }
  688: 
  689: ##############################################
  690: ##############################################
  691: 
  692: =item legacy_raw_href_to_link
  693: 
  694: I<(formerly Apache::lonhtmlcommon::raw_href_to_link)>
  695:     
  696:     $converted = Apache::lonhtmlgateway->legacy_raw_href_to_link($input);
  697:     
  698: Search for any links/URLs within the input text, and convert them
  699: to <a> tags whose content is embedded inside a <tt> tag.
  700: 
  701: =back
  702: 
  703: =cut
  704: 
  705: ##############################################
  706: ##############################################
  707: 
  708: sub legacy_raw_href_to_link {
  709:     my $class = shift;
  710:     my $input = shift;
  711:     $input =~ s/(https?\:\/\/[^\s\'\"\<]+)([\s\<]|$)/<a href="$1"><tt>$1<\/tt><\/a>$2/gi;
  712:     return $input;
  713: }
  714: 
  715: sub parse_algebra_tag {
  716:     my $self = shift;
  717:     my $input = shift;
  718:     # the <algebra> parser does NOT handle entities,
  719:     # unlike the general <m> parser; thus we run
  720:     # the content of this tag through HTML::Entities,
  721:     # decoding it first. we also just get the tex, and
  722:     # feed it through as if it were an <mi> tag.
  723:     $input = &HTML::Entities::decode($input);
  724:     my $algebra = 
  725:         &Apache::lontexconvert::algebra($input,'tex',undef,undef,undef,'tth');
  726:     return &parse_m_tag($self, $algebra);
  727: }
  728: 
  729: sub parse_mi_tag {
  730:     my $self = shift;
  731:     my $input = shift;
  732:     return &parse_m_tag($self, '\ensuremath{'.$input.'}');
  733: }
  734: 
  735: sub parse_md_tag {
  736:     my $self = shift;
  737:     my $input = shift;
  738:     return &parse_m_tag($self, '\['.$input.'\]');
  739: }
  740: 
  741: sub parse_m_tag {
  742:     my $self = shift;
  743:     my $input = shift;
  744:     if ($self->{target} ne 'tex') {
  745:         return &Apache::lontexconvert::to_convert($input, $self->{target});
  746:     } else {
  747:         return '<m>'.$input.'</m>';
  748:     }
  749: }
  750: 
  751: sub parse_chem_tag {
  752:     my $self = shift;
  753:     my $input = shift;
  754:     my $target = $self->{target};
  755:     # as with the <algebra> tag, some portions of the
  756:     # <chem> input may be coming in encoded, especially
  757:     # arrows -- so decode it in HTML::Entities
  758:     $input = &HTML::Entities::decode($input);
  759:     my @tokens = split(/(\s\+|\->|<=>|<\-|\.)/,$input);
  760:     my $formula = '';
  761:     foreach my $token (@tokens) {
  762:     	if ($token eq '->' ) {
  763:     	    if ($target eq 'web') {
  764:     	        $formula .= '&#8594; ';
  765:     	    } else {
  766:     	        $formula .= '<m>\ensuremath{\rightarrow}</m> ';
  767:     	    }
  768:     	    next;
  769:     	}
  770:     	if ($token eq '<-' ) {
  771:     	    if ($target eq 'web') {
  772:     	        $formula .= '&#8592; ';
  773:     	    } else {
  774:     	        $formula .= '<m>\ensuremath{\leftarrow}</m> ';
  775:     	    }
  776:     	    next;
  777:     	}  
  778:     	if ($token eq '<=>') {
  779:     	    if ($target eq 'web') {
  780:     		$formula .= '&#8652; ';
  781:     	    } else {
  782:     		$formula .= '<m>\ensuremath{\rightleftharpoons}</m> ';
  783:     	    }
  784:     	    next;
  785:     	}
  786:     	if ($token eq '.') {
  787:     	  $formula =~ s/(\&nbsp\;| )$//;
  788:     	  $formula .= '&middot;';
  789:     	  next;
  790:     	}
  791:     	$token =~ /^\s*([\d|\/]*(?:&frac\d\d)?)(.*)/;
  792:             $formula .= $1 if ($1 ne '1');  # stoichiometric coefficient
  793:     	my $molecule = $2;
  794:     	# subscripts
  795:     	$molecule =~ s|(?<=[a-zA-Z\)\]\s])(\d+)|<sub>$1</sub>|g;
  796:     	# superscripts
  797:     	$molecule =~ s|\^(\d*[+\-]*)|<sup>$1</sup>|g;
  798:     	# strip whitespace
  799:     	$molecule =~ s/\s*//g;
  800:     	# forced space
  801:     	$molecule =~ s/_/ /g;
  802:     	$molecule =~ s/-/&minus;/g;
  803:     	$formula .= $molecule.'&nbsp;';
  804:     }
  805:     # get rid of trailing space
  806:     $formula =~ s/(\&nbsp\;| )$//;
  807:     return $formula;
  808: }
  809: 
  810: ##############################################
  811: ##############################################
  812: 
  813: =head1 AUTHORS
  814: 
  815: Phil Fazio
  816: 
  817: =head1 VERSION
  818: 
  819: $Id: lonhtmlgateway.pm,v 1.5 2010/05/24 23:47:22 raeburn Exp $
  820: 
  821: =cut
  822: 
  823: ##############################################
  824: ##############################################

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>