loncom/interface/lonhtmlgateway.pm - view

File: [LON-CAPA] / loncom / interface / lonhtmlgateway.pm
Revision 1.5: download - view: text, annotated - select for diffs
Mon May 24 23:47:22 2010 UTC (13 years, 11 months ago) by raeburn
Branches: MAIN
CVS tags: version_2_12_X, version_2_11_X, version_2_11_4_uiuc, version_2_11_4_msu, version_2_11_4, version_2_11_3_uiuc, version_2_11_3_msu, version_2_11_3, version_2_11_2_uiuc, version_2_11_2_msu, version_2_11_2_educog, version_2_11_2, version_2_11_1, version_2_11_0_RC3, version_2_11_0_RC2, version_2_11_0, language_hyphenation_merge, language_hyphenation, HEAD, BZ4492-merge, BZ4492-feature_horizontal_radioresponse

- 'tth' is sixth arg in lontexconvert::algebra().

1: # The LearningOnline Network with CAPA 2: # gateway for html input/output to be properly parsed and handled 3: # 4: # $Id: lonhtmlgateway.pm,v 1.5 2010/05/24 23:47:22 raeburn Exp $ 5: # 6: # Copyright Michigan State University Board of Trustees 7: # 8: # This file is part of the LearningOnline Network with CAPA (LON-CAPA). 9: # 10: # LON-CAPA is free software; you can redistribute it and/or modify 11: # it under the terms of the GNU General Public License as published by 12: # the Free Software Foundation; either version 2 of the License, or 13: # (at your option) any later version. 14: # 15: # LON-CAPA is distributed in the hope that it will be useful, 16: # but WITHOUT ANY WARRANTY; without even the implied warranty of 17: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18: # GNU General Public License for more details. 19: # 20: # You should have received a copy of the GNU General Public License 21: # along with LON-CAPA; if not, write to the Free Software 22: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 23: # 24: # /home/httpd/html/adm/gpl.txt 25: # 26: # http://www.lon-capa.org/ 27: # 28: ###################################################################### 29: ###################################################################### 30: 31: =pod 32: 33: =head1 NAME 34: 35: Apache::lonhtmlgateway - properly parse and handle HTML input and output 36: 37: =head1 SYNOPSIS 38: 39: This is part of the LearningOnline Network with CAPA project 40: described at http://www.lon-capa.org. 41: 42: =head1 INTRODUCTION 43: 44: lonhtmlgateway is an object-oriented module used to parse and correct 45: malformed HTML input from the client, as well as to perform processing 46: of custom LON-CAPA HTML output before it is sent along to the end-user. 47: It replaces a number of subroutines in various modules, and adds new 48: code to tidy and process malformed HTML using XML::LibXML. 
49: 50: This module is intended to be used for all non-authoring perspectives 51: in the system. 52: 53: New to LON-CAPA version 3.0. 54: 55: =head2 Example Usage 56: 57: Below is intended code to be invoked and called for use outside 58: of this module: 59: 60: $gateway = Apache::lonhtmlgateway->new(); 61: $gateway = Apache::lonhtmlgateway->new($target); 62: 63: $xhtml = $gateway->process_incoming_html($html); 64: $xhtml = $gateway->process_incoming_html($html, $legacy); 65: 66: $xml = $gateway->process_html_to_xml($html); 67: $xhtml = $gateway->process_xml_to_html($xml); 68: 69: $bool = Apache::lonhtmlgateway->contains_block_level_tags($input); 70: 71: =head1 GLOBAL VARIABLES 72: 73: =over 4 74: 75: =cut 76: 77: ###################################################################### 78: ###################################################################### 79: 80: package Apache::lonhtmlgateway; 81: 82: use strict; 83: use utf8; 84: use Time::Local; 85: use Time::HiRes; 86: use Apache::lonlocal; 87: use Apache::lonnet; 88: use Apache::lonhtmlcommon; 89: use Apache::lonxml; 90: use Apache::lontexconvert; 91: use lib '/home/httpd/lib/perl/'; 92: use LONCAPA; 93: use XML::LibXML; 94: use Encode; 95: use HTML::Entities; 96: use HTML::LCParser(); 97: use Safe(); 98: 99: local $XML::LibXML::skipXMLDeclaration = 1; 100: local $XML::LibXML::skipDTD = 1; 101: local $XML::LibXML::setTagCompression = 1; 102: 103: ############################################## 104: ############################################## 105: 106: =item %LONCAPA_ALLOWED_STANDARD_TAGS 107: 108: This is a hash of all tags, both HTML and custom LON-CAPA tags that 109: are allowed in non-authoring spaces. Examples of this include 110: course documents, bulletin boards, discussion posts, templated pages, 111: etc. In addition, in the event of rich text editing, the WYSIWYG 112: editor needs to know how to display LON-CAPA custom tags as either 113: inline-level () or block-level (<div>). 
=pod

Therefore, the hash is
set up with uppercase tag names as keys ("H1"), and the corresponding
entry an integer constant indicating that tag's role or purpose:

=over 4

=item 0 =

Tag is explicitly not allowed.  Currently not used anywhere in this
module, but reserved for the future in case certain tags would like
to be explicitly blacklisted.

=item 1 =

Tag is allowed, and in cases where it is unclear, is rendered as an
inline-level element.  Example: <algebra> should be rendered as an
inline element.

=item 2 =

Tag is allowed, and in cases where it is unclear, is rendered as a
block-level element.  Example: <md> should be rendered as a block
element.

=back

=back

=cut

##############################################
##############################################

# Keys are uppercase tag names; values are the role codes described in
# the POD above (1 = inline-level, 2 = block-level).  No entry uses 0.
our %LONCAPA_ALLOWED_STANDARD_TAGS = (
    # inline-level: basic formatting and phrases, linking and embedding,
    # and the inline LON-CAPA custom tags
    ( map { ( $_ => 1 ) } qw(
        B I U STRONG EM STRIKE
        BIG SMALL INS DEL S
        Q DFN CODE SAMP KBD VAR
        SUB SUP
        A IMG
        M MI ALGEBRA CHEM
    ) ),
    # block-level: headers, block and table-related tags, and the
    # display-math LON-CAPA custom tag
    ( map { ( $_ => 2 ) } qw(
        H1 H2 H3 H4 H5 H6
        P DIV OL UL LI ADDRESS
        BR HR BLOCKQUOTE PRE
        TABLE CAPTION TBODY TR TD TH
        MD
    ) ),
);

##############################################
##############################################

=head1 PARSING LON-CAPA CUSTOM TAGS

This module maintains a hash %custom_tag_parsers, containing
lowercase tag names as keys and function references as entries.
174: Convention used here names the actual parsing function whose 175: reference is stored here to be of the name &parse_tagname_tag(). 176: These functions are called during the processing of outgoing 177: HTML output in the &process_outgoing_html() function. 178: 179: Each of these functions is passed the following arguments: 180: 181: =over 4 182: 183: =item self 184: 185: Reference to Apache::lonhtmlgateway object calling the function. 186: 187: =item input 188: 189: Textual context extracted between the <tag> and </tag> tags. 190: Note that this text I<could> contain HTML entities. Thus, for 191: functions that cannot handle entitized input, 192: &HTML::Entities::decode_entities() should be called on this data 193: before further handing it off. 194: 195: =back 196: 197: Example hash entry: 198: 199: mi => \&parse_mi_tag, 200: 201: =head2 Currently Supported Custom Tags 202: 203: =over 4 204: 205: =item <algebra> 206: 207: Intended to convert and simplify simple algebraic functions into 208: readable output. Corrects cases such as double negatives or 209: eliminates coefficients of 1 where appropriate. The actual 210: handling of content contained in this tag takes place inside 211: L<Apache::lontexconvert>, which in turn uses the AlgParser 212: module to actually process the input. 213: 214: Usage: 215: <algebra>2*x+(-5)</algebra> 216: 217: =item <chem> 218: 219: Formatter for chemical equations, adding superscripts, subscripts, 220: and appropriate arrow characters as appropriate. This parser is 221: wholly contained inside this module, but is a copy of a routine 222: found in homework/default_homework.lcpm. 223: 224: Usage: 225: <chem>CH3CO2H + H2O <=> CH3CO2- + H3O+</chem> 226: 227: =back 228: 229: =head3 Math Mode Tags 230: 231: These tags are intended for LaTeX math mode input, in order to 232: produce complex mathematical and scientific constructs, which 233: normal HTML cannot produce. 
=pod

The output is later rendered by
a user-defined TeX engine in web target, or handled directly
in the case of tex target.  The only difference between the tags
below is determining the author's intent on how to appropriately
render the contents within the tag - this intent is
important in preserving the What You See Is What You Get philosophy
of the rich text editor.

=over 4

=item <mi>

Inline math mode tag.  Content is wrapped in "\ensuremath{" and "}"
and passed to the parser for the <m> tag.

I<New for LON-CAPA 3.0>.

=item <md>

Display block math mode tag.  Content is surrounded by "\[" and
"\]" characters and passed to the parser for the <m> tag.

I<New for LON-CAPA 3.0>.

=item <m>

Math mode tag.  Allows author to fully specify the display of their
TeX input, and contain mixed inline-and-block content within a single
tag.

Due to tools such as the rich text editor needing to know whether a
custom tag is block-level or inline-level on render, the use of this
tag is discouraged starting with LON-CAPA 3.0 although it will continue
to function.  Fully compatible with legacy LON-CAPA 2.x content.

=back

=cut

##############################################
##############################################

# Dispatch table consulted by process_outgoing_html(): lowercase custom
# tag name => handler that converts the text following the opening tag.
my %custom_tag_parsers = (
    'algebra' => \&parse_algebra_tag,
    'chem'    => \&parse_chem_tag,
    'm'       => \&parse_m_tag,
    'md'      => \&parse_md_tag,
    'mi'      => \&parse_mi_tag,
);

##############################################
##############################################

=head1 CLASS OBJECT CONSTRUCTOR

=over 4

=item new

    $gateway = Apache::lonhtmlgateway->new();
    $gateway = Apache::lonhtmlgateway->new($target);

Constructs and returns a new gateway object.
=pod

An optional argument
allows one to specify the target of the output, defaults to 'web'.
Behind the scenes, a single XML::LibXML parser object is created
and stored in the gateway object.  On destroy, the gateway's
reference to this parser object is released as well.

=back

=cut

##############################################
##############################################

# Constructor.
#   $invocant - class name, or an existing object whose class is reused
#   $target   - optional output target; any false value selects 'web'
# Returns the blessed gateway object holding one XML::LibXML parser
# with recover() and recover_silently() both switched on.
sub new {
    my ($invocant, $target) = @_;
    my $class = ref($invocant) || $invocant;
    # create a new parser instance for libxml
    my $self = {
        parser => XML::LibXML->new(),
        target => ($target) ? $target : 'web',
    };
    # options for the libxml parser
    $self->{parser}->recover(1);
    $self->{parser}->recover_silently(1);
    return bless($self, $class);
}

# Destructor: drops the object's own reference to the parser.  The
# previous version undef'd a lexical copy of the reference, which left
# the reference stored in the object untouched.
sub DESTROY {
    my ($self) = @_;
    delete($self->{parser});
    return;
}

##############################################
##############################################

=head1 PUBLIC OBJECT METHODS

=over 4

=item process_html_to_xml

    $xml = $gateway->process_html_to_xml($html);

Takes presumably-malformed HTML, encodes ampersand characters
and passes the result to the XML::LibXML parser, which creates
a DOM tree in memory of the content.  This parse is as error-tolerant
as can be set, and libxml attempts to recover from any errors as much
as possible.  This DOM tree is then taken and serialized,
eliminating unbalanced and malformed tags along the way.  This
XML code (without any header tags) is then returned to the caller.
=pod

=cut

##############################################
##############################################

# Tidy (possibly malformed) HTML by round-tripping it through libxml.
#   $self  - gateway object; $self->{parser} is the XML::LibXML parser
#   $input - HTML text
# Returns the serialized children of the parsed <body> element, or ""
# when the input is undefined/empty or the parse produced no <body>.
#
# NOTE(review): the $XML::LibXML::skipXMLDeclaration / skipDTD /
# setTagCompression settings at the top of this file are made with
# file-scope "local"; per perlsub such values are restored when the
# enclosing file scope is left, so they are presumably not in effect
# by the time this method runs - confirm before relying on them.
sub process_html_to_xml {
    my ($self, $input) = @_;
    my $parser = $self->{parser};

    return "" unless (defined($input) && length($input) > 0);

    # only encode ampersands -- brackets may be valid tags
    my $encoded = &HTML::Entities::encode_entities($input, '&');

    # for the <chem> tag, we want the strings "<=>", "<-", "->" to be
    # properly entitized so the parser doesn't destroy them.  Each
    # <chem>...</chem> span is handled on its own (non-greedy, may
    # contain newlines) and every arrow inside it is replaced; the old
    # greedy patterns only caught one arrow of each kind per line.
    $encoded =~ s{(<\s*chem\s*>)(.*?)(<\s*/chem\s*>)}{
        my ($open, $formula, $close) = ($1, $2, $3);
        $formula =~ s/<=>/&lt;&#61;&gt;/g;
        $formula =~ s/->/-&gt;/g;
        $formula =~ s/<-/&lt;-/g;
        $open.$formula.$close;
    }gise;

    # parse into libXML to tidy tags, we suppress any errors
    # because otherwise the parser complains about non-HTML
    # tags to STDERR and the Apache error logs
    my $dom = $parser->parse_html_string($encoded,
                                         {
                                             suppress_errors   => 1,
                                             suppress_warnings => 1,
                                             recover           => 2,
                                         });
    return "" unless (defined($dom));

    # the dom returns a full <html> structure, so just get
    # all the child nodes of the <body> tag and put them together;
    # input without any body content may yield no <body> node at all
    my ($body) = $dom->findnodes('/html/body');
    return "" unless (defined($body));

    my $xml = "";
    foreach my $child ($body->childNodes) {
        $xml .= $child->toString();
    }
    # entities passed into $input are in the form of '&amp;lt;'
    # they are double entities
    return $xml;
}

##############################################
##############################################

=item process_xml_to_html

    $xhtml = $gateway->process_xml_to_html($xml);

Takes XML input, decodes one level of entities
and returns the result to the caller.
=pod

=cut

##############################################
##############################################

# Undo the extra level of entity encoding added on the way into XML.
#   $input - XML text as produced by process_html_to_xml()
# Returns the text with one level of entities decoded.
sub process_xml_to_html {
    my ($self, $input) = @_;
    # decode one level of entities (XML) such that the
    # output is returned to the original level of entities
    # $input "<" --> $xml "&lt;" --> "<"
    my $xhtml = &HTML::Entities::decode_entities($input);
    # now we have valid XHTML that can be stored and parsed
    return $xhtml;
}

##############################################
##############################################

=item process_incoming_html

    $xhtml = $gateway->process_incoming_html($html);
    $xhtml = $gateway->process_incoming_html($html, $legacy);

Designed to be called for all raw HTML inputs from the client
side before storing or rendering data.  Decodes UTF-8 data,
trims leading and trailing whitespace and one trailing "<br />"
tag.  Processes the result through the XML parser, converts this
back to balanced well-formed XHTML, re-encodes the result as UTF-8,
and returns the result to the caller.

=over 4

=item legacy

    $legacy = 0;
    $legacy = 1;

I<(optional)> If true, adds additional processing intended
to emulate LON-CAPA 2.x parsing of the content.

=back

=cut

##############################################
##############################################

# Normalize raw client HTML before it is stored.
#   $input  - UTF-8 encoded HTML from the client
#   $legacy - optional; when true, a <p>...</p> wrapper at the start
#             of the tidied result is removed if the input itself had
#             no <p> tag and no block-level tags
# Returns UTF-8 encoded XHTML.
sub process_incoming_html {
    # this should be called by all HTML inputs before storing
    # data --> for consistency's sake, call process_html_to_xml
    # afterwards if you need to embed this in XML later on
    my ($self, $input, $legacy) = @_;
    $input = '' unless (defined($input));

    # decode the UTF-8 octets into characters before parsing
    $input = &Encode::decode_utf8($input);

    # trim leading and trailing whitespace and HTML breaks
    chomp($input);
    $input =~ s/\s+$//s;
    $input =~ s/^\s+//s;
    $input =~ s/\<br\s*\/*\>$//s;
    my $no_p_input = (length($input) > 0 && $input !~ m/.*\<[\s]*p[\s]*\>.*/is);

    my $xml = $self->process_html_to_xml($input);

    # the xml returns content inside a <p> tag if there are no block
    # tags... thus to preserve old behavior, we strip out that <p>.
    # contains_block_level_tags() takes the invocant as its first
    # argument, so it is called as a method here; the former one-argument
    # function call made it test an undefined string and always report 0.
    if ($legacy && $no_p_input && !$self->contains_block_level_tags($input)) {
        $xml =~ s/^\<p\>(.*)\<\/p\>/$1/si;
    }

    my $xhtml = $self->process_xml_to_html($xml);
    # back to UTF-8 octets for the caller
    $xhtml = &Encode::encode_utf8($xhtml);
    return $xhtml;
}

##############################################
##############################################

=item process_outgoing_html

    $html = $gateway->process_outgoing_html($xhtml);
    $html = $gateway->process_outgoing_html($xhtml, $legacy);

Designed to be called for all HTML outputs to the client
side before rendering data.  This entitizes all non-allowed
tags, as was previously done in Apache::lonfeedback, and
processes and converts all LON-CAPA supported custom tags (see
above) to their respective output HTML.

=over 4

=item legacy

    $legacy = 0;
    $legacy = 1;

I<(optional)> If true, adds additional processing intended
to emulate LON-CAPA 2.x parsing of the content.  This includes
behavior to convert "\n" to "<br />" if there are no block-level
tags detected in the input.  In addition, raw URLs are converted
automatically to <a> links.
=pod

=back

=back

=cut

##############################################
##############################################

# Prepare stored (X)HTML for display.
#   $input  - stored markup
#   $legacy - optional; when true, newlines become line breaks (only if
#             no block-level tag is present) and raw URLs become links
# Returns the markup with non-allowed tags entitized and the LON-CAPA
# custom tags replaced by the output of their handlers.
sub process_outgoing_html {
    # this should be called on all HTML outputs before displaying
    # because it will filter out all non-HTML+LONCAPA tags.
    # tags are not filtered at input stage for greater backwards
    # compatibility.  note that this disregards course preference.
    my ($self, $input, $legacy) = @_;
    $input = '' unless (defined($input));

    my %html = %Apache::lonhtmlgateway::LONCAPA_ALLOWED_STANDARD_TAGS;
    # entitize all tags that are not explicitly allowed: an allowed tag
    # (shorter than 1000 characters) keeps its angle brackets, anything
    # else has "<" turned into "&lt;" and ">" into "&gt;".  Both
    # branches of each ternary used to be identical, so nothing was
    # ever entitized.
    $input =~ s{<(/?\s*(\w+)[^><]*)}
               {($html{uc($2)} && (length($1) < 1000)) ? "<$1" : "&lt;$1"}ge;
    $input =~ s{(<?\s*(\w+)[^<>]*)>}
               {($html{uc($2)} && (length($1) < 1000)) ? "$1>" : "$1&gt;"}ge;

    if ($legacy) {
        unless ($self->contains_block_level_tags($input)) {
            $input = $self->legacy_newline_to_br($input);
        }
        $input = $self->legacy_raw_href_to_link($input);
    }

    # at this point, we need to convert our own custom tags
    # into the appropriate output
    # see above for supported tags
    my $output = "";
    my $parser = HTML::LCParser->new(\$input);
    while (my $token = $parser->get_token()) {
        my $type = $token->[0];
        if ($type eq 'T') {
            if ($self->{target} ne 'tex') {
                $output .= &Apache::lontexconvert::smiley($token->[1]);
            } else {
                # tex target: characters outside the listed set are
                # replaced by hexadecimal numeric entities
                my $t = $token->[1];
                $t =~ s/([^\n\r\t &<>!\#%\(-;=?-~])/num_entity($1)/ge;
                $output .= $t;
            }
        } elsif ($type eq 'D' || $type eq 'C') {
            $output .= $token->[1];
        } elsif ($type eq 'PI' || $type eq 'E') {
            # NOTE(review): this also appears to pass the closing tag of
            # a custom tag (e.g. "</mi>") through to the output after
            # its handler has already produced the replacement - confirm
            # whether that is intended.
            $output .= $token->[2];
        } elsif ($type eq 'S') {
            my $tag = lc($token->[1]);
            if (exists($custom_tag_parsers{$tag})) {
                my $text = $parser->get_text();
                $output .= $custom_tag_parsers{$tag}(
                    $self, $text, $self->{target});
            } else {
                $output .= $token->[4];
            }
        }
    }
    return $output;
}

##############################################
##############################################

=head1 STATIC CLASS METHODS

The following are static class methods that can be called
by any object.

=over 4

=item contains_block_level_tags

    $bool = Apache::lonhtmlgateway->contains_block_level_tags($input);
    $bool = $gateway->contains_block_level_tags($input);

Uses a regular expression to find, in the input data, any tags
described in %LONCAPA_ALLOWED_STANDARD_TAGS as block-level.
Returns 1 if true, 0 if false.

=cut

##############################################
##############################################

# Report whether $input contains an opening or closing block-level tag.
#   $class - invocant (class name or object); handed on to
#            get_block_level_tags()
#   $input - text to inspect
# Returns 1 when a block-level tag is found, 0 otherwise (including for
# undefined input).
sub contains_block_level_tags {
    my ($class, $input) = @_;
    return 0 unless (defined($input));
    foreach my $tag (@{&get_block_level_tags($class)}) {
        # ignore anything in the list that is not a plain tag name
        next unless (defined($tag) && !ref($tag) && length($tag));
        # the tag name must end at a non-word character, so that "P"
        # matches "<p>" and "<p class=...>" but not "<param>"
        if ($input =~ m{</?\s*\Q$tag\E(?![\w])}i) {
            return 1;
        }
    }
    return 0;
}

##############################################
##############################################

=item get_block_level_tags

    $tags = Apache::lonhtmlgateway->get_block_level_tags();

Returns a reference to an array with any tags described in
%LONCAPA_ALLOWED_STANDARD_TAGS as block-level.  Note that these
tags are returned in no particular order, and the tag names
are returned in uppercase.
=pod

=cut

##############################################
##############################################

# Collect the block-level tag names.
#   $class - invocant; not used
# Returns a reference to an array holding every key of
# %LONCAPA_ALLOWED_STANDARD_TAGS whose value is 2, in no particular
# order.  The list used to be initialised with "[]", which put an
# array reference in front of the tag names.
sub get_block_level_tags {
    my $class = shift;
    my @block = grep {
        $Apache::lonhtmlgateway::LONCAPA_ALLOWED_STANDARD_TAGS{$_} == 2
    } keys(%Apache::lonhtmlgateway::LONCAPA_ALLOWED_STANDARD_TAGS);
    return \@block;
}

# Format the first character of the argument as an uppercase
# hexadecimal numeric character reference, e.g. '"' becomes "&#x22;".
sub num_entity {
    my ($char) = @_;
    return sprintf("&#x%X;", ord($char));
}

##############################################
##############################################

=head2 Legacy Functions

These functions are intended to process input in the same or
a similar way to how it was processed in LON-CAPA 2.x.

=item legacy_newline_to_br

I<(formerly Apache::lonfeedback::newline_to_br)>

    $converted = $gateway->legacy_newline_to_br($input);

Parse the input using HTML::LCParser, and in any text nodes
which contain "\n" characters, replace those characters with
an HTML "<br />" tag.
=pod

=cut

##############################################
##############################################

# Turn newlines inside text nodes into HTML line breaks.
#   $class - invocant; not used
#   $input - markup to convert
# Returns the markup re-assembled from its tokens, with every "\n" in
# a text token replaced by "<br />"; returns "" for empty input.
sub legacy_newline_to_br {
    my ($class, $input) = @_;
    my $output = '';
    my $parser = HTML::LCParser->new(\$input);
    while (my $token = $parser->get_token()) {
        my $type = $token->[0];
        if ($type eq 'T') {
            my $text = $token->[1];
            $text =~ s/\n/<br \/>/g;
            $output .= $text;
        } elsif ($type eq 'D' || $type eq 'C') {
            $output .= $token->[1];
        } elsif ($type eq 'PI' || $type eq 'E') {
            $output .= $token->[2];
        } elsif ($type eq 'S') {
            $output .= $token->[4];
        }
    }
    return $output;
}

##############################################
##############################################

=item legacy_raw_href_to_link

I<(formerly Apache::lonhtmlcommon::raw_href_to_link)>

    $converted = $gateway->legacy_raw_href_to_link($input);

Search for any links/URLs within the input text, and convert them
to <a> tags whose content is embedded inside a <tt> tag.

=back

=cut

##############################################
##############################################

# Wrap raw http/https URLs in <a href="..."><tt>...</tt></a>.
#   $class - invocant; not used
#   $input - text to convert
# A URL is only converted when it is followed by whitespace, "<" or the
# end of the text.  Undefined input is returned unchanged.
sub legacy_raw_href_to_link {
    my ($class, $input) = @_;
    return $input unless (defined($input));
    $input =~ s/(https?\:\/\/[^\s\'\"\<]+)([\s\<]|$)/<a href="$1"><tt>$1<\/tt><\/a>$2/gi;
    return $input;
}

# Handler for <algebra>: converts the content to TeX and renders it
# through parse_m_tag().
sub parse_algebra_tag {
    my ($self, $input) = @_;
    # the <algebra> parser does NOT handle entities,
    # unlike the general <m> parser; thus we run
    # the content of this tag through HTML::Entities,
    # decoding it first.  we also just get the tex, and
    # feed it through parse_m_tag().
    $input = &HTML::Entities::decode_entities($input);
    my $algebra =
        &Apache::lontexconvert::algebra($input,'tex',undef,undef,undef,'tth');
    return &parse_m_tag($self, $algebra);
}

# Handler for <mi>: inline math, wrapped in \ensuremath{...}.
sub parse_mi_tag {
    my ($self, $input) = @_;
    return &parse_m_tag($self, '\ensuremath{'.$input.'}');
}

# Handler for <md>: display math, wrapped in \[ ... \].
sub parse_md_tag {
    my ($self, $input) = @_;
    return &parse_m_tag($self, '\['.$input.'\]');
}

# Handler for <m>: for the tex target the content is handed back inside
# <m>...</m>; for every other target it is passed to
# Apache::lontexconvert::to_convert().
sub parse_m_tag {
    my ($self, $input) = @_;
    if ($self->{target} ne 'tex') {
        return &Apache::lontexconvert::to_convert($input, $self->{target});
    }
    return '<m>'.$input.'</m>';
}

# Handler for <chem>: formats a chemical equation.
#   $input - equation text, e.g. "CH3CO2H + H2O <=> CH3CO2- + H3O+"
# Returns markup in which arrows are replaced (character references for
# the web target, TeX in <m> tags otherwise), digits following a
# letter, bracket or space become subscripts, "^charge" becomes a
# superscript, and terms are separated by non-breaking spaces.
sub parse_chem_tag {
    my ($self, $input) = @_;
    my $target = $self->{target};
    # as with the <algebra> tag, some portions of the
    # <chem> input may be coming in encoded, especially
    # arrows -- so decode it in HTML::Entities
    $input = &HTML::Entities::decode_entities($input);
    return '' unless (defined($input));

    my $is_web = (defined($target) && $target eq 'web');
    my %arrow = (
        '->'  => ($is_web) ? '&#8594; '
                           : '<m>\ensuremath{\rightarrow}</m> ',
        '<-'  => ($is_web) ? '&#8592; '
                           : '<m>\ensuremath{\leftarrow}</m> ',
        '<=>' => ($is_web) ? '&#8652; '
                           : '<m>\ensuremath{\rightleftharpoons}</m> ',
    );

    my $formula = '';
    foreach my $token (split(/(\s\+|\->|<=>|<\-|\.)/, $input)) {
        if (exists($arrow{$token})) {
            $formula .= $arrow{$token};
            next;
        }
        if ($token eq '.') {
            # a dot joins two terms: drop the separator before it
            $formula =~ s/(\&nbsp\;| )$//;
            $formula .= '&middot;';
            next;
        }
        # every part of this pattern is optional, so it always matches
        my ($coefficient, $molecule) =
            ($token =~ /^\s*([\d|\/]*(?:&frac\d\d)?)(.*)/);
        $formula .= $coefficient if ($coefficient ne '1'); # stoichiometric coefficient
        # subscripts
        $molecule =~ s|(?<=[a-zA-Z\)\]\s])(\d+)|<sub>$1</sub>|g;
        # superscripts
        $molecule =~ s|\^(\d*[+\-]*)|<sup>$1</sup>|g;
        # strip whitespace
        $molecule =~ s/\s*//g;
        # forced space
        $molecule =~ s/_/ /g;
        $molecule =~ s/-/&minus;/g;
        $formula .= $molecule.'&nbsp;';
    }
    # get rid of trailing space
    $formula =~ s/(\&nbsp\;| )$//;
    return $formula;
}

##############################################
##############################################

=head1 AUTHORS

Phil Fazio

=head1 VERSION

$Id: lonhtmlgateway.pm,v 1.5 2010/05/24 23:47:22 raeburn Exp $

=cut

##############################################
##############################################

# a module must return a true value when it is loaded
1;