--- loncom/interface/entities.pm 2008/04/28 10:33:35 1.9 +++ loncom/interface/entities.pm 2009/07/01 11:15:08 1.18.6.1 @@ -1,7 +1,7 @@ # The LearningOnline Network # entity -> tex. # -# +# $Id: entities.pm,v 1.18.6.1 2009/07/01 11:15:08 foxr Exp $ # # Copyright Michigan State University Board of Trustees # @@ -25,47 +25,90 @@ # http://www.lon-capa.org/ # # + package Apache::entities; use strict; -# -# This file contains a table driven entity-->latex converter. -# -# Assumptions: -# The number of entities in a resource is small compared with the -# number of possible entities that might be translated. -# Therefore the strategy is to match a general entity pattern -# &.+; over and over, pull out the match look it up in an entity -> tex hash -# and do the replacement. -# -# In order to simplify the hash, the following reductions are done: -# &#d+; have the &# and ; stripped and is converted to an int. -# &#.+; have the &#x and ; stripped and is converted to an int as a hex -# value. -# All others have the & and ; stripped. +=pod -# The hash: Add new conversions here; leave off the leading & and the trailing ; -# all numeric entities need only appear as their decimal versions -# (e.g. no need for 1234 is sufficient, no need for 0x4d2 as well. -# -# This entity table is mercilessly cribbed from the HTML pocket reference -# table starting at pg 82. In most cases the LaTeX equivalent codes come from -# the original massive regular expression replacements originally by -# A. Sakharuk in lonprintout.pm -# -# I also want to acknowledge -# ISO Character entities and their LaTeX equivalents by -# Vidar Bronken Gundersen, and Rune Mathisen -# http://www.bitjungle.com/isoent-ref.pdf -# +=head1 TABLES ASCII code page -# Note numerical entities are essentially unicode character codes. 
-# -package Apache::entities; +=over -my %entities = ( +=item (7-13) + + Translation to empty strings + +=item (32-126) + + Translations to simple characters + +=item (130-140) + + Controls and Latin-1 supplement. Note that some entities that have + visible effect are not printing unicode characters. Specifically + ‚-  + +=item (145-156) + + There's a gap here in my entity table + +=item (159-255) + + Another short gap + +=item (295) + + hbar entity number comes from the unicode character: + see e.g. http://www.unicode.org/charts/PDF/U0100.pdf + ISO also documents a 'planck' entity. + +=item (338-376) + + Latin extended-A HTML 4.01 entities + +=item (402) + + Latin extended B HTML 4.01 entities + +=item (710 & 732) + + Spacing modifier letters + +=item (913-937) + + Greek uppercase (skipss 930) + +=item (945-982) + + Greek lowercase - # ---- ASCII code page: ---------------- +=item (8194-8364) + + The general punctuation set + +=item (8472-8501) + + Letter like symbols + +=item (8592-8669) + + Arrows and then some (harpoons from Hon Kie). + +=item (8704-8734) + + Mathematical operators. + +=item (8735-9830) + + The items below require the isoent latex package which I can't find at least for FC5. + Temporarily commented out. + +=back + +=cut + +my %entities = ( # Translation to empty strings: @@ -176,9 +219,7 @@ my %entities = ( 125 => '\}', 126 => '\~', - # Controls and Latin-1 supplement. Note that some entities that have - # visible effect are not printing unicode characters. Specifically - # ‚-  + # Controls and Latin-1 supplement. 
130 => ',', 131 => '\ensuremath{f}', @@ -192,7 +233,7 @@ my %entities = ( 139 => '\ensuremath{<}', 140 => '{\OE}', - # There's a gap here in my entity table + # There's a gap here in my entity table 145 => '`', 146 => '\'', @@ -206,8 +247,8 @@ my %entities = ( 154 => '\v{s}', 155 => '\ensuremath{>}', 156 => '\oe ', - - # Another short gap: + + # Another short gap: 159 => '\"Y', 160 => '~', @@ -397,9 +438,8 @@ my %entities = ( 255 => '\\"{y}', 'yuml' => '\\"{y}', - # hbar entity number comes from the unicode charater: - # see e.g. http://www.unicode.org/charts/PDF/U0100.pdf - # ISO also documents a 'planck' entity. + + # hbar entity number comes from the unicode character: 295 => '\ensuremath{\hbar}', 'planck' => '\ensuremath{\hbar}', @@ -417,7 +457,6 @@ my %entities = ( 376 => '\\"{Y}', 'Yuml' => '\\"{Y}', - # Latin extended B HTML 4.01 entities 402 => '\ensuremath{f}', @@ -466,9 +505,6 @@ my %entities = ( 'Pi' => '\ensuremath{\Pi}', 929 => '\ensuremath{\mathrm{P}}', 'Rho' => '\ensuremath{\mathrm{P}}', - - # Skips 930 - 931 => '\ensuremath{\Sigma}', 'Sigma' => '\ensuremath{\Sigma}', 932 => '\ensuremath{\mathrm{T}}', @@ -484,7 +520,6 @@ my %entities = ( 937 => '\ensuremath{\Omega}', 'Omega' => '\ensuremath{\Omega}', - # Greek lowercase: 945 => '\ensuremath{\alpha}', @@ -544,7 +579,6 @@ my %entities = ( 982 => '\ensuremath{\varpi}', 'piv' => '\ensuremath{\varpi}', - # The general punctuation set: 8194, => '\hspace{.5em}', @@ -603,7 +637,6 @@ my %entities = ( 'euro' => '\texteuro', # Letter like symbols - 8472 => '\ensuremath{\wp}', 'weierp' => '\ensuremath{\wp}', @@ -618,14 +651,14 @@ my %entities = ( # Arrows and then some (harpoons from Hon Kie). 
- 8592 => '\textleftarrow', - 'larr' => '\textleftarrow', - 8593 => '\textuparrow', - 'uarr' => '\textuparrow', - 8594 => '\textrightarrow', - 'rarr' => '\textrightarrow', - 8595 => '\textdownarrow', - 'darr' => '\textdownarrow', + 8592 => '\ensuremath{\leftarrow}', + 'larr' => '\ensuremath{\leftarrow}', + 8593 => '\ensuremath{\uparrow}', + 'uarr' => '\ensuremath{\uparrow}', + 8594 => '\ensuremath{\rightarrow}', + 'rarr' => '\ensuremath{\rightarrow}', + 8595 => '\ensuremath{\downarrow}', + 'darr' => '\ensuremath{\downarrow}', 8596 => '\ensuremath{\leftrightarrow}', 'harr' => '\ensuremath{\leftrightarrow}', 8598 => '\ensuremath{\nwarrow}', @@ -662,9 +695,7 @@ my %entities = ( 8669 => '\ensuremath{\rightsquigarrow}', 'rarrw' => '\ensuremath{\rightsquigarrow}', - # Mathematical operators. - 'forall' => '\ensuremath{\forall}', 8704 => '\ensuremath{\forall}', @@ -711,8 +742,13 @@ my %entities = ( 8733 => '\ensuremath{\propto}', 'infin' => '\ensuremath{\infty}', 8734 => '\ensuremath{\infty}', -# 'ang90' => '\ensuremath{\sqangle}', -# 8735 => '\ensuremath{\sqangle}', + + # The items below require the isoent latex package which I can't find at least for FC5. + # Temporarily commented out. + + 'ang90' => '\ensuremath{\sqangle}', + 8735 => '\ensuremath{\sqangle}', + 'ang' => '\ensuremath{\angle}', 8736 => '\ensuremath{\angle}', 'angmsd' => '\ensuremath{\measuredangle}', @@ -921,78 +957,64 @@ my %entities = ( ); -# -# Convert a numerical entity (that does not exist in our hash) -# to its UTF-8 equivalent representation. -# This allows us to support, to some extent, any entity for which -# dvipdf can find a gylph (given that LaTeX is now UTF-8 clean). -# -# Parameters: -# unicode - The unicode for the character. This is assumed to -# be a decimal value -# Returns: -# The UTF-8 equiavalent of the value. 
-# +=pod + +=head1 UNICODE TABLE + +=over + + There are some named entities that don't have a good + latex equivalent, these are converted to utf-8 via this table + of entity name -> unicode number. + +=back + +=cut + +my %utf_table = ( + 'THORN' => 222, + 'thorn' => 254, + 'eth' => 240, + 'hearts' => 9829 +); + sub entity_to_utf8 { my ($unicode) = @_; - - return pack("U", $unicode); + my $result = pack("U", $unicode); + return $result; } -# -# Convert an entity to the corresponding LateX if possible. -# If not possible, and the entity is numeric, -# the entity is treated like a Unicode character and converted -# to UTF-8 which should display as long as dvipdf can find the -# appropriate glyph. -# -# The entity is assumed to have already had the -# &# ; or & ; removed -# -# Parameters: -# entity - Name of entity to convert. -# Returns: -# One of the following: -# - Latex string that produces the entity. -# - UTF-8 equivalent of a numeric entity for which we don't have a latex string. -# - ' ' for text entities for which there's no latex equivalent. -# + sub entity_to_latex { my ($entity) = @_; # Try to look up the entity (text or numeric) in the hash: + my $latex = $entities{"$entity"}; if (defined $latex) { return $latex; } # If the text is purely numeric we can do the UTF-8 conversion: - - if ($entity =~ /^\d$/) { + # Otherwise there are a few textual entities that don't have good latex + # which can be converted to unicode: + # + if ($entity =~ /^\d+$/) { return &entity_to_utf8($entity); + } else { + my $result = $utf_table{"$entity"}; + if (defined $result) { + return &entity_to_utf8($result); + } } # Can't do the conversion`< ... return " "; } -# -# Convert all the entities in a string. -# We locate all the entities, pass them into entity_to_latex and -# and replace occurences in the input string. -# The assumption is that there are few entities in any string/document -# so this looping is not too bad. The advantage of looping vs. 
regexping is -# that we now can use lookup tables for the translation in entity_to_latex above. -# -# Parameters: -# input - Input string/document -# Returns -# input with entities replaced by latexable stuff (UTF-8 encodings or -# latex control strings to produce the entity. -# -# + sub replace_entities { my ($input) = @_; my $start; @@ -1009,6 +1031,18 @@ sub replace_entities { $latex = &entity_to_latex($entity); substr($input, $start, $end-$start) = $latex; } + + # Hexadecimal entities: + + while ($input =~ /&\#x(\d|[a-f,A-f])+;/) { + ($start) = @-; + ($end) = @+; + $entity = "0" . substr($input, $start+2, $end-$start-3); # 0xhexnumber + $latex = &entity_to_latex(hex($entity)); + substr($input, $start, $end-$start) = $latex; + } + + # Now the &text; entites; while ($input =~/(&\w+;)/) { @@ -1025,3 +1059,108 @@ sub replace_entities { 1; __END__ + +=pod + +=head1 NAME + +Apache::entities.pm + +=head1 SYNOPSIS + +This file contains a table driven entity-->latex converter. + +This is part of the LearningOnline Network with CAPA project +described at http://www.lon-capa.org. + +=head1 OVERVIEW + + +Assumptions: + The number of entities in a resource is small compared with the + number of possible entities that might be translated. + Therefore the strategy is to match a general entity pattern + &.+; over and over, pull out the match look it up in an entity -> tex hash + and do the replacement. + +In order to simplify the hash, the following reductions are done: + &#d+; have the &# and ; stripped and is converted to an int. + &#.+; have the &#x and ; stripped and is converted to an int as a hex + value. + All others have the & and ; stripped. + + +The hash: Add new conversions here; leave off the leading & and the trailing ; +all numeric entities need only appear as their decimal versions +(e.g. no need for 1234 is sufficient, no need for 0x4d2 as well. + +This entity table is mercilessly cribbed from the HTML pocket reference +table starting at pg 82. 
In most cases the LaTeX equivalent codes come from +the original massive regular expression replacements originally by +A. Sakharuk in lonprintout.pm + +I also want to acknowledge + ISO Character entities and their LaTeX equivalents by + Vidar Bronken Gundersen, and Rune Mathisen + http://www.bitjungle.com/isoent-ref.pdf + + +Note numerical entities are essentially unicode character codes. + + +=head1 SUBROUTINES + +=over + +=item entity_to_utf8() + + +Convert a numerical entity (that does not exist in our hash) + to its UTF-8 equivalent representation. + This allows us to support, to some extent, any entity for which + dvipdf can find a glyph (given that LaTeX is now UTF-8 clean). + +Parameters: + unicode - The unicode for the character. This is assumed to + be a decimal value +Returns: + The UTF-8 equivalent of the value. + +=item entity_to_latex() + + Convert an entity to the corresponding LaTeX if possible. + If not possible, and the entity is numeric, + the entity is treated like a Unicode character and converted + to UTF-8 which should display as long as dvipdf can find the + appropriate glyph. + + The entity is assumed to have already had the + &# ; or & ; removed + +Parameters: + entity - Name of entity to convert. +Returns: + One of the following: + - Latex string that produces the entity. + - UTF-8 equivalent of a numeric entity for which we don't have a latex string. + - ' ' for text entities for which there's no latex equivalent. + + +=item replace_entities() + + Convert all the entities in a string. + We locate all the entities, pass them into entity_to_latex and + replace occurrences in the input string. + The assumption is that there are few entities in any string/document + so this looping is not too bad. The advantage of looping vs. regexping is + that we now can use lookup tables for the translation in entity_to_latex above.
+ +Parameters: + input - Input string/document +Returns: + input with entities replaced by latexable stuff (UTF-8 encodings or + latex control strings to produce the entity). + +=back + +=cut