--- loncom/interface/entities.pm 2008/10/10 10:16:53 1.12 +++ loncom/interface/entities.pm 2008/11/17 13:52:39 1.13 @@ -25,57 +25,139 @@ # http://www.lon-capa.org/ # # -package Apache::entities; -use strict; -# -# This file contains a table driven entity-->latex converter. -# -# Assumptions: -# The number of entities in a resource is small compared with the -# number of possible entities that might be translated. -# Therefore the strategy is to match a general entity pattern -# &.+; over and over, pull out the match look it up in an entity -> tex hash -# and do the replacement. -# -# In order to simplify the hash, the following reductions are done: -# &#d+; have the &# and ; stripped and is converted to an int. -# &#.+; have the &#x and ; stripped and is converted to an int as a hex -# value. -# All others have the & and ; stripped. -# The hash: Add new conversions here; leave off the leading & and the trailing ; -# all numeric entities need only appear as their decimal versions -# (e.g. no need for 1234 is sufficient, no need for 0x4d2 as well. -# -# This entity table is mercilessly cribbed from the HTML pocket reference -# table starting at pg 82. In most cases the LaTeX equivalent codes come from -# the original massive regular expression replacements originally by -# A. Sakharuk in lonprintout.pm -# -# I also want to acknowledge -# ISO Character entities and their LaTeX equivalents by -# Vidar Bronken Gundersen, and Rune Mathisen -# http://www.bitjungle.com/isoent-ref.pdf -# +=head1 NAME + +Apache::entities.pm + +=head1 SYNOPSIS + +This file contains a table driven entity-->latex converter. + +This is part of the LearningOnline Network with CAPA project +described at http://www.lon-capa.org. + +=head1 OVERVIEW + + +Assumptions: + The number of entities in a resource is small compared with the + number of possible entities that might be translated. 
+ Therefore the strategy is to match a general entity pattern + &.+; over and over, pull out the match, look it up in an entity -> tex hash + and do the replacement. + +In order to simplify the hash, the following reductions are done: + &#\d+; have the &# and ; stripped and are converted to an int. + &#x.+; have the &#x and ; stripped and are converted to an int as a hex + value. + All others have the & and ; stripped. + + +The hash: Add new conversions here; leave off the leading & and the trailing ; +all numeric entities need only appear as their decimal versions +(e.g. 1234 is sufficient, no need for 0x4d2 as well). + +This entity table is mercilessly cribbed from the HTML pocket reference +table starting at pg 82. In most cases the LaTeX equivalent codes come from +the original massive regular expression replacements originally by +A. Sakharuk in lonprintout.pm + +I also want to acknowledge + ISO Character entities and their LaTeX equivalents by + Vidar Bronken Gundersen, and Rune Mathisen + http://www.bitjungle.com/isoent-ref.pdf + + +Note numerical entities are essentially unicode character codes. + + +=head1 SUBROUTINES + +=item entity_to_utf8() + + +Convert a numerical entity (that does not exist in our hash) + to its UTF-8 equivalent representation. + This allows us to support, to some extent, any entity for which + dvipdf can find a glyph (given that LaTeX is now UTF-8 clean). + +Parameters: + unicode - The unicode for the character. This is assumed to + be a decimal value +Returns: + The UTF-8 equivalent of the value. + +=item entity_to_latex() + + Convert an entity to the corresponding LaTeX if possible. + If not possible, and the entity is numeric, + the entity is treated like a Unicode character and converted + to UTF-8 which should display as long as dvipdf can find the + appropriate glyph. + + The entity is assumed to have already had the + &# ; or & ; removed + +Parameters: + entity - Name of entity to convert. 
+Returns: + One of the following: + - Latex string that produces the entity. + - UTF-8 equivalent of a numeric entity for which we don't have a latex string. + - ' ' for text entities for which there's no latex equivalent. + + +=item replace_entities() + + Convert all the entities in a string. + We locate all the entities, pass them into entity_to_latex, + and replace occurrences in the input string. + The assumption is that there are few entities in any string/document + so this looping is not too bad. The advantage of looping vs. regexping is + that we now can use lookup tables for the translation in entity_to_latex above. + +Parameters: + input - Input string/document +Returns: + input with entities replaced by latexable stuff (UTF-8 encodings or + latex control strings to produce the entity). + +=head1 TABLES ASCII code page + +=cut + + +package Apache::entities; +use strict; -# Note numerical entities are essentially unicode character codes. -# package Apache::entities; my %entities = ( - # ---- ASCII code page: ---------------- +=pod + +=over + +=item (7-13) # Translation to empty strings: +=cut 7 => "", 9 => "", 10 => "", 13 => "", +=pod + +=item (32-126) + # Translations to simple characters: +=cut + 32 => " ", 33 => "!", 34 => '"', @@ -176,9 +258,15 @@ my %entities = ( 125 => '\}', 126 => '\~', - # Controls and Latin-1 supplement. Note that some entities that have - # visible effect are not printing unicode characters. Specifically - # &#130;-&#160; +=pod + +=item (130-140) + + Controls and Latin-1 supplement. Note that some entities that have + visible effect are not printing unicode characters. 
Specifically + &#130;-&#160; + +=cut 130 => ',', 131 => '\ensuremath{f}', @@ -192,7 +280,13 @@ my %entities = ( 139 => '\ensuremath{<}', 140 => '{\OE}', - # There's a gap here in my entity table +=pod + +=item (145-156) + + There's a gap here in my entity table + +=cut 145 => '`', 146 => '\'', @@ -206,8 +300,14 @@ my %entities = ( 154 => '\v{s}', 155 => '\ensuremath{>}', 156 => '\oe ', - - # Another short gap: + +=pod + +=item (159-255) + + Another short gap: + +=cut 159 => '\"Y', 160 => '~', @@ -397,14 +497,27 @@ my %entities = ( 255 => '\\"{y}', 'yuml' => '\\"{y}', - # hbar entity number comes from the unicode charater: - # see e.g. http://www.unicode.org/charts/PDF/U0100.pdf - # ISO also documents a 'planck' entity. + +=pod + +=item (295) + + hbar entity number comes from the unicode character: + see e.g. http://www.unicode.org/charts/PDF/U0100.pdf + ISO also documents a 'planck' entity. + +=cut 295 => '\ensuremath{\hbar}', 'planck' => '\ensuremath{\hbar}', - # Latin extended-A HTML 4.01 entities: +=pod + +=item (338-376) + + Latin extended-A HTML 4.01 entities: + +=cut 338 => '\OE', 'OElig' => '\OE', @@ -417,20 +530,37 @@ my %entities = ( 376 => '\\"{Y}', 'Yuml' => '\\"{Y}', +=pod + +=item (402) - # Latin extended B HTML 4.01 entities + Latin extended B HTML 4.01 entities + +=cut 402 => '\ensuremath{f}', 'fnof' => '\ensuremath{f}', - # Spacing modifier letters: +=pod + +=item (710 & 732) + + Spacing modifier letters: + +=cut 710 => '\^{}', 'circ' => '\^{}', 732 => '\~{}', 'tilde' => '\~{}', - # Greek uppercase: +=pod + +=item (913-929) + + Greek uppercase: + +=cut 913 => '\ensuremath{\mathrm{A}}', 'Alpha' => '\ensuremath{\mathrm{A}}', @@ -467,7 +597,14 @@ my %entities = ( 929 => '\ensuremath{\mathrm{P}}', 'Rho' => '\ensuremath{\mathrm{P}}', - # Skips 930 + +=pod + +=item (931-937) + + Skips 930 + +=cut 931 => '\ensuremath{\Sigma}', 'Sigma' => '\ensuremath{\Sigma}', @@ -484,8 +621,13 @@ my %entities = ( 937 => '\ensuremath{\Omega}', 'Omega' => '\ensuremath{\Omega}', +=pod 
- # Greek lowercase: +=item (945-982) + + Greek lowercase: + +=cut 945 => '\ensuremath{\alpha}', 'alpha' => '\ensuremath{\alpha}', @@ -544,8 +686,13 @@ my %entities = ( 982 => '\ensuremath{\varpi}', 'piv' => '\ensuremath{\varpi}', +=pod + +=item (8194-8364) - # The general punctuation set: + The general punctuation set: + +=cut 8194, => '\hspace{.5em}', 'enspc' => '\hspace{.5em}', @@ -602,7 +749,13 @@ my %entities = ( 8364 => '\texteuro', 'euro' => '\texteuro', - # Letter like symbols +=pod + +=item (8472-8501) + + Letter like symbols + +=cut 8472 => '\ensuremath{\wp}', @@ -616,7 +769,13 @@ my %entities = ( 8501 => '\ensuremath{\aleph}', 'alefsym'=> '\ensuremath{\aleph}', - # Arrows and then some (harpoons from Hon Kie). +=pod + +=item (8592-8669) + + Arrows and then some (harpoons from Hon Kie). + +=cut 8592 => '\textleftarrow', 'larr' => '\textleftarrow', @@ -662,8 +821,13 @@ my %entities = ( 8669 => '\ensuremath{\rightsquigarrow}', 'rarrw' => '\ensuremath{\rightsquigarrow}', +=pod + +=item (8704-8734) - # Mathematical operators. + Mathematical operators. + +=cut 'forall' => '\ensuremath{\forall}', @@ -711,12 +875,21 @@ my %entities = ( 8733 => '\ensuremath{\propto}', 'infin' => '\ensuremath{\infty}', 8734 => '\ensuremath{\infty}', -# -# The items below require the isoent latex package which I can't find at least for FC5. -# Temporarily commented out. -# -# 'ang90' => '\ensuremath{\sqangle}', -# 8735 => '\ensuremath{\sqangle}', + + +=pod + +=item (8735-9830) + + + The items below require the isoent latex package which I can't find at least for FC5. + Temporarily commented out. + + 'ang90' => '\ensuremath{\sqangle}', + 8735 => '\ensuremath{\sqangle}', + +=cut + 'ang' => '\ensuremath{\angle}', 8736 => '\ensuremath{\angle}', 'angmsd' => '\ensuremath{\measuredangle}', @@ -925,9 +1098,15 @@ my %entities = ( ); -# There are some named entities that don't have a good -# latex equivalent, these are converted to utf-8 via this table -# of entity name -> unicode number. 
+=pod + +=item * + + There are some named entities that don't have a good + latex equivalent, these are converted to utf-8 via this table + of entity name -> unicode number. + +=cut my %utf_table = ( 'THORN' => 222, @@ -936,18 +1115,6 @@ my %utf_table = ( 'hearts' => 9829 ); -# -# Convert a numerical entity (that does not exist in our hash) -# to its UTF-8 equivalent representation. -# This allows us to support, to some extent, any entity for which -# dvipdf can find a gylph (given that LaTeX is now UTF-8 clean). -# -# Parameters: -# unicode - The unicode for the character. This is assumed to -# be a decimal value -# Returns: -# The UTF-8 equiavalent of the value. -# sub entity_to_utf8 { my ($unicode) = @_; my $result = pack("U", $unicode); @@ -955,24 +1122,7 @@ sub entity_to_utf8 { } -# -# Convert an entity to the corresponding LateX if possible. -# If not possible, and the entity is numeric, -# the entity is treated like a Unicode character and converted -# to UTF-8 which should display as long as dvipdf can find the -# appropriate glyph. -# -# The entity is assumed to have already had the -# &# ; or & ; removed -# -# Parameters: -# entity - Name of entity to convert. -# Returns: -# One of the following: -# - Latex string that produces the entity. -# - UTF-8 equivalent of a numeric entity for which we don't have a latex string. -# - ' ' for text entities for which there's no latex equivalent. -# + sub entity_to_latex { my ($entity) = @_; @@ -1001,21 +1151,7 @@ sub entity_to_latex { return " "; } -# -# Convert all the entities in a string. -# We locate all the entities, pass them into entity_to_latex and -# and replace occurences in the input string. -# The assumption is that there are few entities in any string/document -# so this looping is not too bad. The advantage of looping vs. regexping is -# that we now can use lookup tables for the translation in entity_to_latex above. 
-# -# Parameters: -# input - Input string/document -# Returns -# input with entities replaced by latexable stuff (UTF-8 encodings or -# latex control strings to produce the entity. -# -# + sub replace_entities { my ($input) = @_; my $start; @@ -1060,3 +1196,9 @@ sub replace_entities { 1; __END__ + +=pod + +=back + +=cut