--- loncom/interface/entities.pm 2008/11/17 20:24:25 1.15 +++ loncom/interface/entities.pm 2014/06/19 19:18:17 1.20 @@ -1,7 +1,7 @@ # The LearningOnline Network # entity -> tex. # -# +# $Id: entities.pm,v 1.20 2014/06/19 19:18:17 raeburn Exp $ # # Copyright Michigan State University Board of Trustees # @@ -26,142 +26,99 @@ # # -=pod - -=head1 NAME +package Apache::entities; +use strict; -Apache::entities.pm +=pod -=head1 SYNOPSIS +=head1 TABLES ASCII code page -This file contains a table driven entity-->latex converter. +=over -This is part of the LearningOnline Network with CAPA project -described at http://www.lon-capa.org. +=item (7-13) -=head1 OVERVIEW + Translation to empty strings +=item (32-126) -Assumptions: - The number of entities in a resource is small compared with the - number of possible entities that might be translated. - Therefore the strategy is to match a general entity pattern - &.+; over and over, pull out the match look it up in an entity -> tex hash - and do the replacement. + Translations to simple characters -In order to simplify the hash, the following reductions are done: - &#d+; have the &# and ; stripped and is converted to an int. - &#.+; have the &#x and ; stripped and is converted to an int as a hex - value. - All others have the & and ; stripped. +=item (130-140) + Controls and Latin-1 supplement. Note that some entities that have + visible effect are not printing unicode characters. Specifically + ‚-  -The hash: Add new conversions here; leave off the leading & and the trailing ; -all numeric entities need only appear as their decimal versions -(e.g. no need for 1234 is sufficient, no need for 0x4d2 as well. +=item (145-156) -This entity table is mercilessly cribbed from the HTML pocket reference -table starting at pg 82. In most cases the LaTeX equivalent codes come from -the original massive regular expression replacements originally by -A. 
Sakharuk in lonprintout.pm + There's a gap here in my entity table -I also want to acknowledge - ISO Character entities and their LaTeX equivalents by - Vidar Bronken Gundersen, and Rune Mathisen - http://www.bitjungle.com/isoent-ref.pdf +=item (159-255) + Another short gap -Note numerical entities are essentially unicode character codes. +=item (295) + hbar entity number comes from the unicode character: + see e.g. http://www.unicode.org/charts/PDF/U0100.pdf + ISO also documents a 'planck' entity. -=head1 SUBROUTINES +=item (338-376) -=over + Latin extended-A HTML 4.01 entities -=item entity_to_utf8() +=item (402) + Latin extended B HTML 4.01 entities -Convert a numerical entity (that does not exist in our hash) - to its UTF-8 equivalent representation. - This allows us to support, to some extent, any entity for which - dvipdf can find a gylph (given that LaTeX is now UTF-8 clean). +=item (710 & 732) -Parameters: - unicode - The unicode for the character. This is assumed to - be a decimal value -Returns: - The UTF-8 equiavalent of the value. + Spacing modifier letters -=item entity_to_latex() +=item (913-937) - Convert an entity to the corresponding LateX if possible. - If not possible, and the entity is numeric, - the entity is treated like a Unicode character and converted - to UTF-8 which should display as long as dvipdf can find the - appropriate glyph. + Greek uppercase (skipss 930) - The entity is assumed to have already had the - &; or & ; removed +=item (945-982) -Parameters: - entity - Name of entity to convert. -Returns: - One of the following: - - Latex string that produces the entity. - - UTF-8 equivalent of a numeric entity for which we don't have a latex string. - - ' ' for text entities for which there's no latex equivalent. + Greek lowercase +=item (8194-8364) -=item replace_entities() + The general punctuation set - Convert all the entities in a string. 
- We locate all the entities, pass them into entity_to_latex and - and replace occurences in the input string. - The assumption is that there are few entities in any string/document - so this looping is not too bad. The advantage of looping vs. regexping is - that we now can use lookup tables for the translation in entity_to_latex above. +=item (8472-8501) -Parameters: - input - Input string/document -Returns - input with entities replaced by latexable stuff (UTF-8 encodings or - latex control strings to produce the entity. + Letter like symbols -=back +=item (8592-8669) -=head1 TABLES ASCII code page + Arrows and then some (harpoons from Hon Kie). -=cut +=item (8704-8734) -package Apache::entities; -use strict; + Mathematical operators. -package Apache::entities; +=item (8735-9830) -my %entities = ( + The items below require the isoent latex package which I can't find at least for FC5. + Temporarily commented out. -=pod +=back -=over +=cut -=item (7-13) +my %entities = ( # Translation to empty strings: -=cut 7 => "", 9 => "", 10 => "", 13 => "", -=pod - -=item (32-126) - # Translations to simple characters: -=cut - 32 => " ", 33 => "!", 34 => '"', @@ -262,15 +219,7 @@ my %entities = ( 125 => '\}', 126 => '\~', -=pod - -=item (130-140) - - Controls and Latin-1 supplement. Note that some entities that have - visible effect are not printing unicode characters. Specifically - ‚-  - -=cut + # Controls and Latin-1 supplement. 
130 => ',', 131 => '\ensuremath{f}', @@ -284,13 +233,7 @@ my %entities = ( 139 => '\ensuremath{<}', 140 => '{\OE}', -=pod - -=item (145-156) - - There's a gap here in my entity table - -=cut + # There's a gap here in my entity table 145 => '`', 146 => '\'', @@ -305,13 +248,7 @@ my %entities = ( 155 => '\ensuremath{>}', 156 => '\oe ', -=pod - -=item (159-255) - - Another short gap: - -=cut + # Another short gap: 159 => '\"Y', 160 => '~', @@ -502,26 +439,12 @@ my %entities = ( 'yuml' => '\\"{y}', -=pod - -=item (295) - - hbar entity number comes from the unicode charater: - see e.g. http://www.unicode.org/charts/PDF/U0100.pdf - ISO also documents a 'planck' entity. - -=cut + # hbar entity number comes from the unicode character: 295 => '\ensuremath{\hbar}', 'planck' => '\ensuremath{\hbar}', -=pod - -=item (338-376) - - Latin extended-A HTML 4.01 entities: - -=cut + # Latin extended-A HTML 4.01 entities: 338 => '\OE', 'OElig' => '\OE', @@ -534,37 +457,19 @@ my %entities = ( 376 => '\\"{Y}', 'Yuml' => '\\"{Y}', -=pod - -=item (402) - - Latin extended B HTML 4.01 entities - -=cut + # Latin extended B HTML 4.01 entities 402 => '\ensuremath{f}', 'fnof' => '\ensuremath{f}', -=pod - -=item (710 & 732) - - Spacing modifier letters: - -=cut + # Spacing modifier letters: 710 => '\^{}', 'circ' => '\^{}', 732 => '\~{}', 'tilde' => '\~{}', -=pod - -=item (913-929) - - Greek uppercase: - -=cut + # Greek uppercase: 913 => '\ensuremath{\mathrm{A}}', 'Alpha' => '\ensuremath{\mathrm{A}}', @@ -600,16 +505,6 @@ my %entities = ( 'Pi' => '\ensuremath{\Pi}', 929 => '\ensuremath{\mathrm{P}}', 'Rho' => '\ensuremath{\mathrm{P}}', - - -=pod - -=item (931-937) - - Skips 930 - -=cut - 931 => '\ensuremath{\Sigma}', 'Sigma' => '\ensuremath{\Sigma}', 932 => '\ensuremath{\mathrm{T}}', @@ -625,13 +520,7 @@ my %entities = ( 937 => '\ensuremath{\Omega}', 'Omega' => '\ensuremath{\Omega}', -=pod - -=item (945-982) - - Greek lowercase: - -=cut + # Greek lowercase: 945 => '\ensuremath{\alpha}', 'alpha' => 
'\ensuremath{\alpha}', @@ -690,13 +579,7 @@ my %entities = ( 982 => '\ensuremath{\varpi}', 'piv' => '\ensuremath{\varpi}', -=pod - -=item (8194-8364) - - The general punctuation set: - -=cut + # The general punctuation set: 8194, => '\hspace{.5em}', 'enspc' => '\hspace{.5em}', @@ -753,14 +636,7 @@ my %entities = ( 8364 => '\texteuro', 'euro' => '\texteuro', -=pod - -=item (8472-8501) - - Letter like symbols - -=cut - + # Letter like symbols 8472 => '\ensuremath{\wp}', 'weierp' => '\ensuremath{\wp}', @@ -773,22 +649,17 @@ my %entities = ( 8501 => '\ensuremath{\aleph}', 'alefsym'=> '\ensuremath{\aleph}', -=pod - -=item (8592-8669) - - Arrows and then some (harpoons from Hon Kie). - -=cut + # Arrows and then some (harpoons from Hon Kie). - 8592 => '\textleftarrow', - 'larr' => '\textleftarrow', - 8593 => '\textuparrow', - 'uarr' => '\textuparrow', - 8594 => '\textrightarrow', - 'rarr' => '\textrightarrow', - 8595 => '\textdownarrow', - 'darr' => '\textdownarrow', + 8592 => '\ensuremath{\leftarrow}', + 'larr' => '\ensuremath{\leftarrow}', + 8593 => '\ensuremath{\uparrow}', + 'uarr' => '\ensuremath{\uparrow}', + 8594 => '\ensuremath{\rightarrow}', + 'rarr' => '\ensuremath{\rightarrow}', + 'rightarrow' => '\ensuremath{\rightarrow}', + 8595 => '\ensuremath{\downarrow}', + 'darr' => '\ensuremath{\downarrow}', 8596 => '\ensuremath{\leftrightarrow}', 'harr' => '\ensuremath{\leftrightarrow}', 8598 => '\ensuremath{\nwarrow}', @@ -825,14 +696,7 @@ my %entities = ( 8669 => '\ensuremath{\rightsquigarrow}', 'rarrw' => '\ensuremath{\rightsquigarrow}', -=pod - -=item (8704-8734) - - Mathematical operators. - -=cut - + # Mathematical operators. 'forall' => '\ensuremath{\forall}', 8704 => '\ensuremath{\forall}', @@ -880,20 +744,12 @@ my %entities = ( 'infin' => '\ensuremath{\infty}', 8734 => '\ensuremath{\infty}', - -=pod - -=item (8735-9830) - - - The items below require the isoent latex package which I can't find at least for FC5. - Temporarily commented out. 
+ # The items below require the isoent latex package which I can't find at least for FC5. + # Temporarily commented out. 'ang90' => '\ensuremath{\sqangle}', 8735 => '\ensuremath{\sqangle}', -=cut - 'ang' => '\ensuremath{\angle}', 8736 => '\ensuremath{\angle}', 'angmsd' => '\ensuremath{\measuredangle}', @@ -1104,7 +960,9 @@ my %entities = ( =pod -=item * +=head1 UNICODE TABLE + +=over There are some named entities that don't have a good latex equivalent, these are converted to utf-8 via this table @@ -1202,3 +1060,108 @@ sub replace_entities { 1; __END__ + +=pod + +=head1 NAME + +Apache::entities.pm + +=head1 SYNOPSIS + +This file contains a table driven entity-->latex converter. + +This is part of the LearningOnline Network with CAPA project +described at http://www.lon-capa.org. + +=head1 OVERVIEW + + +Assumptions: + The number of entities in a resource is small compared with the + number of possible entities that might be translated. + Therefore the strategy is to match a general entity pattern + &.+; over and over, pull out the match look it up in an entity -> tex hash + and do the replacement. + +In order to simplify the hash, the following reductions are done: + &#d+; have the &# and ; stripped and is converted to an int. + &#.+; have the &#x and ; stripped and is converted to an int as a hex + value. + All others have the & and ; stripped. + + +The hash: Add new conversions here; leave off the leading & and the trailing ; +all numeric entities need only appear as their decimal versions +(e.g. no need for 1234 is sufficient, no need for 0x4d2 as well. + +This entity table is mercilessly cribbed from the HTML pocket reference +table starting at pg 82. In most cases the LaTeX equivalent codes come from +the original massive regular expression replacements originally by +A. 
Sakharuk in lonprintout.pm + +I also want to acknowledge + ISO Character entities and their LaTeX equivalents by + Vidar Bronken Gundersen, and Rune Mathisen + http://www.bitjungle.com/isoent-ref.pdf + + +Note numerical entities are essentially unicode character codes. + + +=head1 SUBROUTINES + +=over + +=item entity_to_utf8() + + +Convert a numerical entity (that does not exist in our hash) + to its UTF-8 equivalent representation. + This allows us to support, to some extent, any entity for which + dvipdf can find a glyph (given that LaTeX is now UTF-8 clean). + +Parameters: + unicode - The unicode for the character. This is assumed to + be a decimal value +Returns: + The UTF-8 equivalent of the value. + +=item entity_to_latex() + + Convert an entity to the corresponding LaTeX if possible. + If not possible, and the entity is numeric, + the entity is treated like a Unicode character and converted + to UTF-8 which should display as long as dvipdf can find the + appropriate glyph. + + The entity is assumed to have already had the + &; or & ; removed + +Parameters: + entity - Name of entity to convert. +Returns: + One of the following: + - Latex string that produces the entity. + - UTF-8 equivalent of a numeric entity for which we don't have a latex string. + - ' ' for text entities for which there's no latex equivalent. + + +=item replace_entities() + + Convert all the entities in a string. + We locate all the entities, pass them into entity_to_latex and + replace occurrences in the input string. + The assumption is that there are few entities in any string/document + so this looping is not too bad. The advantage of looping vs. regexping is + that we now can use lookup tables for the translation in entity_to_latex above. + +Parameters: + input - Input string/document +Returns: + input with entities replaced by latexable stuff (UTF-8 encodings or + latex control strings to produce the entity). + +=back + +=cut