--- loncom/interface/entities.pm 2008/11/17 13:52:39 1.13 +++ loncom/interface/entities.pm 2014/08/25 18:02:48 1.21 @@ -1,7 +1,7 @@ # The LearningOnline Network # entity -> tex. # -# +# $Id: entities.pm,v 1.21 2014/08/25 18:02:48 raeburn Exp $ # # Copyright Michigan State University Board of Trustees # @@ -26,138 +26,99 @@ # # +package Apache::entities; +use strict; -=head1 NAME +=pod -Apache::entities.pm +=head1 TABLES ASCII code page -=head1 SYNOPSIS +=over -This file contains a table driven entity-->latex converter. +=item (7-13) -This is part of the LearningOnline Network with CAPA project -described at http://www.lon-capa.org. + Translation to empty strings -=head1 OVERVIEW +=item (32-126) + Translations to simple characters -Assumptions: - The number of entities in a resource is small compared with the - number of possible entities that might be translated. - Therefore the strategy is to match a general entity pattern - &.+; over and over, pull out the match look it up in an entity -> tex hash - and do the replacement. +=item (130-140) -In order to simplify the hash, the following reductions are done: - &#d+; have the &# and ; stripped and is converted to an int. - &#.+; have the &#x and ; stripped and is converted to an int as a hex - value. - All others have the & and ; stripped. + Controls and Latin-1 supplement. Note that some entities that have + visible effect are not printing unicode characters. Specifically + ‚-  +=item (145-156) -The hash: Add new conversions here; leave off the leading & and the trailing ; -all numeric entities need only appear as their decimal versions -(e.g. no need for 1234 is sufficient, no need for 0x4d2 as well. + There's a gap here in my entity table -This entity table is mercilessly cribbed from the HTML pocket reference -table starting at pg 82. In most cases the LaTeX equivalent codes come from -the original massive regular expression replacements originally by -A. 
Sakharuk in lonprintout.pm +=item (159-255) -I also want to acknowledge - ISO Character entities and their LaTeX equivalents by - Vidar Bronken Gundersen, and Rune Mathisen - http://www.bitjungle.com/isoent-ref.pdf + Another short gap +=item (295) -Note numerical entities are essentially unicode character codes. + hbar entity number comes from the unicode character: + see e.g. http://www.unicode.org/charts/PDF/U0100.pdf + ISO also documents a 'planck' entity. +=item (338-376) -=head1 SUBROUTINES + Latin extended-A HTML 4.01 entities -=item entity_to_utf8() +=item (402) + Latin extended B HTML 4.01 entities -Convert a numerical entity (that does not exist in our hash) - to its UTF-8 equivalent representation. - This allows us to support, to some extent, any entity for which - dvipdf can find a gylph (given that LaTeX is now UTF-8 clean). +=item (710 & 732) -Parameters: - unicode - The unicode for the character. This is assumed to - be a decimal value -Returns: - The UTF-8 equiavalent of the value. + Spacing modifier letters -=item entity_to_latex() +=item (913-937) - Convert an entity to the corresponding LateX if possible. - If not possible, and the entity is numeric, - the entity is treated like a Unicode character and converted - to UTF-8 which should display as long as dvipdf can find the - appropriate glyph. + Greek uppercase (skips 930) - The entity is assumed to have already had the - &; or & ; removed +=item (945-982) -Parameters: - entity - Name of entity to convert. -Returns: - One of the following: - - Latex string that produces the entity. - - UTF-8 equivalent of a numeric entity for which we don't have a latex string. - - ' ' for text entities for which there's no latex equivalent. + Greek lowercase +=item (8194-8364) -=item replace_entities() + The general punctuation set - Convert all the entities in a string. - We locate all the entities, pass them into entity_to_latex and - and replace occurences in the input string. 
- The assumption is that there are few entities in any string/document - so this looping is not too bad. The advantage of looping vs. regexping is - that we now can use lookup tables for the translation in entity_to_latex above. +=item (8472-8501) -Parameters: - input - Input string/document -Returns - input with entities replaced by latexable stuff (UTF-8 encodings or - latex control strings to produce the entity. + Letter like symbols -=head1 TABLES ASCII code page +=item (8592-8669) -=cut + Arrows and then some (harpoons from Hon Kie). +=item (8704-8734) -package Apache::entities; -use strict; + Mathematical operators. -package Apache::entities; +=item (8735-9830) -my %entities = ( + The items below require the isoent latex package which I can't find at least for FC5. + Temporarily commented out. -=pod +=back -=out +=cut -=item (7-13) +my %entities = ( # Translation to empty strings: -=cut 7 => "", 9 => "", 10 => "", 13 => "", -=pod - -=item (32-126) - # Translations to simple characters: -=cut - 32 => " ", 33 => "!", 34 => '"', @@ -258,15 +219,7 @@ my %entities = ( 125 => '\}', 126 => '\~', -=pod - -=item (130-140) - - Controls and Latin-1 supplement. Note that some entities that have - visible effect are not printing unicode characters. Specifically - ‚-  - -=cut + # Controls and Latin-1 supplement. 130 => ',', 131 => '\ensuremath{f}', @@ -280,13 +233,7 @@ my %entities = ( 139 => '\ensuremath{<}', 140 => '{\OE}', -=pod - -=item (145-156) - - There's a gap here in my entity table - -=cut + # There's a gap here in my entity table 145 => '`', 146 => '\'', @@ -301,13 +248,7 @@ my %entities = ( 155 => '\ensuremath{>}', 156 => '\oe ', -=pod - -=item (159-255) - - Another short gap: - -=cut + # Another short gap: 159 => '\"Y', 160 => '~', @@ -498,26 +439,12 @@ my %entities = ( 'yuml' => '\\"{y}', -=pod - -=item (295) - - hbar entity number comes from the unicode charater: - see e.g. http://www.unicode.org/charts/PDF/U0100.pdf - ISO also documents a 'planck' entity. 
- -=cut + # hbar entity number comes from the unicode character: 295 => '\ensuremath{\hbar}', 'planck' => '\ensuremath{\hbar}', -=pod - -=item (338-376) - - Latin extended-A HTML 4.01 entities: - -=cut + # Latin extended-A HTML 4.01 entities: 338 => '\OE', 'OElig' => '\OE', @@ -530,37 +457,19 @@ my %entities = ( 376 => '\\"{Y}', 'Yuml' => '\\"{Y}', -=pod - -=item (402) - - Latin extended B HTML 4.01 entities - -=cut + # Latin extended B HTML 4.01 entities 402 => '\ensuremath{f}', 'fnof' => '\ensuremath{f}', -=pod - -=item (710 & 732) - - Spacing modifier letters: - -=cut + # Spacing modifier letters: 710 => '\^{}', 'circ' => '\^{}', 732 => '\~{}', 'tilde' => '\~{}', -=pod - -=item (913-929) - - Greek uppercase: - -=cut + # Greek uppercase: 913 => '\ensuremath{\mathrm{A}}', 'Alpha' => '\ensuremath{\mathrm{A}}', @@ -596,16 +505,6 @@ my %entities = ( 'Pi' => '\ensuremath{\Pi}', 929 => '\ensuremath{\mathrm{P}}', 'Rho' => '\ensuremath{\mathrm{P}}', - - -=pod - -=item (931-937) - - Skips 930 - -=cut - 931 => '\ensuremath{\Sigma}', 'Sigma' => '\ensuremath{\Sigma}', 932 => '\ensuremath{\mathrm{T}}', @@ -621,13 +520,7 @@ my %entities = ( 937 => '\ensuremath{\Omega}', 'Omega' => '\ensuremath{\Omega}', -=pod - -=item (945-982) - - Greek lowercase: - -=cut + # Greek lowercase: 945 => '\ensuremath{\alpha}', 'alpha' => '\ensuremath{\alpha}', @@ -686,13 +579,7 @@ my %entities = ( 982 => '\ensuremath{\varpi}', 'piv' => '\ensuremath{\varpi}', -=pod - -=item (8194-8364) - - The general punctuation set: - -=cut + # The general punctuation set: 8194, => '\hspace{.5em}', 'enspc' => '\hspace{.5em}', @@ -749,14 +636,7 @@ my %entities = ( 8364 => '\texteuro', 'euro' => '\texteuro', -=pod - -=item (8472-8501) - - Letter like symbols - -=cut - + # Letter like symbols 8472 => '\ensuremath{\wp}', 'weierp' => '\ensuremath{\wp}', @@ -769,22 +649,17 @@ my %entities = ( 8501 => '\ensuremath{\aleph}', 'alefsym'=> '\ensuremath{\aleph}', -=pod - -=item (8592-8669) - - Arrows and then some (harpoons 
from Hon Kie). + # Arrows and then some (harpoons from Hon Kie). -=cut - - 8592 => '\textleftarrow', - 'larr' => '\textleftarrow', - 8593 => '\textuparrow', - 'uarr' => '\textuparrow', - 8594 => '\textrightarrow', - 'rarr' => '\textrightarrow', - 8595 => '\textdownarrow', - 'darr' => '\textdownarrow', + 8592 => '\ensuremath{\leftarrow}', + 'larr' => '\ensuremath{\leftarrow}', + 8593 => '\ensuremath{\uparrow}', + 'uarr' => '\ensuremath{\uparrow}', + 8594 => '\ensuremath{\rightarrow}', + 'rarr' => '\ensuremath{\rightarrow}', + 'rightarrow' => '\ensuremath{\rightarrow}', + 8595 => '\ensuremath{\downarrow}', + 'darr' => '\ensuremath{\downarrow}', 8596 => '\ensuremath{\leftrightarrow}', 'harr' => '\ensuremath{\leftrightarrow}', 8598 => '\ensuremath{\nwarrow}', @@ -821,14 +696,7 @@ my %entities = ( 8669 => '\ensuremath{\rightsquigarrow}', 'rarrw' => '\ensuremath{\rightsquigarrow}', -=pod - -=item (8704-8734) - - Mathematical operators. - -=cut - + # Mathematical operators. 'forall' => '\ensuremath{\forall}', 8704 => '\ensuremath{\forall}', @@ -876,20 +744,12 @@ my %entities = ( 'infin' => '\ensuremath{\infty}', 8734 => '\ensuremath{\infty}', - -=pod - -=item (8735-9830) - - - The items below require the isoent latex package which I can't find at least for FC5. - Temporarily commented out. + # The items below require the isoent latex package which I can't find at least for FC5. + # Temporarily commented out. 
'ang90' => '\ensuremath{\sqangle}', 8735 => '\ensuremath{\sqangle}', -=cut - 'ang' => '\ensuremath{\angle}', 8736 => '\ensuremath{\angle}', 'angmsd' => '\ensuremath{\measuredangle}', @@ -929,7 +789,10 @@ my %entities = ( 'cong' => '\ensuremath{\cong}', 8773 => '\ensuremath{\cong}', 8775 => '\ensuremath{\ncong}', + 8776 => '\ensuremath{\approx}', + 'approx' => '\ensuremath{\approx}', 8778 => '\ensuremath{\approxeq}', + 'approxeq' => '\ensuremath{\approxeq}', 8784 => '\ensuremath{\doteq}', 8785 => '\ensuremath{\doteqdot}', 8786 => '\ensuremath{\fallingdotseq}', @@ -1100,12 +963,16 @@ my %entities = ( =pod -=item * +=head1 UNICODE TABLE + +=over There are some named entities that don't have a good latex equivalent, these are converted to utf-8 via this table of entity name -> unicode number. +=back + =cut my %utf_table = ( @@ -1199,6 +1066,105 @@ __END__ =pod +=head1 NAME + +Apache::entities.pm + +=head1 SYNOPSIS + +This file contains a table driven entity-->latex converter. + +This is part of the LearningOnline Network with CAPA project +described at http://www.lon-capa.org. + +=head1 OVERVIEW + + +Assumptions: + The number of entities in a resource is small compared with the + number of possible entities that might be translated. + Therefore the strategy is to match a general entity pattern + &.+; over and over, pull out the match, look it up in an entity -> tex hash + and do the replacement. + +In order to simplify the hash, the following reductions are done: + &#d+; have the &# and ; stripped and is converted to an int. + &#x.+; have the &#x and ; stripped and is converted to an int as a hex + value. + All others have the & and ; stripped. + + +The hash: Add new conversions here; leave off the leading & and the trailing ; +all numeric entities need only appear as their decimal versions +(e.g. 1234 is sufficient; there is no need for 0x4d2 as well). + +This entity table is mercilessly cribbed from the HTML pocket reference +table starting at pg 82. 
In most cases the LaTeX equivalent codes come from +the original massive regular expression replacements originally by +A. Sakharuk in lonprintout.pm + +I also want to acknowledge + ISO Character entities and their LaTeX equivalents by + Vidar Bronken Gundersen, and Rune Mathisen + http://www.bitjungle.com/isoent-ref.pdf + + +Note numerical entities are essentially unicode character codes. + + +=head1 SUBROUTINES + +=over + +=item entity_to_utf8() + + +Convert a numerical entity (that does not exist in our hash) + to its UTF-8 equivalent representation. + This allows us to support, to some extent, any entity for which + dvipdf can find a glyph (given that LaTeX is now UTF-8 clean). + +Parameters: + unicode - The unicode for the character. This is assumed to + be a decimal value +Returns: + The UTF-8 equivalent of the value. + +=item entity_to_latex() + + Convert an entity to the corresponding LaTeX if possible. + If not possible, and the entity is numeric, + the entity is treated like a Unicode character and converted + to UTF-8 which should display as long as dvipdf can find the + appropriate glyph. + + The entity is assumed to have already had the + &; or & ; removed + +Parameters: + entity - Name of entity to convert. +Returns: + One of the following: + - Latex string that produces the entity. + - UTF-8 equivalent of a numeric entity for which we don't have a latex string. + - ' ' for text entities for which there's no latex equivalent. + + +=item replace_entities() + + Convert all the entities in a string. + We locate all the entities, pass them into entity_to_latex and + replace occurrences in the input string. + The assumption is that there are few entities in any string/document + so this looping is not too bad. The advantage of looping vs. regexping is + that we now can use lookup tables for the translation in entity_to_latex above. 
+ +Parameters: + input - Input string/document +Returns: + input with entities replaced by latexable stuff (UTF-8 encodings or + latex control strings to produce the entity). + =back =cut