Diff for /loncom/interface/entities.pm between versions 1.12.2.1 and 1.13

version 1.12.2.1, 2008/12/11 14:03:14 version 1.13, 2008/11/17 13:52:39
Line 1 Line 1
 # The LearningOnline Network  # The LearningOnline Network
 # entity -> tex.  # entity -> tex.
 #  #
 # $Id$  # 
 #  #
 # Copyright Michigan State University Board of Trustees  # Copyright Michigan State University Board of Trustees
 #  #
Line 25 Line 25
 # http://www.lon-capa.org/  # http://www.lon-capa.org/
 #  #
 #  #
 package Apache::entities;  
 use strict;  
 #  
 #   This file contains a table driven entity-->latex converter.  
 #  
 #  Assumptions:  
 #   The number of entities in a resource is small compared with the  
 #   number of possible entities that might be translated.  
 #   Therefore the strategy is to match a general entity pattern  
 #   &.+; over and over, pull out the match look it up in an entity -> tex hash  
 #   and do the replacement.  
 #  
 #  In order to simplify the hash, the following reductions are done:  
 #   &#d+; have the &# and ; stripped and is converted to an int.  
 #   &#.+; have the &#x and ; stripped and is converted to an int as a hex  
 #                             value.  
 #   All others have the & and ; stripped.  
   
   
 #  The hash:  Add new conversions here; leave off the leading & and the trailing ;  =head1 NAME
 #  all numeric entities need only appear as their decimal versions  
 #  (e.g. no need for 1234 is sufficient, no need for 0x4d2 as well.  Apache::entities.pm
 #  
 #  This entity table is mercilessly cribbed from the  HTML pocket reference  =head1 SYNOPSIS
 #  table starting at pg 82.  In most cases the LaTeX equivalent codes come from  
 #  the original massive regular expression replacements originally by   This file contains a table driven entity-->latex converter.
 #  A. Sakharuk in lonprintout.pm  
 #  This is part of the LearningOnline Network with CAPA project
 #  I also want to acknowledge  described at http://www.lon-capa.org.
 #   ISO Character entities and their LaTeX equivalents by   
 #      Vidar Bronken Gundersen, and Rune Mathisen  =head1 OVERVIEW
 #    http://www.bitjungle.com/isoent-ref.pdf  
 #  
   Assumptions:
    The number of entities in a resource is small compared with the
    number of possible entities that might be translated.
    Therefore the strategy is to match a general entity pattern
    &.+; over and over, pull out the match look it up in an entity -> tex hash
    and do the replacement.
   
   In order to simplify the hash, the following reductions are done:
    &#\d+; entities have the &# and ; stripped and are converted to an int.
    &#x.+; entities have the &#x and ; stripped and are converted to an int as a hex
                              value.
    All others have the & and ; stripped.
   
   
   The hash:  Add new conversions here; leave off the leading & and the trailing ;
   all numeric entities need only appear as their decimal versions
   (e.g. 1234 is sufficient; there is no need for 0x4d2 as well).
   
   This entity table is mercilessly cribbed from the  HTML pocket reference
   table starting at pg 82.  In most cases the LaTeX equivalent codes come from
   the original massive regular expression replacements originally by 
   A. Sakharuk in lonprintout.pm
   
   I also want to acknowledge
    ISO Character entities and their LaTeX equivalents by 
       Vidar Bronken Gundersen, and Rune Mathisen
     http://www.bitjungle.com/isoent-ref.pdf
   
   
   Note numerical entities are essentially unicode character codes.
   
   
   =head1 SUBROUTINES
   
   =item entity_to_utf8()
   
   
   Convert a numerical entity (that does not exist in our hash)
    to its UTF-8 equivalent representation.
    This allows us to support, to some extent, any entity for which
    dvipdf can find a glyph (given that LaTeX is now UTF-8 clean).
   
   Parameters:
     unicode  - The unicode for the character.  This is assumed to
                be a decimal value
   Returns:
     The UTF-8 equivalent of the value.
   
   =item entity_to_latex()
   
    Convert an entity to the corresponding LaTeX if possible.
    If not possible, and the entity is numeric,
    the entity is treated like a Unicode character and converted
    to UTF-8 which should display as long as dvipdf can find the
    appropriate glyph.
   
    The entity is assumed to have already had the 
    &# ;  or & ; removed
   
   Parameters:
     entity    - Name of entity to convert.
   Returns:
    One of the following:
     - Latex string that produces the entity.
     - UTF-8 equivalent of a numeric entity for which we don't have a latex string.
     - ' ' for text entities for which there's no latex equivalent.
   
   
   =item replace_entities()
   
    Convert all the entities in a string.
    We locate all the entities, pass them into entity_to_latex,
    and replace occurrences in the input string.
    The assumption is that there are few entities in any string/document
    so this looping is not too bad.  The advantage of looping vs. regexping is
    that we now can use lookup tables for the translation in entity_to_latex above.
   
   Parameters:
     input   - Input string/document
   Returns:
     input with entities replaced by latexable stuff (UTF-8 encodings or
     latex control strings to produce the entity).
   
   =head1 TABLES ASCII code page
   
   =cut
   
   
   package Apache::entities;
   use strict;
   
 #  Note numerical entities are essentially unicode character codes.  
 #  
 package Apache::entities;  package Apache::entities;
   
 my %entities = (  my %entities = (
   
     #  ---- ASCII code page: ----------------  =pod
   
   =over
   
   =item (7-13)
   
     # Translation to empty strings:      # Translation to empty strings:
   =cut
   
     7        => "",      7        => "",
     9        => "",      9        => "",
     10       => "",      10       => "",
     13       => "",      13       => "",
           
   =pod
   
   =item (32-126)
   
     # Translations to simple characters:      # Translations to simple characters:
   
   =cut
   
     32       => " ",      32       => " ",
     33       => "!",      33       => "!",
     34       => '"',      34       => '"',
Line 176  my %entities = ( Line 258  my %entities = (
     125      => '\}',      125      => '\}',
     126      => '\~',      126      => '\~',
   
     #   Controls and Latin-1 supplement.  Note that some entities that have  =pod
     #   visible effect are not printing unicode characters.  Specifically  
     #   ‚-   =item (130-140)
   
       Controls and Latin-1 supplement.  Note that some entities that have
       visible effect are not printing unicode characters.  Specifically
       ‚- 
   
   =cut
   
     130     => ',',      130     => ',',
     131     => '\ensuremath{f}',      131     => '\ensuremath{f}',
Line 192  my %entities = ( Line 280  my %entities = (
     139     => '\ensuremath{<}',      139     => '\ensuremath{<}',
     140     => '{\OE}',      140     => '{\OE}',
           
     #  There's a gap here in my entity table  =pod
   
   =item (145-156)
   
       There's a gap here in my entity table
   
   =cut
   
     145     => '`',      145     => '`',
     146     => '\'',      146     => '\'',
Line 206  my %entities = ( Line 300  my %entities = (
     154     => '\v{s}',      154     => '\v{s}',
     155     => '\ensuremath{>}',      155     => '\ensuremath{>}',
     156     => '\oe ',      156     => '\oe ',
       
     # Another short gap:  =pod
   
   =item (159-255)
   
        Another short gap:
   
   =cut
   
     159     => '\"Y',      159     => '\"Y',
     160     => '~',      160     => '~',
Line 397  my %entities = ( Line 497  my %entities = (
     255     => '\\"{y}',      255     => '\\"{y}',
     'yuml'  => '\\"{y}',      'yuml'  => '\\"{y}',
   
     # hbar entity number comes from the unicode charater:  
     # see e.g. http://www.unicode.org/charts/PDF/U0100.pdf  =pod
     # ISO also documents a 'planck' entity.  
   =item (295)
   
        hbar entity number comes from the unicode character:
        see e.g. http://www.unicode.org/charts/PDF/U0100.pdf
        ISO also documents a 'planck' entity.
   
   =cut
   
     295     => '\ensuremath{\hbar}',      295     => '\ensuremath{\hbar}',
     'planck' => '\ensuremath{\hbar}',      'planck' => '\ensuremath{\hbar}',
   
     # Latin extended-A HTML 4.01 entities:  =pod
   
   =item (338-376)
   
       Latin extended-A HTML 4.01 entities:
   
   =cut
   
     338      => '\OE',      338      => '\OE',
     'OElig'  => '\OE',      'OElig'  => '\OE',
Line 417  my %entities = ( Line 530  my %entities = (
     376      => '\\"{Y}',      376      => '\\"{Y}',
     'Yuml'   => '\\"{Y}',       'Yuml'   => '\\"{Y}', 
   
   =pod
   
   =item (402)
   
     # Latin extended B HTML 4.01 entities      Latin extended B HTML 4.01 entities
   
   =cut
   
     402      => '\ensuremath{f}',      402      => '\ensuremath{f}',
     'fnof'   => '\ensuremath{f}',      'fnof'   => '\ensuremath{f}',
   
     # Spacing modifier letters:  =pod
   
   =item (710 & 732)
   
       Spacing modifier letters:
   
   =cut
           
     710      => '\^{}',      710      => '\^{}',
     'circ'   => '\^{}',      'circ'   => '\^{}',
     732      => '\~{}',      732      => '\~{}',
     'tilde'  => '\~{}',      'tilde'  => '\~{}',
   
     # Greek uppercase:  =pod
   
   =item (913-929)
   
       Greek uppercase:
   
   =cut
   
     913      => '\ensuremath{\mathrm{A}}',      913      => '\ensuremath{\mathrm{A}}',
     'Alpha'  => '\ensuremath{\mathrm{A}}',      'Alpha'  => '\ensuremath{\mathrm{A}}',
Line 467  my %entities = ( Line 597  my %entities = (
     929      => '\ensuremath{\mathrm{P}}',      929      => '\ensuremath{\mathrm{P}}',
     'Rho'    => '\ensuremath{\mathrm{P}}',      'Rho'    => '\ensuremath{\mathrm{P}}',
         
     # Skips 930  
   =pod
   
   =item (931-937)
   
       Skips 930
   
   =cut
   
     931      => '\ensuremath{\Sigma}',      931      => '\ensuremath{\Sigma}',
     'Sigma'  => '\ensuremath{\Sigma}',      'Sigma'  => '\ensuremath{\Sigma}',
Line 484  my %entities = ( Line 621  my %entities = (
     937      => '\ensuremath{\Omega}',      937      => '\ensuremath{\Omega}',
     'Omega'  => '\ensuremath{\Omega}',      'Omega'  => '\ensuremath{\Omega}',
   
   =pod
   
     # Greek lowercase:  =item (945-982)
   
       Greek lowercase:
   
   =cut
   
     945      => '\ensuremath{\alpha}',      945      => '\ensuremath{\alpha}',
     'alpha'  => '\ensuremath{\alpha}',      'alpha'  => '\ensuremath{\alpha}',
Line 544  my %entities = ( Line 686  my %entities = (
     982      => '\ensuremath{\varpi}',      982      => '\ensuremath{\varpi}',
     'piv'    => '\ensuremath{\varpi}',      'piv'    => '\ensuremath{\varpi}',
   
   =pod
   
   =item (8194-8364)
           
     # The general punctuation set:      The general punctuation set:
   
   =cut
   
     8194,    => '\hspace{.5em}',      8194,    => '\hspace{.5em}',
     'enspc'  => '\hspace{.5em}',      'enspc'  => '\hspace{.5em}',
Line 602  my %entities = ( Line 749  my %entities = (
     8364     => '\texteuro',      8364     => '\texteuro',
     'euro'   => '\texteuro',      'euro'   => '\texteuro',
   
     # Letter like symbols  =pod
   
   =item (8472-8501)
   
       Letter like symbols
   
   =cut
   
           
     8472     => '\ensuremath{\wp}',      8472     => '\ensuremath{\wp}',
Line 616  my %entities = ( Line 769  my %entities = (
     8501     => '\ensuremath{\aleph}',      8501     => '\ensuremath{\aleph}',
     'alefsym'=> '\ensuremath{\aleph}',      'alefsym'=> '\ensuremath{\aleph}',
   
     # Arrows and then some (harpoons from Hon Kie).  =pod
   
   =item (8592-8669)
       
       Arrows and then some (harpoons from Hon Kie).
   
   =cut
   
     8592     => '\textleftarrow',      8592     => '\textleftarrow',
     'larr'   => '\textleftarrow',      'larr'   => '\textleftarrow',
Line 662  my %entities = ( Line 821  my %entities = (
     8669     => '\ensuremath{\rightsquigarrow}',      8669     => '\ensuremath{\rightsquigarrow}',
     'rarrw'  => '\ensuremath{\rightsquigarrow}',      'rarrw'  => '\ensuremath{\rightsquigarrow}',
           
   =pod
   
   =item (8704-8734)
   
     # Mathematical operators.      Mathematical operators.
   
   =cut
   
           
     'forall' => '\ensuremath{\forall}',      'forall' => '\ensuremath{\forall}',
Line 711  my %entities = ( Line 875  my %entities = (
     8733     => '\ensuremath{\propto}',      8733     => '\ensuremath{\propto}',
     'infin'  => '\ensuremath{\infty}',      'infin'  => '\ensuremath{\infty}',
     8734     => '\ensuremath{\infty}',      8734     => '\ensuremath{\infty}',
 #  
 #   The items below require the isoent latex package which I can't find at least for FC5.  
 #   Temporarily commented out.  =pod
 #  
 #    'ang90'  => '\ensuremath{\sqangle}',  =item (8735-9830)
 #    8735     => '\ensuremath{\sqangle}',  
   
       The items below require the isoent latex package which I can't find at least for FC5.
       Temporarily commented out.
       
       'ang90'  => '\ensuremath{\sqangle}',
       8735     => '\ensuremath{\sqangle}',
   
   =cut
   
     'ang'    => '\ensuremath{\angle}',      'ang'    => '\ensuremath{\angle}',
     8736     => '\ensuremath{\angle}',      8736     => '\ensuremath{\angle}',
     'angmsd' => '\ensuremath{\measuredangle}',      'angmsd' => '\ensuremath{\measuredangle}',
Line 925  my %entities = ( Line 1098  my %entities = (
           
 );  );
   
 #  There are some named entities that don't have a good  =pod
 #  latex equivalent, these are converted to utf-8 via this table  
 #  of entity name -> unicode number.  =item *
   
       There are some named entities that don't have a good
       latex equivalent, these are converted to utf-8 via this table
       of entity name -> unicode number.
   
   =cut
   
 my  %utf_table = (  my  %utf_table = (
     'THORN'  => 222,      'THORN'  => 222,
Line 936  my  %utf_table = ( Line 1115  my  %utf_table = (
     'hearts' => 9829      'hearts' => 9829
 );  );
   
 #   
 #  Convert a numerical entity (that does not exist in our hash)  
 #  to its UTF-8 equivalent representation.  
 #  This allows us to support, to some extent, any entity for which  
 #  dvipdf can find a gylph (given that LaTeX is now UTF-8 clean).  
 #  
 # Parameters:  
 #   unicode  - The unicode for the character.  This is assumed to  
 #              be a decimal value  
 # Returns:  
 #   The UTF-8 equiavalent of the value.  
 #  
 sub entity_to_utf8 {  sub entity_to_utf8 {
     my ($unicode) = @_;      my ($unicode) = @_;
     my $result =  pack("U", $unicode);      my $result =  pack("U", $unicode);
Line 955  sub entity_to_utf8 { Line 1122  sub entity_to_utf8 {
 }  }
   
   
 #  
 #  Convert an entity to the corresponding LateX if possible.  
 #  If not possible, and the entity is numeric,  
 #  the entity is treated like a Unicode character and converted  
 #  to UTF-8 which should display as long as dvipdf can find the  
 #  appropriate glyph.  
 #  
 #  The entity is assumed to have already had the   
 #  &# ;  or & ; removed  
 #  
 # Parameters:  
 #   entity    - Name of entity to convert.  
 # Returns:  
 #  One of the following:  
 #   - Latex string that produces the entity.  
 #   - UTF-8 equivalent of a numeric entity for which we don't have a latex string.  
 #   - ' ' for text entities for which there's no latex equivalent.  
 #  
 sub entity_to_latex {  sub entity_to_latex {
     my ($entity) = @_;      my ($entity) = @_;
   
Line 1001  sub entity_to_latex { Line 1151  sub entity_to_latex {
     return " ";      return " ";
 }  }
   
 #  
 #  Convert all the entities in a string.  
 #  We locate all the entities, pass them into entity_to_latex and   
 #  and replace occurences in the input string.  
 #  The assumption is that there are few entities in any string/document  
 #  so this looping is not too bad.  The advantage of looping vs. regexping is  
 #  that we now can use lookup tables for the translation in entity_to_latex above.  
 #  
 # Parameters:  
 #   input   - Input string/document  
 # Returns  
 #   input with entities replaced by latexable stuff (UTF-8 encodings or  
 #   latex control strings to produce the entity.  
 #  
 #  
 sub replace_entities {  sub replace_entities {
     my ($input)  = @_;      my ($input)  = @_;
     my $start;      my $start;
Line 1060  sub replace_entities { Line 1196  sub replace_entities {
 1;   1; 
   
 __END__  __END__
   
   =pod
   
   =back
   
   =cut

Removed from v.1.12.2.1  
changed lines
  Added in v.1.13


FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>