Diff for /loncom/interface/entities.pm between versions 1.9 and 1.22

version 1.9, 2008/04/28 10:33:35 version 1.22, 2014/10/14 18:30:23
Line 1 Line 1
 # The LearningOnline Network  # The LearningOnline Network
 # entity -> tex.  # entity -> tex.
 #  #
 #   # $Id$
 #  #
 # Copyright Michigan State University Board of Trustees  # Copyright Michigan State University Board of Trustees
 #  #
Line 25 Line 25
 # http://www.lon-capa.org/  # http://www.lon-capa.org/
 #  #
 #  #
   
 package Apache::entities;  package Apache::entities;
 use strict;  use strict;
 #  
 #   This file contains a table driven entity-->latex converter.  
 #  
 #  Assumptions:  
 #   The number of entities in a resource is small compared with the  
 #   number of possible entities that might be translated.  
 #   Therefore the strategy is to match a general entity pattern  
 #   &.+; over and over, pull out the match look it up in an entity -> tex hash  
 #   and do the replacement.  
 #  
 #  In order to simplify the hash, the following reductions are done:  
 #   &#d+; have the &# and ; stripped and is converted to an int.  
 #   &#.+; have the &#x and ; stripped and is converted to an int as a hex  
 #                             value.  
 #   All others have the & and ; stripped.  
   
   =pod
   
 #  The hash:  Add new conversions here; leave off the leading & and the trailing ;  =head1 TABLES ASCII code page
 #  all numeric entities need only appear as their decimal versions  
 #  (e.g. no need for 1234 is sufficient, no need for 0x4d2 as well.  
 #  
 #  This entity table is mercilessly cribbed from the  HTML pocket reference  
 #  table starting at pg 82.  In most cases the LaTeX equivalent codes come from  
 #  the original massive regular expression replacements originally by   
 #  A. Sakharuk in lonprintout.pm  
 #  
 #  I also want to acknowledge  
 #   ISO Character entities and their LaTeX equivalents by   
 #      Vidar Bronken Gundersen, and Rune Mathisen  
 #    http://www.bitjungle.com/isoent-ref.pdf  
 #  
   
 #  Note numerical entities are essentially unicode character codes.  =over
 #  
 package Apache::entities;  
   
 my %entities = (  =item (7-13)
   
       Translation to empty strings
   
   =item (32-126)
   
       Translations to simple characters
   
   =item (130-140)
   
       Controls and Latin-1 supplement.  Note that some entities that have
       visible effect are not printing unicode characters.  Specifically
       ‚- 
   
   =item (145-156)
   
       There's a gap here in my entity table
   
   =item (159-255)
   
        Another short gap
   
   =item (295)
   
        hbar entity number comes from the unicode character:
        see e.g. http://www.unicode.org/charts/PDF/U0100.pdf
        ISO also documents a 'planck' entity.
   
   =item (338-376)
   
       Latin extended-A HTML 4.01 entities
   
   =item (402)
   
       Latin extended B HTML 4.01 entities
   
   =item (710 & 732)
   
       Spacing modifier letters
   
   =item (913-937)
   
       Greek uppercase (skips 930)
   
   =item (945-982)
   
       Greek lowercase
   
     #  ---- ASCII code page: ----------------  =item (8194-8364)
   
       The general punctuation set
   
   =item (8462-8501)
   
       Letter like symbols
   
   =item (8592-8669)
   
       Arrows and then some (harpoons from Hon Kie).
   
   =item (8704-8734)
   
       Mathematical operators.
   
   =item (8735-9830)
   
       The items below require the isoent latex package which I can't find at least for FC5.
       Temporarily commented out.
   
   =back
   
   =cut
   
   my %entities = (
   
     # Translation to empty strings:      # Translation to empty strings:
   
Line 176  my %entities = ( Line 219  my %entities = (
     125      => '\}',      125      => '\}',
     126      => '\~',      126      => '\~',
   
     #   Controls and Latin-1 supplement.  Note that some entities that have      # Controls and Latin-1 supplement.
     #   visible effect are not printing unicode characters.  Specifically  
     #   ‚-   
   
     130     => ',',      130     => ',',
     131     => '\ensuremath{f}',      131     => '\ensuremath{f}',
Line 192  my %entities = ( Line 233  my %entities = (
     139     => '\ensuremath{<}',      139     => '\ensuremath{<}',
     140     => '{\OE}',      140     => '{\OE}',
           
     #  There's a gap here in my entity table      # There's a gap here in my entity table
   
     145     => '`',      145     => '`',
     146     => '\'',      146     => '\'',
Line 206  my %entities = ( Line 247  my %entities = (
     154     => '\v{s}',      154     => '\v{s}',
     155     => '\ensuremath{>}',      155     => '\ensuremath{>}',
     156     => '\oe ',      156     => '\oe ',
       
     # Another short gap:       # Another short gap:
   
     159     => '\"Y',      159     => '\"Y',
     160     => '~',      160     => '~',
Line 397  my %entities = ( Line 438  my %entities = (
     255     => '\\"{y}',      255     => '\\"{y}',
     'yuml'  => '\\"{y}',      'yuml'  => '\\"{y}',
   
     # hbar entity number comes from the unicode charater:  
     # see e.g. http://www.unicode.org/charts/PDF/U0100.pdf       # hbar entity number comes from the unicode character:
     # ISO also documents a 'planck' entity.  
   
     295     => '\ensuremath{\hbar}',      295     => '\ensuremath{\hbar}',
     'planck' => '\ensuremath{\hbar}',      'planck' => '\ensuremath{\hbar}',
Line 417  my %entities = ( Line 457  my %entities = (
     376      => '\\"{Y}',      376      => '\\"{Y}',
     'Yuml'   => '\\"{Y}',       'Yuml'   => '\\"{Y}', 
   
   
     # Latin extended B HTML 4.01 entities      # Latin extended B HTML 4.01 entities
   
     402      => '\ensuremath{f}',      402      => '\ensuremath{f}',
Line 466  my %entities = ( Line 505  my %entities = (
     'Pi'     => '\ensuremath{\Pi}',      'Pi'     => '\ensuremath{\Pi}',
     929      => '\ensuremath{\mathrm{P}}',      929      => '\ensuremath{\mathrm{P}}',
     'Rho'    => '\ensuremath{\mathrm{P}}',      'Rho'    => '\ensuremath{\mathrm{P}}',
      
     # Skips 930  
   
     931      => '\ensuremath{\Sigma}',      931      => '\ensuremath{\Sigma}',
     'Sigma'  => '\ensuremath{\Sigma}',      'Sigma'  => '\ensuremath{\Sigma}',
     932      => '\ensuremath{\mathrm{T}}',      932      => '\ensuremath{\mathrm{T}}',
Line 484  my %entities = ( Line 520  my %entities = (
     937      => '\ensuremath{\Omega}',      937      => '\ensuremath{\Omega}',
     'Omega'  => '\ensuremath{\Omega}',      'Omega'  => '\ensuremath{\Omega}',
   
   
     # Greek lowercase:      # Greek lowercase:
   
     945      => '\ensuremath{\alpha}',      945      => '\ensuremath{\alpha}',
Line 544  my %entities = ( Line 579  my %entities = (
     982      => '\ensuremath{\varpi}',      982      => '\ensuremath{\varpi}',
     'piv'    => '\ensuremath{\varpi}',      'piv'    => '\ensuremath{\varpi}',
   
       
     # The general punctuation set:      # The general punctuation set:
   
     8194,    => '\hspace{.5em}',      8194,    => '\hspace{.5em}',
Line 603  my %entities = ( Line 637  my %entities = (
     'euro'   => '\texteuro',      'euro'   => '\texteuro',
   
     # Letter like symbols      # Letter like symbols
    
           8462     => '\ensuremath{h}',
       'planckh' => '\ensuremath{h}',
       8463     => '\ensuremath{\hbar}',
       'hbar'   => '\ensuremath{\hbar}',   
     8472     => '\ensuremath{\wp}',      8472     => '\ensuremath{\wp}',
     'weierp' => '\ensuremath{\wp}',      'weierp' => '\ensuremath{\wp}',
     8465     => '\ensuremath{\Im}',      8465     => '\ensuremath{\Im}',
Line 618  my %entities = ( Line 655  my %entities = (
   
     # Arrows and then some (harpoons from Hon Kie).      # Arrows and then some (harpoons from Hon Kie).
   
     8592     => '\textleftarrow',      8592     => '\ensuremath{\leftarrow}',
     'larr'   => '\textleftarrow',      'larr'   => '\ensuremath{\leftarrow}',
     8593     => '\textuparrow',      8593     => '\ensuremath{\uparrow}',
     'uarr'   => '\textuparrow',      'uarr'   => '\ensuremath{\uparrow}',
     8594     => '\textrightarrow',      8594     => '\ensuremath{\rightarrow}',
     'rarr'   => '\textrightarrow',      'rarr'   => '\ensuremath{\rightarrow}',
     8595     => '\textdownarrow',      'rightarrow' => '\ensuremath{\rightarrow}',
     'darr'   => '\textdownarrow',      8595     => '\ensuremath{\downarrow}',
       'darr'   => '\ensuremath{\downarrow}',
     8596     => '\ensuremath{\leftrightarrow}',      8596     => '\ensuremath{\leftrightarrow}',
     'harr'   => '\ensuremath{\leftrightarrow}',      'harr'   => '\ensuremath{\leftrightarrow}',
     8598     => '\ensuremath{\nwarrow}',      8598     => '\ensuremath{\nwarrow}',
Line 662  my %entities = ( Line 700  my %entities = (
     8669     => '\ensuremath{\rightsquigarrow}',      8669     => '\ensuremath{\rightsquigarrow}',
     'rarrw'  => '\ensuremath{\rightsquigarrow}',      'rarrw'  => '\ensuremath{\rightsquigarrow}',
           
   
     # Mathematical operators.      # Mathematical operators.
   
           
     'forall' => '\ensuremath{\forall}',      'forall' => '\ensuremath{\forall}',
     8704     => '\ensuremath{\forall}',      8704     => '\ensuremath{\forall}',
Line 711  my %entities = ( Line 747  my %entities = (
     8733     => '\ensuremath{\propto}',      8733     => '\ensuremath{\propto}',
     'infin'  => '\ensuremath{\infty}',      'infin'  => '\ensuremath{\infty}',
     8734     => '\ensuremath{\infty}',      8734     => '\ensuremath{\infty}',
 #    'ang90'  => '\ensuremath{\sqangle}',  
 #    8735     => '\ensuremath{\sqangle}',      # The items below require the isoent latex package which I can't find at least for FC5.
       # Temporarily commented out.
       
       'ang90'  => '\ensuremath{\sqangle}',
       8735     => '\ensuremath{\sqangle}',
   
     'ang'    => '\ensuremath{\angle}',      'ang'    => '\ensuremath{\angle}',
     8736     => '\ensuremath{\angle}',      8736     => '\ensuremath{\angle}',
     'angmsd' => '\ensuremath{\measuredangle}',      'angmsd' => '\ensuremath{\measuredangle}',
Line 752  my %entities = ( Line 793  my %entities = (
     'cong'   => '\ensuremath{\cong}',      'cong'   => '\ensuremath{\cong}',
     8773     => '\ensuremath{\cong}',      8773     => '\ensuremath{\cong}',
     8775     => '\ensuremath{\ncong}',      8775     => '\ensuremath{\ncong}',
       8776     => '\ensuremath{\approx}',
       'approx' => '\ensuremath{\approx}', 
     8778     => '\ensuremath{\approxeq}',      8778     => '\ensuremath{\approxeq}',
       'approxeq' => '\ensuremath{\approxeq}',
     8784     => '\ensuremath{\doteq}',      8784     => '\ensuremath{\doteq}',
     8785     => '\ensuremath{\doteqdot}',      8785     => '\ensuremath{\doteqdot}',
     8786     => '\ensuremath{\fallingdotseq}',      8786     => '\ensuremath{\fallingdotseq}',
Line 921  my %entities = ( Line 965  my %entities = (
           
 );  );
   
 #   =pod
 #  Convert a numerical entity (that does not exist in our hash)  
 #  to its UTF-8 equivalent representation.  =head1 UNICODE TABLE
 #  This allows us to support, to some extent, any entity for which  
 #  dvipdf can find a gylph (given that LaTeX is now UTF-8 clean).  =over
 #  
 # Parameters:      There are some named entities that don't have a good
 #   unicode  - The unicode for the character.  This is assumed to      latex equivalent, these are converted to utf-8 via this table
 #              be a decimal value      of entity name -> unicode number.
 # Returns:  
 #   The UTF-8 equiavalent of the value.  =back
 #  
   =cut
   
   my  %utf_table = (
       'THORN'  => 222,
       'thorn'  => 254,
       'eth'    => 240,
       'hearts' => 9829
   );
   
 sub entity_to_utf8 {  sub entity_to_utf8 {
     my ($unicode) = @_;      my ($unicode) = @_;
       my $result =  pack("U", $unicode);
     return pack("U", $unicode);      return $result;
 }  }
   
   
 #  
 #  Convert an entity to the corresponding LateX if possible.  
 #  If not possible, and the entity is numeric,  
 #  the entity is treated like a Unicode character and converted  
 #  to UTF-8 which should display as long as dvipdf can find the  
 #  appropriate glyph.  
 #  
 #  The entity is assumed to have already had the   
 #  &# ;  or & ; removed  
 #  
 # Parameters:  
 #   entity    - Name of entity to convert.  
 # Returns:  
 #  One of the following:  
 #   - Latex string that produces the entity.  
 #   - UTF-8 equivalent of a numeric entity for which we don't have a latex string.  
 #   - ' ' for text entities for which there's no latex equivalent.  
 #  
 sub entity_to_latex {  sub entity_to_latex {
     my ($entity) = @_;      my ($entity) = @_;
   
     # Try to look up the entity (text or numeric) in the hash:      # Try to look up the entity (text or numeric) in the hash:
   
   
   
     my $latex = $entities{"$entity"};      my $latex = $entities{"$entity"};
     if (defined $latex) {      if (defined $latex) {
  return $latex;   return $latex;
     }      }
     # If the text is purely numeric we can do the UTF-8 conversion:      # If the text is purely numeric we can do the UTF-8 conversion:
       # Otherwise there are a few textual entities that don't have good latex
     if ($entity =~ /^\d$/) {      # which can be converted to unicode:
       #
       if ($entity =~ /^\d+$/) {
  return &entity_to_utf8($entity);   return &entity_to_utf8($entity);
       } else {
    my $result = $utf_table{"$entity"};
    if (defined $result) {
       return &entity_to_utf8($result);
    }
     }      }
     #  Can't do the conversion`< ...      #  Can't do the conversion`< ...
   
     return " ";      return " ";
 }  }
   
 #  
 #  Convert all the entities in a string.  
 #  We locate all the entities, pass them into entity_to_latex and   
 #  and replace occurences in the input string.  
 #  The assumption is that there are few entities in any string/document  
 #  so this looping is not too bad.  The advantage of looping vs. regexping is  
 #  that we now can use lookup tables for the translation in entity_to_latex above.  
 #  
 # Parameters:  
 #   input   - Input string/document  
 # Returns  
 #   input with entities replaced by latexable stuff (UTF-8 encodings or  
 #   latex control strings to produce the entity.  
 #  
 #  
 sub replace_entities {  sub replace_entities {
     my ($input)  = @_;      my ($input)  = @_;
     my $start;      my $start;
Line 1009  sub replace_entities { Line 1039  sub replace_entities {
  $latex = &entity_to_latex($entity);   $latex = &entity_to_latex($entity);
  substr($input, $start, $end-$start) = $latex;   substr($input, $start, $end-$start) = $latex;
     }      }
   
       # Hexadecimal entities:
   
       while ($input =~ /&\#x(\d|[a-f,A-f])+;/) {
    ($start) = @-;
    ($end)   = @+;
    $entity  = "0" . substr($input, $start+2, $end-$start-3); # 0xhexnumber
    $latex = &entity_to_latex(hex($entity));
    substr($input, $start, $end-$start) = $latex;
       }
   
   
     # Now the &text; entites;      # Now the &text; entites;
           
     while ($input =~/(&\w+;)/) {      while ($input =~/(&\w+;)/) {
Line 1025  sub replace_entities { Line 1067  sub replace_entities {
 1;   1; 
   
 __END__  __END__
   
   =pod
   
   =head1 NAME
   
   Apache::entities - table driven entity-->latex converter
   
   =head1 SYNOPSIS
   
   This file contains a table driven entity-->latex converter.
   
   This is part of the LearningOnline Network with CAPA project
   described at http://www.lon-capa.org.
   
   =head1 OVERVIEW
   
   
   Assumptions:
    The number of entities in a resource is small compared with the
    number of possible entities that might be translated.
    Therefore the strategy is to match a general entity pattern
    &.+; over and over, pull out the match, look it up in an entity -> tex hash,
    and do the replacement.
   
   In order to simplify the hash, the following reductions are done:
    &#\d+; has the &# and ; stripped and is converted to an int.
    &#x.+; has the &#x and ; stripped and is converted to an int as a hex
                              value.
    All others have the & and ; stripped.
   
   
   The hash:  Add new conversions here; leave off the leading '&' and the trailing ';'.
   All numeric entities need only appear as their decimal versions
   (e.g. 1234 is sufficient; there is no need for 0x4d2 as well).
   
   This entity table is mercilessly cribbed from the HTML pocket reference
   table starting at pg 82.  In most cases the LaTeX equivalent codes come from
   the massive regular expression replacements originally written by
   A. Sakharuk in lonprintout.pm.
   
   I also want to acknowledge
    ISO Character entities and their LaTeX equivalents by 
       Vidar Bronken Gundersen, and Rune Mathisen
     http://www.bitjungle.com/isoent-ref.pdf
   
   
   Note numerical entities are essentially unicode character codes.
   
   
   =head1 SUBROUTINES
   
   =over
   
   =item entity_to_utf8()
   
   
   Convert a numerical entity (that does not exist in our hash)
    to its UTF-8 equivalent representation.
    This allows us to support, to some extent, any entity for which
    dvipdf can find a glyph (given that LaTeX is now UTF-8 clean).
   
   Parameters:
     unicode  - The unicode for the character.  This is assumed to
                be a decimal value
   Returns:
     The UTF-8 equivalent of the value.
   
   =item entity_to_latex()
   
    Convert an entity to the corresponding LaTeX if possible.
    If not possible, and the entity is numeric,
    the entity is treated like a Unicode character and converted
    to UTF-8 which should display as long as dvipdf can find the
    appropriate glyph.
   
    The entity is assumed to have already had the
    &# ;  or & ; removed.
   
   Parameters:
     entity    - Name of entity to convert.
   Returns:
    One of the following:
     - Latex string that produces the entity.
     - UTF-8 equivalent of a numeric entity for which we don't have a latex string.
     - ' ' for text entities for which there's no latex equivalent.
   
   
   =item replace_entities()
   
    Convert all the entities in a string.
    We locate all the entities, pass them into entity_to_latex
    and replace occurrences in the input string.
    The assumption is that there are few entities in any string/document
    so this looping is not too bad.  The advantage of looping vs. regexping is
    that we now can use lookup tables for the translation in entity_to_latex above.
   
   Parameters:
     input   - Input string/document
   Returns:
     input with entities replaced by latexable stuff (UTF-8 encodings or
     latex control strings to produce the entity).
   
   =back
   
   =cut

Removed from v.1.9  
changed lines
  Added in v.1.22


FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>